zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx10_2niintrin.h (110711B) - Raw


      1 /*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
     11 #endif
     12 
     13 #ifdef __SSE2__
     14 
     15 #ifndef __AVX10_2NIINTRIN_H
     16 #define __AVX10_2NIINTRIN_H
     17 
     /* Default attribute sets for the inline functions defined in this header.
      * Both request always-inline, no-debug functions built for the
      * "avx10.2-256" target; they differ only in the __min_vector_width__ value
      * (128 versus 256). */
     #define __DEFAULT_FN_ATTRS128 \
       __attribute__((__always_inline__, __nodebug__, \
                      __target__("avx10.2-256"), __min_vector_width__(128)))

     #define __DEFAULT_FN_ATTRS256 \
       __attribute__((__always_inline__, __nodebug__, \
                      __target__("avx10.2-256"), __min_vector_width__(256)))
     24 
     25 /* VNNI FP16 */
     26 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
     27                                                            __m128h __A,
     28                                                            __m128h __B) {
     29   return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
     30                                            (__v8hf)__B);
     31 }
     32 
     33 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
     34                                                                 __mmask8 __U,
     35                                                                 __m128h __A,
     36                                                                 __m128h __B) {
     37   return (__m128)__builtin_ia32_selectps_128(
     38       (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
     39 }
     40 
     41 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
     42                                                                  __m128 __W,
     43                                                                  __m128h __A,
     44                                                                  __m128h __B) {
     45   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
     46                                              (__v4sf)_mm_dpph_ps(__W, __A, __B),
     47                                              (__v4sf)_mm_setzero_ps());
     48 }
     49 
     50 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
     51                                                               __m256h __A,
     52                                                               __m256h __B) {
     53   return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
     54                                            (__v16hf)__B);
     55 }
     56 
     57 static __inline__ __m256 __DEFAULT_FN_ATTRS256
     58 _mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
     59   return (__m256)__builtin_ia32_selectps_256(
     60       (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
     61 }
     62 
     63 static __inline__ __m256 __DEFAULT_FN_ATTRS256
     64 _mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
     65   return (__m256)__builtin_ia32_selectps_256(
     66       (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
     67       (__v8sf)_mm256_setzero_ps());
     68 }
     69 
     70 /* VMPSADBW */
     /* Merge-masking form of _mm_mpsadbw_epu8: __builtin_ia32_selectw_128
      * chooses, under mask U, between _mm_mpsadbw_epu8(A, B, imm) (viewed as
      * __v8hi) and W. */
     #define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
       ((__m128i)__builtin_ia32_selectw_128( \
           (__mmask8)(U), \
           (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
           (__v8hi)(__m128i)(W)))

     /* Zero-masking form: same selection, with _mm_setzero_si128() in place of
      * W. */
     #define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
       ((__m128i)__builtin_ia32_selectw_128( \
           (__mmask8)(U), \
           (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
           (__v8hi)_mm_setzero_si128()))
     80 
     /* Merge-masking form of _mm256_mpsadbw_epu8: __builtin_ia32_selectw_256
      * chooses, under the 16-bit mask U, between _mm256_mpsadbw_epu8(A, B, imm)
      * (viewed as __v16hi) and W. */
     #define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
       ((__m256i)__builtin_ia32_selectw_256( \
           (__mmask16)(U), \
           (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
           (__v16hi)(__m256i)(W)))

     /* Zero-masking form: same selection, with _mm256_setzero_si256() in place
      * of W. */
     #define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
       ((__m256i)__builtin_ia32_selectw_256( \
           (__mmask16)(U), \
           (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
           (__v16hi)_mm256_setzero_si256()))
     90 
     91 /* VNNI INT8 */
     92 static __inline__ __m128i __DEFAULT_FN_ATTRS128
     93 _mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
     94   return (__m128i)__builtin_ia32_selectd_128(
     95       __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
     96 }
     97 
     98 static __inline__ __m128i __DEFAULT_FN_ATTRS128
     99 _mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
    100   return (__m128i)__builtin_ia32_selectd_128(
    101       __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
    102       (__v4si)_mm_setzero_si128());
    103 }
    104 
    105 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    106 _mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
    107   return (__m256i)__builtin_ia32_selectd_256(
    108       __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
    109 }
    110 
    111 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    112 _mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
    113   return (__m256i)__builtin_ia32_selectd_256(
    114       __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
    115       (__v8si)_mm256_setzero_si256());
    116 }
    117 
    118 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    119 _mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
    120   return (__m128i)__builtin_ia32_selectd_128(
    121       __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
    122 }
    123 
    124 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    125 _mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
    126   return (__m128i)__builtin_ia32_selectd_128(
    127       __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
    128       (__v4si)_mm_setzero_si128());
    129 }
    130 
    131 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    132 _mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
    133   return (__m256i)__builtin_ia32_selectd_256(
    134       __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
    135 }
    136 
    137 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
    138     __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
    139   return (__m256i)__builtin_ia32_selectd_256(
    140       __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
    141       (__v8si)_mm256_setzero_si256());
    142 }
    143 
    144 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    145 _mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
    146   return (__m128i)__builtin_ia32_selectd_128(
    147       __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
    148 }
    149 
    150 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    151 _mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
    152   return (__m128i)__builtin_ia32_selectd_128(
    153       __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
    154       (__v4si)_mm_setzero_si128());
    155 }
    156 
    157 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    158 _mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
    159   return (__m256i)__builtin_ia32_selectd_256(
    160       __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
    161 }
    162 
    163 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    164 _mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
    165   return (__m256i)__builtin_ia32_selectd_256(
    166       __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
    167       (__v8si)_mm256_setzero_si256());
    168 }
    169 
    170 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    171 _mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
    172   return (__m128i)__builtin_ia32_selectd_128(
    173       __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
    174 }
    175 
    176 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    177 _mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
    178   return (__m128i)__builtin_ia32_selectd_128(
    179       __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
    180       (__v4si)_mm_setzero_si128());
    181 }
    182 
    183 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    184 _mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
    185   return (__m256i)__builtin_ia32_selectd_256(
    186       __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
    187 }
    188 
    189 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
    190     __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
    191   return (__m256i)__builtin_ia32_selectd_256(
    192       __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
    193       (__v8si)_mm256_setzero_si256());
    194 }
    195 
    196 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    197 _mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
    198   return (__m128i)__builtin_ia32_selectd_128(
    199       __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
    200 }
    201 
    202 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    203 _mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
    204   return (__m128i)__builtin_ia32_selectd_128(
    205       __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
    206       (__v4si)_mm_setzero_si128());
    207 }
    208 
    209 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    210 _mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
    211   return (__m256i)__builtin_ia32_selectd_256(
    212       __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
    213 }
    214 
    215 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    216 _mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
    217   return (__m256i)__builtin_ia32_selectd_256(
    218       __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
    219       (__v8si)_mm256_setzero_si256());
    220 }
    221 
    222 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    223 _mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
    224   return (__m128i)__builtin_ia32_selectd_128(
    225       __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
    226 }
    227 
    228 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    229 _mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
    230   return (__m128i)__builtin_ia32_selectd_128(
    231       __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
    232       (__v4si)_mm_setzero_si128());
    233 }
    234 
    235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    236 _mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
    237   return (__m256i)__builtin_ia32_selectd_256(
    238       __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
    239 }
    240 
    241 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
    242     __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
    243   return (__m256i)__builtin_ia32_selectd_256(
    244       __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
    245       (__v8si)_mm256_setzero_si256());
    246 }
    247 
    248 /* VNNI INT16 */
    249 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    250 _mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    251   return (__m128i)__builtin_ia32_selectd_128(
    252       (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
    253 }
    254 
    255 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    256 _mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    257   return (__m128i)__builtin_ia32_selectd_128(
    258       (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
    259       (__v4si)_mm_setzero_si128());
    260 }
    261 
    262 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    263 _mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    264   return (__m256i)__builtin_ia32_selectd_256(
    265       (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
    266 }
    267 
    268 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    269 _mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    270   return (__m256i)__builtin_ia32_selectd_256(
    271       (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
    272       (__v8si)_mm256_setzero_si256());
    273 }
    274 
    275 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    276 _mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    277   return (__m128i)__builtin_ia32_selectd_128(
    278       (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
    279 }
    280 
    281 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    282 _mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    283   return (__m128i)__builtin_ia32_selectd_128(
    284       (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
    285       (__v4si)_mm_setzero_si128());
    286 }
    287 
    288 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    289 _mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    290   return (__m256i)__builtin_ia32_selectd_256(
    291       (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
    292 }
    293 
    294 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
    295     __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    296   return (__m256i)__builtin_ia32_selectd_256(
    297       (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
    298       (__v8si)_mm256_setzero_si256());
    299 }
    300 
    301 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    302 _mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    303   return (__m128i)__builtin_ia32_selectd_128(
    304       (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
    305 }
    306 
    307 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    308 _mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    309   return (__m128i)__builtin_ia32_selectd_128(
    310       (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
    311       (__v4si)_mm_setzero_si128());
    312 }
    313 
    314 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    315 _mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    316   return (__m256i)__builtin_ia32_selectd_256(
    317       (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
    318 }
    319 
    320 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    321 _mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    322   return (__m256i)__builtin_ia32_selectd_256(
    323       (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
    324       (__v8si)_mm256_setzero_si256());
    325 }
    326 
    327 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    328 _mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    329   return (__m128i)__builtin_ia32_selectd_128(
    330       (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
    331 }
    332 
    333 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    334 _mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    335   return (__m128i)__builtin_ia32_selectd_128(
    336       (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
    337       (__v4si)_mm_setzero_si128());
    338 }
    339 
    340 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    341 _mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    342   return (__m256i)__builtin_ia32_selectd_256(
    343       (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
    344 }
    345 
    346 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
    347     __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    348   return (__m256i)__builtin_ia32_selectd_256(
    349       (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
    350       (__v8si)_mm256_setzero_si256());
    351 }
    352 
    353 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    354 _mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    355   return (__m128i)__builtin_ia32_selectd_128(
    356       (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
    357 }
    358 
    359 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    360 _mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    361   return (__m128i)__builtin_ia32_selectd_128(
    362       (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
    363       (__v4si)_mm_setzero_si128());
    364 }
    365 
    366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    367 _mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    368   return (__m256i)__builtin_ia32_selectd_256(
    369       (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
    370 }
    371 
    372 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    373 _mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    374   return (__m256i)__builtin_ia32_selectd_256(
    375       (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
    376       (__v8si)_mm256_setzero_si256());
    377 }
    378 
    379 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    380 _mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    381   return (__m128i)__builtin_ia32_selectd_128(
    382       (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
    383 }
    384 
    385 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    386 _mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
    387   return (__m128i)__builtin_ia32_selectd_128(
    388       (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
    389       (__v4si)_mm_setzero_si128());
    390 }
    391 
    392 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    393 _mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    394   return (__m256i)__builtin_ia32_selectd_256(
    395       (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
    396 }
    397 
    398 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
    399     __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
    400   return (__m256i)__builtin_ia32_selectd_256(
    401       (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
    402       (__v8si)_mm256_setzero_si256());
    403 }
    404 
    405 /* YMM Rounding */
    /* _mm256_add_round_pd(A, B, R): passes A and B (as __v4df) plus (int)(R) to
     * __builtin_ia32_vaddpd256_round and yields the result as __m256d. R is
     * presumably the rounding-control operand - confirm against the builtin. */
    #define _mm256_add_round_pd(A, B, R) \
      ((__m256d)__builtin_ia32_vaddpd256_round( \
          (__v4df)(__m256d)(A), \
          (__v4df)(__m256d)(B), \
          (int)(R)))

    /* Merge-masking form: __builtin_ia32_selectpd_256 chooses, under mask U,
     * between _mm256_add_round_pd(A, B, R) and W. */
    #define _mm256_mask_add_round_pd(W, U, A, B, R) \
      ((__m256d)__builtin_ia32_selectpd_256( \
          (__mmask8)(U), \
          (__v4df)_mm256_add_round_pd((A), (B), (R)), \
          (__v4df)(__m256d)(W)))

    /* Zero-masking form: same selection, with _mm256_setzero_pd() in place of
     * W. */
    #define _mm256_maskz_add_round_pd(U, A, B, R) \
      ((__m256d)__builtin_ia32_selectpd_256( \
          (__mmask8)(U), \
          (__v4df)_mm256_add_round_pd((A), (B), (R)), \
          (__v4df)_mm256_setzero_pd()))
    419 
    /* _mm256_add_round_ph(A, B, R): passes A and B (as __v16hf) plus (int)(R) to
     * __builtin_ia32_vaddph256_round and yields the result as __m256h. */
    #define _mm256_add_round_ph(A, B, R) \
      ((__m256h)__builtin_ia32_vaddph256_round( \
          (__v16hf)(__m256h)(A), \
          (__v16hf)(__m256h)(B), \
          (int)(R)))

    /* Merge-masking form: __builtin_ia32_selectph_256 chooses, under the 16-bit
     * mask U, between _mm256_add_round_ph(A, B, R) and W. */
    #define _mm256_mask_add_round_ph(W, U, A, B, R) \
      ((__m256h)__builtin_ia32_selectph_256( \
          (__mmask16)(U), \
          (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
          (__v16hf)(__m256h)(W)))

    /* Zero-masking form: same selection, with _mm256_setzero_ph() in place of
     * W. */
    #define _mm256_maskz_add_round_ph(U, A, B, R) \
      ((__m256h)__builtin_ia32_selectph_256( \
          (__mmask16)(U), \
          (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
          (__v16hf)_mm256_setzero_ph()))
    433 
    /* _mm256_add_round_ps(A, B, R): passes A and B (as __v8sf) plus (int)(R) to
     * __builtin_ia32_vaddps256_round and yields the result as __m256. */
    #define _mm256_add_round_ps(A, B, R) \
      ((__m256)__builtin_ia32_vaddps256_round( \
          (__v8sf)(__m256)(A), \
          (__v8sf)(__m256)(B), \
          (int)(R)))

    /* Merge-masking form: __builtin_ia32_selectps_256 chooses, under mask U,
     * between _mm256_add_round_ps(A, B, R) and W. */
    #define _mm256_mask_add_round_ps(W, U, A, B, R) \
      ((__m256)__builtin_ia32_selectps_256( \
          (__mmask8)(U), \
          (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
          (__v8sf)(__m256)(W)))

    /* Zero-masking form: same selection, with _mm256_setzero_ps() in place of
     * W. */
    #define _mm256_maskz_add_round_ps(U, A, B, R) \
      ((__m256)__builtin_ia32_selectps_256( \
          (__mmask8)(U), \
          (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
          (__v8sf)_mm256_setzero_ps()))
    447 
    /* _mm256_cmp_round_pd_mask(A, B, P, R): calls
     * __builtin_ia32_vcmppd256_round_mask with A and B as __v4df, (int)(P), the
     * mask operand (__mmask8)-1 and (int)(R); the result is cast to __mmask8.
     * P is presumably the comparison predicate - confirm against the builtin. */
    #define _mm256_cmp_round_pd_mask(A, B, P, R) \
      ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4df)(__m256d)(B), \
          (int)(P), \
          (__mmask8)-1, \
          (int)(R)))

    /* Same call, with the caller-supplied U as the mask operand. */
    #define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \
      ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4df)(__m256d)(B), \
          (int)(P), \
          (__mmask8)(U), \
          (int)(R)))
    457 
    /* _mm256_cmp_round_ph_mask(A, B, P, R): calls
     * __builtin_ia32_vcmpph256_round_mask with A and B as __v16hf, (int)(P), the
     * mask operand (__mmask16)-1 and (int)(R); the result is cast to
     * __mmask16. */
    #define _mm256_cmp_round_ph_mask(A, B, P, R) \
      ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
          (__v16hf)(__m256h)(A), \
          (__v16hf)(__m256h)(B), \
          (int)(P), \
          (__mmask16)-1, \
          (int)(R)))

    /* Same call, with the caller-supplied U as the mask operand. */
    #define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \
      ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
          (__v16hf)(__m256h)(A), \
          (__v16hf)(__m256h)(B), \
          (int)(P), \
          (__mmask16)(U), \
          (int)(R)))
    467 
    /* _mm256_cmp_round_ps_mask(A, B, P, R): calls
     * __builtin_ia32_vcmpps256_round_mask with A and B as __v8sf, (int)(P), the
     * mask operand (__mmask8)-1 and (int)(R); the result is cast to __mmask8. */
    #define _mm256_cmp_round_ps_mask(A, B, P, R) \
      ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
          (__v8sf)(__m256)(A), \
          (__v8sf)(__m256)(B), \
          (int)(P), \
          (__mmask8)-1, \
          (int)(R)))

    /* Same call, with the caller-supplied U as the mask operand. */
    #define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \
      ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
          (__v8sf)(__m256)(A), \
          (__v8sf)(__m256)(B), \
          (int)(P), \
          (__mmask8)(U), \
          (int)(R)))
    477 
    /* _mm256_cvt_roundepi32_ph(A, R): calls
     * __builtin_ia32_vcvtdq2ph256_round_mask with A (as __v8si),
     * _mm_undefined_ph() as the second operand, the mask operand (__mmask8)(-1)
     * and (int)(R); the result is cast to __m128h.
     * Vector arguments are first cast to their public type (__m256i / __m128h),
     * matching what _mm256_cvt_roundepi32_ps and the other conversion macros in
     * this file do for their operands. */
    #define _mm256_cvt_roundepi32_ph(A, R) \
      ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
          (__v8si)(__m256i)(A), \
          (__v8hf)_mm_undefined_ph(), \
          (__mmask8)(-1), \
          (int)(R)))

    /* Masked form: W (as __v8hf) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundepi32_ph(W, U, A, R) \
      ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
          (__v8si)(__m256i)(A), \
          (__v8hf)(__m128h)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm_setzero_ph() is the second operand, U the mask. */
    #define _mm256_maskz_cvt_roundepi32_ph(U, A, R) \
      ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
          (__v8si)(__m256i)(A), \
          (__v8hf)_mm_setzero_ph(), \
          (__mmask8)(U), \
          (int)(R)))
    489 
    /* _mm256_cvt_roundepi32_ps(A, R): calls
     * __builtin_ia32_vcvtdq2ps256_round_mask with A (as __v8si),
     * _mm256_setzero_ps() as the second operand, the mask operand (__mmask8)-1
     * and (int)(R); the result is cast to __m256. */
    #define _mm256_cvt_roundepi32_ps(A, R) \
      ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
          (__v8si)(__m256i)(A), \
          (__v8sf)_mm256_setzero_ps(), \
          (__mmask8)-1, \
          (int)(R)))

    /* Masked form: W (as __v8sf) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \
      ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
          (__v8si)(__m256i)(A), \
          (__v8sf)(__m256)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm256_setzero_ps() is the second operand, U the
     * mask. */
    #define _mm256_maskz_cvt_roundepi32_ps(U, A, R) \
      ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
          (__v8si)(__m256i)(A), \
          (__v8sf)_mm256_setzero_ps(), \
          (__mmask8)(U), \
          (int)(R)))
    503 
    /* _mm256_cvt_roundpd_epi32(A, R): calls
     * __builtin_ia32_vcvtpd2dq256_round_mask with A (as __v4df),
     * _mm_setzero_si128() as the second operand, the mask operand (__mmask8)-1
     * and (int)(R); the result is cast to __m128i. */
    #define _mm256_cvt_roundpd_epi32(A, R) \
      ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4si)_mm_setzero_si128(), \
          (__mmask8)-1, \
          (int)(R)))

    /* Masked form: W (as __v4si) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \
      ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4si)(__m128i)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm_setzero_si128() is the second operand, U the
     * mask. */
    #define _mm256_maskz_cvt_roundpd_epi32(U, A, R) \
      ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4si)_mm_setzero_si128(), \
          (__mmask8)(U), \
          (int)(R)))
    517 
    /* _mm256_cvt_roundpd_ph(A, R): calls
     * __builtin_ia32_vcvtpd2ph256_round_mask with A (as __v4df),
     * _mm_undefined_ph() as the second operand, the mask operand (__mmask8)(-1)
     * and (int)(R); the result is cast to __m128h.
     * Vector arguments are first cast to their public type (__m256d / __m128h),
     * matching what _mm256_cvt_roundpd_ps and the other conversion macros in
     * this file do for their operands. */
    #define _mm256_cvt_roundpd_ph(A, R) \
      ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v8hf)_mm_undefined_ph(), \
          (__mmask8)(-1), \
          (int)(R)))

    /* Masked form: W (as __v8hf) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundpd_ph(W, U, A, R) \
      ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v8hf)(__m128h)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm_setzero_ph() is the second operand, U the mask. */
    #define _mm256_maskz_cvt_roundpd_ph(U, A, R) \
      ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v8hf)_mm_setzero_ph(), \
          (__mmask8)(U), \
          (int)(R)))
    529 
    /* _mm256_cvt_roundpd_ps(A, R): calls
     * __builtin_ia32_vcvtpd2ps256_round_mask with A (as __v4df),
     * _mm_setzero_ps() as the second operand, the mask operand (__mmask8)-1 and
     * (int)(R); the result is cast to __m128. */
    #define _mm256_cvt_roundpd_ps(A, R) \
      ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4sf)_mm_setzero_ps(), \
          (__mmask8)-1, \
          (int)(R)))

    /* Masked form: W (as __v4sf) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \
      ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4sf)(__m128)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm_setzero_ps() is the second operand, U the mask. */
    #define _mm256_maskz_cvt_roundpd_ps(U, A, R) \
      ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4sf)_mm_setzero_ps(), \
          (__mmask8)(U), \
          (int)(R)))
    542 
    /* _mm256_cvt_roundpd_epi64(A, R): calls
     * __builtin_ia32_vcvtpd2qq256_round_mask with A (as __v4df),
     * _mm256_setzero_si256() as the second operand, the mask operand
     * (__mmask8)-1 and (int)(R); the result is cast to __m256i. */
    #define _mm256_cvt_roundpd_epi64(A, R) \
      ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4di)_mm256_setzero_si256(), \
          (__mmask8)-1, \
          (int)(R)))

    /* Masked form: W (as __v4di) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \
      ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4di)(__m256i)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm256_setzero_si256() is the second operand, U the
     * mask. */
    #define _mm256_maskz_cvt_roundpd_epi64(U, A, R) \
      ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4di)_mm256_setzero_si256(), \
          (__mmask8)(U), \
          (int)(R)))
    556 
    /* _mm256_cvt_roundpd_epu32(A, R): calls
     * __builtin_ia32_vcvtpd2udq256_round_mask with A (as __v4df),
     * _mm_setzero_si128() (as __v4su) as the second operand, the mask operand
     * (__mmask8)-1 and (int)(R); the result is cast to __m128i. */
    #define _mm256_cvt_roundpd_epu32(A, R) \
      ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4su)_mm_setzero_si128(), \
          (__mmask8)-1, \
          (int)(R)))

    /* Masked form: W (as __v4su) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \
      ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4su)(__m128i)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm_setzero_si128() is the second operand, U the
     * mask. */
    #define _mm256_maskz_cvt_roundpd_epu32(U, A, R) \
      ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4su)_mm_setzero_si128(), \
          (__mmask8)(U), \
          (int)(R)))
    570 
    /* _mm256_cvt_roundpd_epu64(A, R): calls
     * __builtin_ia32_vcvtpd2uqq256_round_mask with A (as __v4df),
     * _mm256_setzero_si256() (as __v4du) as the second operand, the mask
     * operand (__mmask8)-1 and (int)(R); the result is cast to __m256i. */
    #define _mm256_cvt_roundpd_epu64(A, R) \
      ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4du)_mm256_setzero_si256(), \
          (__mmask8)-1, \
          (int)(R)))

    /* Masked form: W (as __v4du) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \
      ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4du)(__m256i)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm256_setzero_si256() is the second operand, U the
     * mask. */
    #define _mm256_maskz_cvt_roundpd_epu64(U, A, R) \
      ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
          (__v4df)(__m256d)(A), \
          (__v4du)_mm256_setzero_si256(), \
          (__mmask8)(U), \
          (int)(R)))
    584 
    /* _mm256_cvt_roundph_epi32(A, R): calls
     * __builtin_ia32_vcvtph2dq256_round_mask with A (as __v8hf),
     * _mm256_undefined_si256() as the second operand, the mask operand
     * (__mmask8)(-1) and (int)(R); the result is cast to __m256i.
     * Vector arguments are first cast to their public type (__m128h / __m256i),
     * matching what _mm256_cvt_roundpd_epi32 and the other conversion macros in
     * this file do for their operands. */
    #define _mm256_cvt_roundph_epi32(A, R) \
      ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
          (__v8hf)(__m128h)(A), \
          (__v8si)_mm256_undefined_si256(), \
          (__mmask8)(-1), \
          (int)(R)))

    /* Masked form: W (as __v8si) becomes the second operand and U the mask. */
    #define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \
      ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
          (__v8hf)(__m128h)(A), \
          (__v8si)(__m256i)(W), \
          (__mmask8)(U), \
          (int)(R)))

    /* Zero-masking form: _mm256_setzero_si256() is the second operand, U the
     * mask. */
    #define _mm256_maskz_cvt_roundph_epi32(U, A, R) \
      ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
          (__v8hf)(__m128h)(A), \
          (__v8si)_mm256_setzero_si256(), \
          (__mmask8)(U), \
          (int)(R)))
    597 
    /* _mm256_cvt_roundph_pd family: every form expands to one call of
     * __builtin_ia32_vcvtph2pd256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256d; A is cast directly to __v8hf.
     * (src, mask) is (_mm256_undefined_pd(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_pd(), U) for _maskz_. */
    598 #define _mm256_cvt_roundph_pd(A, R)                                            \
    599   ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask(                            \
    600       (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R)))
    601 
    602 #define _mm256_mask_cvt_roundph_pd(W, U, A, R)                                 \
    603   ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W),   \
    604                                                    (__mmask8)(U), (int)(R)))
    605 
    606 #define _mm256_maskz_cvt_roundph_pd(U, A, R)                                   \
    607   ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask(                            \
    608       (__v8hf)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
    609 
    /* _mm256_cvtx_roundph_ps family: every form expands to one call of
     * __builtin_ia32_vcvtph2psx256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256; A is cast directly to __v8hf.
     * (src, mask) is (_mm256_undefined_ps(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_ps(), U) for _maskz_. */
    610 #define _mm256_cvtx_roundph_ps(A, R)                                           \
    611   ((__m256)__builtin_ia32_vcvtph2psx256_round_mask(                            \
    612       (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R)))
    613 
    614 #define _mm256_mask_cvtx_roundph_ps(W, U, A, R)                                \
    615   ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W),   \
    616                                                    (__mmask8)(U), (int)(R)))
    617 
    618 #define _mm256_maskz_cvtx_roundph_ps(U, A, R)                                  \
    619   ((__m256)__builtin_ia32_vcvtph2psx256_round_mask(                            \
    620       (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
    621 
    /* _mm256_cvt_roundph_epi64 family: every form expands to one call of
     * __builtin_ia32_vcvtph2qq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf and src to
     * __v4di.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    622 #define _mm256_cvt_roundph_epi64(A, R)                                         \
    623   ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask(                            \
    624       (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1),           \
    625       (int)(R)))
    626 
    627 #define _mm256_mask_cvt_roundph_epi64(W, U, A, R)                              \
    628   ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W),   \
    629                                                    (__mmask8)(U), (int)(R)))
    630 
    631 #define _mm256_maskz_cvt_roundph_epi64(U, A, R)                                \
    632   ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask(                            \
    633       (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    634 
    /* _mm256_cvt_roundph_epu32 family: every form expands to one call of
     * __builtin_ia32_vcvtph2udq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf and src to
     * __v8su.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    635 #define _mm256_cvt_roundph_epu32(A, R)                                         \
    636   ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask(                           \
    637       (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1),           \
    638       (int)(R)))
    639 
    640 #define _mm256_mask_cvt_roundph_epu32(W, U, A, R)                              \
    641   ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W),  \
    642                                                     (__mmask8)(U), (int)(R)))
    643 
    644 #define _mm256_maskz_cvt_roundph_epu32(U, A, R)                                \
    645   ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask(                           \
    646       (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    647 
    /* _mm256_cvt_roundph_epu64 family: every form expands to one call of
     * __builtin_ia32_vcvtph2uqq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf and src to
     * __v4du.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    648 #define _mm256_cvt_roundph_epu64(A, R)                                         \
    649   ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask(                           \
    650       (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1),           \
    651       (int)(R)))
    652 
    653 #define _mm256_mask_cvt_roundph_epu64(W, U, A, R)                              \
    654   ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W),  \
    655                                                     (__mmask8)(U), (int)(R)))
    656 
    657 #define _mm256_maskz_cvt_roundph_epu64(U, A, R)                                \
    658   ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask(                           \
    659       (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    660 
    /* _mm256_cvt_roundph_epu16 family: every form expands to one call of
     * __builtin_ia32_vcvtph2uw256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v16hf, src to
     * __v16hu, and the mask type here is __mmask16.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask16)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    661 #define _mm256_cvt_roundph_epu16(A, R)                                         \
    662   ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask(                            \
    663       (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1),        \
    664       (int)(R)))
    665 
    666 #define _mm256_mask_cvt_roundph_epu16(W, U, A, R)                              \
    667   ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \
    668                                                    (__mmask16)(U), (int)(R)))
    669 
    670 #define _mm256_maskz_cvt_roundph_epu16(U, A, R)                                \
    671   ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask(                            \
    672       (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U),           \
    673       (int)(R)))
    674 
    /* _mm256_cvt_roundph_epi16 family: every form expands to one call of
     * __builtin_ia32_vcvtph2w256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v16hf, src to
     * __v16hi, and the mask type here is __mmask16.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask16)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    675 #define _mm256_cvt_roundph_epi16(A, R)                                         \
    676   ((__m256i)__builtin_ia32_vcvtph2w256_round_mask(                             \
    677       (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1),        \
    678       (int)(R)))
    679 
    680 #define _mm256_mask_cvt_roundph_epi16(W, U, A, R)                              \
    681   ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W),  \
    682                                                   (__mmask16)(U), (int)(R)))
    683 
    684 #define _mm256_maskz_cvt_roundph_epi16(U, A, R)                                \
    685   ((__m256i)__builtin_ia32_vcvtph2w256_round_mask(                             \
    686       (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U),           \
    687       (int)(R)))
    688 
    /* _mm256_cvt_roundps_epi32 family: every form expands to one call of
     * __builtin_ia32_vcvtps2dq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m256 to __v8sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    689 #define _mm256_cvt_roundps_epi32(A, R)                                         \
    690   ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
    691       (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1,       \
    692       (int)(R)))
    693 
    694 #define _mm256_mask_cvt_roundps_epi32(W, U, A, R)                              \
    695   ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
    696       (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))
    697 
    698 #define _mm256_maskz_cvt_roundps_epi32(U, A, R)                                \
    699   ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
    700       (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U),      \
    701       (int)(R)))
    702 
    /* _mm256_cvt_roundps_pd family: every form expands to one call of
     * __builtin_ia32_vcvtps2pd256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256d; A is cast through __m128 to __v4sf (a 128-bit
     * input type, unlike the 256-bit result).
     * (src, mask) is (_mm256_undefined_pd(), (__mmask8)-1) for the plain form,
     * (W, U) for _mask_, and (_mm256_setzero_pd(), U) for _maskz_. */
    703 #define _mm256_cvt_roundps_pd(A, R)                                            \
    704   ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
    705       (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1,        \
    706       (int)(R)))
    707 
    708 #define _mm256_mask_cvt_roundps_pd(W, U, A, R)                                 \
    709   ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
    710       (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
    711 
    712 #define _mm256_maskz_cvt_roundps_pd(U, A, R)                                   \
    713   ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
    714       (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),         \
    715       (int)(R)))
    716 
    /* _mm256_cvt_roundps_ph: expands to __builtin_ia32_vcvtps2ph256_mask -- a
     * builtin without "_round_" in its name, unlike the neighbouring macros.
     * Operand order differs too: A (cast through __m256 to __v8sf) is followed
     * by the immediate (int)I, then an undefined __v8hi vector and
     * (__mmask8)-1. The result is cast to __m128i. Only this plain form is
     * defined in the active code. */
    717 #define _mm256_cvt_roundps_ph(A, I)                                            \
    718   ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I),    \
    719                                              (__v8hi)_mm_undefined_si128(),    \
    720                                              (__mmask8)-1))
    721 
    722 /* FIXME: We may define these macros this way in the future.
    723 #define _mm256_cvt_roundps_ph(A, I)                                            \
    724   ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
    725       (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(),            \
    726       (__mmask8)-1))
    727 #define _mm256_mask_cvt_roundps_ph(U, W, A, I)                                 \
    728   ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
    729       (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W)))
    730 #define _mm256_maskz_cvt_roundps_ph(W, A, I)                                   \
    731   ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
    732       (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(),              \
    733       (__mmask8)(W))) */
    734 
    /* _mm256_cvtx_roundps_ph family: every form expands to one call of
     * __builtin_ia32_vcvtps2phx256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m128h; A is cast directly to __v8sf.
     * (src, mask) is (_mm_undefined_ph(), (__mmask8)(-1)) for the plain form,
     * (W, U) for _mask_, and (_mm_setzero_ph(), U) for _maskz_. */
    735 #define _mm256_cvtx_roundps_ph(A, R)                                           \
    736   ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask(                           \
    737       (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
    738 
    739 #define _mm256_mask_cvtx_roundps_ph(W, U, A, R)                                \
    740   ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W),  \
    741                                                     (__mmask8)(U), (int)(R)))
    742 
    743 #define _mm256_maskz_cvtx_roundps_ph(U, A, R)                                  \
    744   ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask(                           \
    745       (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
    746 
    /* _mm256_cvt_roundps_epi64 family: every form expands to one call of
     * __builtin_ia32_vcvtps2qq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m128 to __v4sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    747 #define _mm256_cvt_roundps_epi64(A, R)                                         \
    748   ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
    749       (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,       \
    750       (int)(R)))
    751 
    752 #define _mm256_mask_cvt_roundps_epi64(W, U, A, R)                              \
    753   ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
    754       (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
    755 
    756 #define _mm256_maskz_cvt_roundps_epi64(U, A, R)                                \
    757   ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
    758       (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),      \
    759       (int)(R)))
    760 
    /* _mm256_cvt_roundps_epu32 family: every form expands to one call of
     * __builtin_ia32_vcvtps2udq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m256 to __v8sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    761 #define _mm256_cvt_roundps_epu32(A, R)                                         \
    762   ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
    763       (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1,       \
    764       (int)(R)))
    765 
    766 #define _mm256_mask_cvt_roundps_epu32(W, U, A, R)                              \
    767   ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
    768       (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))
    769 
    770 #define _mm256_maskz_cvt_roundps_epu32(U, A, R)                                \
    771   ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
    772       (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U),      \
    773       (int)(R)))
    774 
    /* _mm256_cvt_roundps_epu64 family: every form expands to one call of
     * __builtin_ia32_vcvtps2uqq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m128 to __v4sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    775 #define _mm256_cvt_roundps_epu64(A, R)                                         \
    776   ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask(                           \
    777       (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,       \
    778       (int)(R)))
    779 
    780 #define _mm256_mask_cvt_roundps_epu64(W, U, A, R)                              \
    781   ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask(                           \
    782       (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
    783 
    784 #define _mm256_maskz_cvt_roundps_epu64(U, A, R)                                \
    785   ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask(                           \
    786       (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),      \
    787       (int)(R)))
    788 
    /* _mm256_cvt_roundepi64_pd family: every form expands to one call of
     * __builtin_ia32_vcvtqq2pd256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256d; A is cast through __m256i to __v4di.
     * (src, mask) is (_mm256_setzero_pd(), (__mmask8)-1) for the plain form,
     * (W, U) for _mask_, and (_mm256_setzero_pd(), U) for _maskz_. */
    789 #define _mm256_cvt_roundepi64_pd(A, R)                                         \
    790   ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask(                            \
    791       (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1,         \
    792       (int)(R)))
    793 
    794 #define _mm256_mask_cvt_roundepi64_pd(W, U, A, R)                              \
    795   ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask(                            \
    796       (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
    797 
    798 #define _mm256_maskz_cvt_roundepi64_pd(U, A, R)                                \
    799   ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask(                            \
    800       (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),        \
    801       (int)(R)))
    802 
    /* _mm256_cvt_roundepi64_ph family: every form expands to one call of
     * __builtin_ia32_vcvtqq2ph256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m128h; A is cast directly to __v4di.
     * (src, mask) is (_mm_undefined_ph(), (__mmask8)(-1)) for the plain form,
     * (W, U) for _mask_, and (_mm_setzero_ph(), U) for _maskz_. */
    803 #define _mm256_cvt_roundepi64_ph(A, R)                                         \
    804   ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask(                            \
    805       (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
    806 
    807 #define _mm256_mask_cvt_roundepi64_ph(W, U, A, R)                              \
    808   ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), (__v8hf)(W),   \
    809                                                    (__mmask8)(U), (int)(R)))
    810 
    811 #define _mm256_maskz_cvt_roundepi64_ph(U, A, R)                                \
    812   ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask(                            \
    813       (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
    814 
    /* _mm256_cvt_roundepi64_ps family: every form expands to one call of
     * __builtin_ia32_vcvtqq2ps256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m128; A is cast through __m256i to __v4di.
     * (src, mask) is (_mm_setzero_ps(), (__mmask8)-1) for the plain form,
     * (W, U) for _mask_, and (_mm_setzero_ps(), U) for _maskz_. */
    815 #define _mm256_cvt_roundepi64_ps(A, R)                                         \
    816   ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask(                             \
    817       (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
    818 
    819 #define _mm256_mask_cvt_roundepi64_ps(W, U, A, R)                              \
    820   ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask(                             \
    821       (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
    822 
    823 #define _mm256_maskz_cvt_roundepi64_ps(U, A, R)                                \
    824   ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A),        \
    825                                                   (__v4sf)_mm_setzero_ps(),    \
    826                                                   (__mmask8)(U), (int)(R)))
    827 
    /* _mm256_cvtt_roundpd_epi32 family: every form expands to one call of
     * __builtin_ia32_vcvttpd2dq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m128i; A is cast through __m256d to __v4df.
     * (src, mask) is (_mm_setzero_si128(), (__mmask8)-1) for the plain form,
     * (W, U) for _mask_, and (_mm_setzero_si128(), U) for _maskz_. */
    828 #define _mm256_cvtt_roundpd_epi32(A, R)                                        \
    829   ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
    830       (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1,         \
    831       (int)(R)))
    832 
    833 #define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R)                             \
    834   ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
    835       (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))
    836 
    837 #define _mm256_maskz_cvtt_roundpd_epi32(U, A, R)                               \
    838   ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
    839       (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U),        \
    840       (int)(R)))
    841 
    /* _mm256_cvtt_roundpd_epi64 family: every form expands to one call of
     * __builtin_ia32_vcvttpd2qq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m256d to __v4df.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    842 #define _mm256_cvtt_roundpd_epi64(A, R)                                        \
    843   ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask(                           \
    844       (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,      \
    845       (int)(R)))
    846 
    847 #define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R)                             \
    848   ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask(                           \
    849       (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
    850 
    851 #define _mm256_maskz_cvtt_roundpd_epi64(U, A, R)                               \
    852   ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask(                           \
    853       (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),     \
    854       (int)(R)))
    855 
    /* _mm256_cvtt_roundpd_epu32 family: every form expands to one call of
     * __builtin_ia32_vcvttpd2udq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m128i; A is cast through __m256d to __v4df.
     * (src, mask) is (_mm_setzero_si128(), (__mmask8)-1) for the plain form,
     * (W, U) for _mask_, and (_mm_setzero_si128(), U) for _maskz_. */
    856 #define _mm256_cvtt_roundpd_epu32(A, R)                                        \
    857   ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask(                          \
    858       (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1,         \
    859       (int)(R)))
    860 
    861 #define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R)                             \
    862   ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask(                          \
    863       (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))
    864 
    865 #define _mm256_maskz_cvtt_roundpd_epu32(U, A, R)                               \
    866   ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask(                          \
    867       (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U),        \
    868       (int)(R)))
    869 
    /* _mm256_cvtt_roundpd_epu64 family: every form expands to one call of
     * __builtin_ia32_vcvttpd2uqq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m256d to __v4df.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    870 #define _mm256_cvtt_roundpd_epu64(A, R)                                        \
    871   ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask(                          \
    872       (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,      \
    873       (int)(R)))
    874 
    875 #define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R)                             \
    876   ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask(                          \
    877       (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
    878 
    879 #define _mm256_maskz_cvtt_roundpd_epu64(U, A, R)                               \
    880   ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask(                          \
    881       (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),     \
    882       (int)(R)))
    883 
    /* _mm256_cvtt_roundph_epi32 family: every form expands to one call of
     * __builtin_ia32_vcvttph2dq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    884 #define _mm256_cvtt_roundph_epi32(A, R)                                        \
    885   ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask(                           \
    886       (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1),           \
    887       (int)(R)))
    888 
    889 #define _mm256_mask_cvtt_roundph_epi32(W, U, A, R)                             \
    890   ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W),  \
    891                                                     (__mmask8)(U), (int)(R)))
    892 
    893 #define _mm256_maskz_cvtt_roundph_epi32(U, A, R)                               \
    894   ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask(                           \
    895       (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    896 
    /* _mm256_cvtt_roundph_epi64 family: every form expands to one call of
     * __builtin_ia32_vcvttph2qq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf and src to
     * __v4di.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    897 #define _mm256_cvtt_roundph_epi64(A, R)                                        \
    898   ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask(                           \
    899       (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1),           \
    900       (int)(R)))
    901 
    902 #define _mm256_mask_cvtt_roundph_epi64(W, U, A, R)                             \
    903   ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W),  \
    904                                                     (__mmask8)(U), (int)(R)))
    905 
    906 #define _mm256_maskz_cvtt_roundph_epi64(U, A, R)                               \
    907   ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask(                           \
    908       (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    909 
    /* _mm256_cvtt_roundph_epu32 family: every form expands to one call of
     * __builtin_ia32_vcvttph2udq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf and src to
     * __v8su.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    910 #define _mm256_cvtt_roundph_epu32(A, R)                                        \
    911   ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask(                          \
    912       (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1),           \
    913       (int)(R)))
    914 
    915 #define _mm256_mask_cvtt_roundph_epu32(W, U, A, R)                             \
    916   ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
    917                                                      (__mmask8)(U), (int)(R)))
    918 
    919 #define _mm256_maskz_cvtt_roundph_epu32(U, A, R)                               \
    920   ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask(                          \
    921       (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    922 
    /* _mm256_cvtt_roundph_epu64 family: every form expands to one call of
     * __builtin_ia32_vcvttph2uqq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v8hf and src to
     * __v4du.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask8)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    923 #define _mm256_cvtt_roundph_epu64(A, R)                                        \
    924   ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask(                          \
    925       (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1),           \
    926       (int)(R)))
    927 
    928 #define _mm256_mask_cvtt_roundph_epu64(W, U, A, R)                             \
    929   ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
    930                                                      (__mmask8)(U), (int)(R)))
    931 
    932 #define _mm256_maskz_cvtt_roundph_epu64(U, A, R)                               \
    933   ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask(                          \
    934       (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
    935 
    /* _mm256_cvtt_roundph_epu16 family: every form expands to one call of
     * __builtin_ia32_vcvttph2uw256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v16hf, src to
     * __v16hu, and the mask type here is __mmask16.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask16)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    936 #define _mm256_cvtt_roundph_epu16(A, R)                                        \
    937   ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask(                           \
    938       (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1),        \
    939       (int)(R)))
    940 
    941 #define _mm256_mask_cvtt_roundph_epu16(W, U, A, R)                             \
    942   ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask(                           \
    943       (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R)))
    944 
    945 #define _mm256_maskz_cvtt_roundph_epu16(U, A, R)                               \
    946   ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask(                           \
    947       (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U),           \
    948       (int)(R)))
    949 
    /* _mm256_cvtt_roundph_epi16 family: every form expands to one call of
     * __builtin_ia32_vcvttph2w256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast directly to __v16hf, src to
     * __v16hi, and the mask type here is __mmask16.
     * (src, mask) is (_mm256_undefined_si256(), (__mmask16)(-1)) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    950 #define _mm256_cvtt_roundph_epi16(A, R)                                        \
    951   ((__m256i)__builtin_ia32_vcvttph2w256_round_mask(                            \
    952       (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1),        \
    953       (int)(R)))
    954 
    955 #define _mm256_mask_cvtt_roundph_epi16(W, U, A, R)                             \
    956   ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
    957                                                    (__mmask16)(U), (int)(R)))
    958 
    959 #define _mm256_maskz_cvtt_roundph_epi16(U, A, R)                               \
    960   ((__m256i)__builtin_ia32_vcvttph2w256_round_mask(                            \
    961       (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U),           \
    962       (int)(R)))
    963 
    /* _mm256_cvtt_roundps_epi32 family: every form expands to one call of
     * __builtin_ia32_vcvttps2dq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m256 to __v8sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    964 #define _mm256_cvtt_roundps_epi32(A, R)                                        \
    965   ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask(                           \
    966       (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1,       \
    967       (int)(R)))
    968 
    969 #define _mm256_mask_cvtt_roundps_epi32(W, U, A, R)                             \
    970   ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask(                           \
    971       (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))
    972 
    973 #define _mm256_maskz_cvtt_roundps_epi32(U, A, R)                               \
    974   ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask(                           \
    975       (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U),      \
    976       (int)(R)))
    977 
    /* _mm256_cvtt_roundps_epi64 family: every form expands to one call of
     * __builtin_ia32_vcvttps2qq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m128 to __v4sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    978 #define _mm256_cvtt_roundps_epi64(A, R)                                        \
    979   ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask(                           \
    980       (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,       \
    981       (int)(R)))
    982 
    983 #define _mm256_mask_cvtt_roundps_epi64(W, U, A, R)                             \
    984   ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask(                           \
    985       (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
    986 
    987 #define _mm256_maskz_cvtt_roundps_epi64(U, A, R)                               \
    988   ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask(                           \
    989       (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),      \
    990       (int)(R)))
    991 
    /* _mm256_cvtt_roundps_epu32 family: every form expands to one call of
     * __builtin_ia32_vcvttps2udq256_round_mask(A, src, mask, (int)R) whose
     * result is cast to __m256i; A is cast through __m256 to __v8sf.
     * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
     * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
    992 #define _mm256_cvtt_roundps_epu32(A, R)                                        \
    993   ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask(                          \
    994       (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1,       \
    995       (int)(R)))
    996 
    997 #define _mm256_mask_cvtt_roundps_epu32(W, U, A, R)                             \
    998   ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask(                          \
    999       (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))
   1000 
   1001 #define _mm256_maskz_cvtt_roundps_epu32(U, A, R)                               \
   1002   ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask(                          \
   1003       (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U),      \
   1004       (int)(R)))
   1005 
   /* _mm256_cvtt_roundps_epu64 family: every form expands to one call of
    * __builtin_ia32_vcvttps2uqq256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m256i; A is cast through __m128 to __v4sf.
    * (src, mask) is (_mm256_setzero_si256(), (__mmask8)-1) for the plain
    * form, (W, U) for _mask_, and (_mm256_setzero_si256(), U) for _maskz_. */
   1006 #define _mm256_cvtt_roundps_epu64(A, R)                                        \
   1007   ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask(                          \
   1008       (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,       \
   1009       (int)(R)))
   1010 
   1011 #define _mm256_mask_cvtt_roundps_epu64(W, U, A, R)                             \
   1012   ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask(                          \
   1013       (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
   1014 
   1015 #define _mm256_maskz_cvtt_roundps_epu64(U, A, R)                               \
   1016   ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask(                          \
   1017       (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),      \
   1018       (int)(R)))
   1019 
   /* _mm256_cvt_roundepu32_ph family: every form expands to one call of
    * __builtin_ia32_vcvtudq2ph256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m128h; A is cast directly to __v8su.
    * (src, mask) is (_mm_undefined_ph(), (__mmask8)(-1)) for the plain form,
    * (W, U) for _mask_, and (_mm_setzero_ph(), U) for _maskz_. */
   1020 #define _mm256_cvt_roundepu32_ph(A, R)                                         \
   1021   ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask(                           \
   1022       (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
   1023 
   1024 #define _mm256_mask_cvt_roundepu32_ph(W, U, A, R)                              \
   1025   ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W),  \
   1026                                                     (__mmask8)(U), (int)(R)))
   1027 
   1028 #define _mm256_maskz_cvt_roundepu32_ph(U, A, R)                                \
   1029   ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask(                           \
   1030       (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   1031 
   /* _mm256_cvt_roundepu32_ps family: every form expands to one call of
    * __builtin_ia32_vcvtudq2ps256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m256; A is cast through __m256i to __v8su.
    * (src, mask) is (_mm256_setzero_ps(), (__mmask8)-1) for the plain form,
    * (W, U) for _mask_, and (_mm256_setzero_ps(), U) for _maskz_. */
   1032 #define _mm256_cvt_roundepu32_ps(A, R)                                         \
   1033   ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask(                            \
   1034       (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1,         \
   1035       (int)(R)))
   1036 
   1037 #define _mm256_mask_cvt_roundepu32_ps(W, U, A, R)                              \
   1038   ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask(                            \
   1039       (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
   1040 
   1041 #define _mm256_maskz_cvt_roundepu32_ps(U, A, R)                                \
   1042   ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask(                            \
   1043       (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U),        \
   1044       (int)(R)))
   1045 
   /* _mm256_cvt_roundepu64_pd family: every form expands to one call of
    * __builtin_ia32_vcvtuqq2pd256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m256d; A is cast through __m256i to __v4du.
    * (src, mask) is (_mm256_setzero_pd(), (__mmask8)-1) for the plain form,
    * (W, U) for _mask_, and (_mm256_setzero_pd(), U) for _maskz_. */
   1046 #define _mm256_cvt_roundepu64_pd(A, R)                                         \
   1047   ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask(                           \
   1048       (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1,         \
   1049       (int)(R)))
   1050 
   1051 #define _mm256_mask_cvt_roundepu64_pd(W, U, A, R)                              \
   1052   ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask(                           \
   1053       (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
   1054 
   1055 #define _mm256_maskz_cvt_roundepu64_pd(U, A, R)                                \
   1056   ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask(                           \
   1057       (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),        \
   1058       (int)(R)))
   1059 
   /* _mm256_cvt_roundepu64_ph family: every form expands to one call of
    * __builtin_ia32_vcvtuqq2ph256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m128h; A is cast directly to __v4du.
    * (src, mask) is (_mm_undefined_ph(), (__mmask8)(-1)) for the plain form,
    * (W, U) for _mask_, and (_mm_setzero_ph(), U) for _maskz_. */
   1060 #define _mm256_cvt_roundepu64_ph(A, R)                                         \
   1061   ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask(                           \
   1062       (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
   1063 
   1064 #define _mm256_mask_cvt_roundepu64_ph(W, U, A, R)                              \
   1065   ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W),  \
   1066                                                     (__mmask8)(U), (int)(R)))
   1067 
   1068 #define _mm256_maskz_cvt_roundepu64_ph(U, A, R)                                \
   1069   ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask(                           \
   1070       (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   1071 
   /* _mm256_cvt_roundepu64_ps family: every form expands to one call of
    * __builtin_ia32_vcvtuqq2ps256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m128; A is cast through __m256i to __v4du.
    * (src, mask) is (_mm_setzero_ps(), (__mmask8)-1) for the plain form,
    * (W, U) for _mask_, and (_mm_setzero_ps(), U) for _maskz_. */
   1072 #define _mm256_cvt_roundepu64_ps(A, R)                                         \
   1073   ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask(                            \
   1074       (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
   1075 
   1076 #define _mm256_mask_cvt_roundepu64_ps(W, U, A, R)                              \
   1077   ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask(                            \
   1078       (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
   1079 
   1080 #define _mm256_maskz_cvt_roundepu64_ps(U, A, R)                                \
   1081   ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A),       \
   1082                                                    (__v4sf)_mm_setzero_ps(),   \
   1083                                                    (__mmask8)(U), (int)(R)))
   1084 
   /* _mm256_cvt_roundepu16_ph family: every form expands to one call of
    * __builtin_ia32_vcvtuw2ph256_round_mask(A, src, mask, (int)R) whose
    * result is cast to __m256h; A is cast directly to __v16hu and the mask
    * type here is __mmask16.
    * (src, mask) is (_mm256_undefined_ph(), (__mmask16)(-1)) for the plain
    * form, (W, U) for _mask_, and (_mm256_setzero_ph(), U) for _maskz_. */
   1085 #define _mm256_cvt_roundepu16_ph(A, R)                                         \
   1086   ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask(                            \
   1087       (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1),           \
   1088       (int)(R)))
   1089 
   1090 #define _mm256_mask_cvt_roundepu16_ph(W, U, A, R)                              \
   1091   ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \
   1092                                                    (__mmask16)(U), (int)(R)))
   1093 
   1094 #define _mm256_maskz_cvt_roundepu16_ph(U, A, R)                                \
   1095   ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask(                            \
   1096       (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
   1097 
   /* _mm256_{,mask_,maskz_}cvt_roundepi16_ph: wrappers around
    * __builtin_ia32_vcvtw2ph256_round_mask(src, arg2, mask, R).
    * A is cast to __v16hi, the result to __m256h, the mask to __mmask16.
    *   plain : arg2 = _mm256_undefined_ph(), mask = -1
    *   mask  : arg2 = W,                     mask = U
    *   maskz : arg2 = _mm256_setzero_ph(),   mask = U
    * NOTE(review): the mask/arg2 interaction is defined by the builtin and
    * cannot be confirmed from this header. */
   #define _mm256_cvt_roundepi16_ph(A, R) \
     ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
         (__v16hi)(A), \
         (__v16hf)_mm256_undefined_ph(), \
         (__mmask16)(-1), \
         (int)(R)))

   #define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \
     ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
         (__v16hi)(A), \
         (__v16hf)(W), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \
     ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
         (__v16hi)(A), \
         (__v16hf)_mm256_setzero_ph(), \
         (__mmask16)(U), \
         (int)(R)))
   1110 
   /* _mm256_{,mask_,maskz_}div_round_pd.
    *   plain : __builtin_ia32_vdivpd256_round(A, B, R), A and B cast to __v4df.
    *   mask  : __builtin_ia32_selectpd_256(U, plain(A, B, R), W)
    *   maskz : __builtin_ia32_selectpd_256(U, plain(A, B, R), _mm256_setzero_pd())
    * The masked forms expand the plain macro and hand its result to the select
    * builtin together with the __mmask8 U. */
   #define _mm256_div_round_pd(A, B, R) \
     ((__m256d)__builtin_ia32_vdivpd256_round( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (int)(R)))

   #define _mm256_mask_div_round_pd(W, U, A, B, R) \
     ((__m256d)__builtin_ia32_selectpd_256( \
         (__mmask8)(U), \
         (__v4df)_mm256_div_round_pd((A), (B), (R)), \
         (__v4df)(__m256d)(W)))

   #define _mm256_maskz_div_round_pd(U, A, B, R) \
     ((__m256d)__builtin_ia32_selectpd_256( \
         (__mmask8)(U), \
         (__v4df)_mm256_div_round_pd((A), (B), (R)), \
         (__v4df)_mm256_setzero_pd()))
   1124 
   /* _mm256_{,mask_,maskz_}div_round_ph.
    *   plain : __builtin_ia32_vdivph256_round(A, B, R), A and B cast to __v16hf.
    *   mask  : __builtin_ia32_selectph_256(U, plain(A, B, R), W)
    *   maskz : __builtin_ia32_selectph_256(U, plain(A, B, R), _mm256_setzero_ph())
    * The masked forms expand the plain macro and hand its result to the select
    * builtin together with the __mmask16 U. */
   #define _mm256_div_round_ph(A, B, R) \
     ((__m256h)__builtin_ia32_vdivph256_round( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (int)(R)))

   #define _mm256_mask_div_round_ph(W, U, A, B, R) \
     ((__m256h)__builtin_ia32_selectph_256( \
         (__mmask16)(U), \
         (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
         (__v16hf)(__m256h)(W)))

   #define _mm256_maskz_div_round_ph(U, A, B, R) \
     ((__m256h)__builtin_ia32_selectph_256( \
         (__mmask16)(U), \
         (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
         (__v16hf)_mm256_setzero_ph()))
   1138 
   /* _mm256_{,mask_,maskz_}div_round_ps.
    *   plain : __builtin_ia32_vdivps256_round(A, B, R), A and B cast to __v8sf.
    *   mask  : __builtin_ia32_selectps_256(U, plain(A, B, R), W)
    *   maskz : __builtin_ia32_selectps_256(U, plain(A, B, R), _mm256_setzero_ps())
    * The masked forms expand the plain macro and hand its result to the select
    * builtin together with the __mmask8 U. */
   #define _mm256_div_round_ps(A, B, R) \
     ((__m256)__builtin_ia32_vdivps256_round( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (int)(R)))

   #define _mm256_mask_div_round_ps(W, U, A, B, R) \
     ((__m256)__builtin_ia32_selectps_256( \
         (__mmask8)(U), \
         (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
         (__v8sf)(__m256)(W)))

   #define _mm256_maskz_div_round_ps(U, A, B, R) \
     ((__m256)__builtin_ia32_selectps_256( \
         (__mmask8)(U), \
         (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
         (__v8sf)_mm256_setzero_ps()))
   1152 
   /* _mm256_{,mask_,mask3_,maskz_}fcmadd_round_pch: forward A, B, C, a mask and
    * (int)R to the __builtin_ia32_vfcmaddcph256_round_* builtins. The __m256h
    * operands are reinterpreted as __v8sf for the call; the result is cast back
    * to __m256h.
    *   plain : _mask3 builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the arithmetic and masking behaviour live in the builtins
    * and are not visible in this header. */
   #define _mm256_fcmadd_round_pch(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1172 
   /* _mm256_{,mask_,maskz_}cmul_round_pch: wrappers around
    * __builtin_ia32_vfcmulcph256_round_mask(A, B, arg3, mask, R). The __m256h
    * operands are reinterpreted as __v8sf; the result is cast back to __m256h.
    *   plain : arg3 = _mm256_undefined_ph(), mask = -1
    *   mask  : arg3 = W,                     mask = U
    *   maskz : arg3 = _mm256_setzero_ph(),   mask = U
    * NOTE(review): how arg3 and the mask are combined is defined by the builtin
    * and cannot be confirmed from this header. */
   #define _mm256_cmul_round_pch(A, B, R) \
     ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)_mm256_undefined_ph(), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_cmul_round_pch(W, U, A, B, R) \
     ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(W), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_cmul_round_pch(U, A, B, R) \
     ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)_mm256_setzero_ph(), \
         (__mmask8)(U), \
         (int)(R)))
   1187 
   /* _mm256_{,mask_,maskz_}fixupimm_round_pd: forward A and B (as __v4df),
    * C (as __v4di), (int)imm, a __mmask8 and (int)R to the
    * __builtin_ia32_vfixupimmpd256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the meaning of imm and of the table in C is defined by the
    * builtin, not by this header. */
   #define _mm256_fixupimm_round_pd(A, B, C, imm, R) \
     ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4di)(__m256i)(C), \
         (int)(imm), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
     ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4di)(__m256i)(C), \
         (int)(imm), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
     ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4di)(__m256i)(C), \
         (int)(imm), \
         (__mmask8)(U), \
         (int)(R)))
   1202 
   /* _mm256_{,mask_,maskz_}fixupimm_round_ps: forward A and B (as __v8sf),
    * C (as __v8si), (int)imm, a __mmask8 and (int)R to the
    * __builtin_ia32_vfixupimmps256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the meaning of imm and of the table in C is defined by the
    * builtin, not by this header. */
   #define _mm256_fixupimm_round_ps(A, B, C, imm, R) \
     ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8si)(__m256i)(C), \
         (int)(imm), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
     ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8si)(__m256i)(C), \
         (int)(imm), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
     ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8si)(__m256i)(C), \
         (int)(imm), \
         (__mmask8)(U), \
         (int)(R)))
   1217 
   /* _mm256_{,mask_,mask3_,maskz_}fmadd_round_pd: forward A, B, C (as __v4df),
    * a __mmask8 and (int)R to the __builtin_ia32_vfmaddpd256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): which operand fills masked-off lanes is decided inside the
    * builtins and cannot be confirmed from this header. */
   #define _mm256_fmadd_round_pd(A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1237 
   /* _mm256_{,mask_,maskz_}fmsub_round_pd: built on the
    * __builtin_ia32_vfmaddpd256_round_* builtins with C negated at the call
    * site; A, B and C are cast to __v4df and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fmsub_round_pd(A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1252 
   /* _mm256_{,mask3_,maskz_}fnmadd_round_pd: built on the
    * __builtin_ia32_vfmaddpd256_round_* builtins with A negated at the call
    * site; A, B and C are cast to __v4df and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fnmadd_round_pd(A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         -(__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
         -(__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
         -(__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1267 
   /* _mm256_{,maskz_}fnmsub_round_pd: built on the
    * __builtin_ia32_vfmaddpd256_round_* builtins with both A and C negated at
    * the call site; A, B and C are cast to __v4df and R to int.
    *   plain : _mask  builtin, mask = -1
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fnmsub_round_pd(A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         -(__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
         -(__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1277 
   /* _mm256_{,mask_,mask3_,maskz_}fmadd_round_ph: forward A, B, C (as __v16hf),
    * a __mmask16 and (int)R to the __builtin_ia32_vfmaddph256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): which operand fills masked-off lanes is decided inside the
    * builtins and cannot be confirmed from this header. */
   #define _mm256_fmadd_round_ph(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)-1, \
         (int)(R)))

   #define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1297 
   /* _mm256_{,mask_,maskz_}fmsub_round_ph: built on the
    * __builtin_ia32_vfmaddph256_round_* builtins with C negated at the call
    * site; A, B and C are cast to __v16hf and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fmsub_round_ph(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)-1, \
         (int)(R)))

   #define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1312 
   /* _mm256_{,mask3_,maskz_}fnmadd_round_ph: built on the
    * __builtin_ia32_vfmaddph256_round_* builtins with one multiplicand negated
    * at the call site; operands are cast to __v16hf and R to int.
    *   plain : _mask  builtin, mask = -1, B is negated
    *   mask3 : _mask3 builtin, mask = U,  A is negated
    *   maskz : _maskz builtin, mask = U,  A is negated
    * The plain form negates B while the other two negate A; either way exactly
    * one factor of the A*B product carries the sign flip. */
   #define _mm256_fnmadd_round_ph(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         -(__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)-1, \
         (int)(R)))

   #define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
         -(__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
         -(__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1327 
   /* _mm256_{,maskz_}fnmsub_round_ph: built on the
    * __builtin_ia32_vfmaddph256_round_* builtins with C and one multiplicand
    * negated at the call site; operands are cast to __v16hf and R to int.
    *   plain : _mask  builtin, mask = -1, B and C are negated
    *   maskz : _maskz builtin, mask = U,  A and C are negated */
   #define _mm256_fnmsub_round_ph(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         -(__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)-1, \
         (int)(R)))

   #define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
         -(__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1337 
   /* _mm256_{,mask_,mask3_,maskz_}fmadd_round_ps: forward A, B, C (as __v8sf),
    * a __mmask8 and (int)R to the __builtin_ia32_vfmaddps256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): which operand fills masked-off lanes is decided inside the
    * builtins and cannot be confirmed from this header. */
   #define _mm256_fmadd_round_ps(A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1357 
   /* _mm256_{,mask_,maskz_}fmsub_round_ps: built on the
    * __builtin_ia32_vfmaddps256_round_* builtins with C negated at the call
    * site; A, B and C are cast to __v8sf and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fmsub_round_ps(A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1372 
   /* _mm256_{,mask3_,maskz_}fnmadd_round_ps: built on the
    * __builtin_ia32_vfmaddps256_round_* builtins with one multiplicand negated
    * at the call site; operands are cast to __v8sf and R to int.
    *   plain : _mask  builtin, mask = -1, B is negated
    *   mask3 : _mask3 builtin, mask = U,  A is negated
    *   maskz : _maskz builtin, mask = U,  A is negated
    * The plain form negates B while the other two negate A; either way exactly
    * one factor of the A*B product carries the sign flip. */
   #define _mm256_fnmadd_round_ps(A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         -(__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
         -(__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
         -(__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1387 
   /* _mm256_{,maskz_}fnmsub_round_ps: built on the
    * __builtin_ia32_vfmaddps256_round_* builtins with C and one multiplicand
    * negated at the call site; operands are cast to __v8sf and R to int.
    *   plain : _mask  builtin, mask = -1, B and C are negated
    *   maskz : _maskz builtin, mask = U,  A and C are negated */
   #define _mm256_fnmsub_round_ps(A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         -(__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
         -(__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1397 
   /* _mm256_{,mask_,mask3_,maskz_}fmadd_round_pch: forward A, B, C, a __mmask8
    * and (int)R to the __builtin_ia32_vfmaddcph256_round_* builtins. The
    * __m256h operands are reinterpreted as __v8sf for the call; the result is
    * cast back to __m256h.
    *   plain : _mask3 builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the arithmetic and masking behaviour live in the builtins
    * and are not visible in this header. */
   #define _mm256_fmadd_round_pch(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \
         (__v8sf)(__m256h)(A), \
         (__v8sf)(__m256h)(B), \
         (__v8sf)(__m256h)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1417 
   /* _mm256_{,mask_,mask3_,maskz_}fmaddsub_round_pd: forward A, B, C (as
    * __v4df), a __mmask8 and (int)R to the
    * __builtin_ia32_vfmaddsubpd256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the per-lane add/subtract pattern is implemented by the
    * builtins and is not visible in this header. */
   #define _mm256_fmaddsub_round_pd(A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1437 
   /* _mm256_{,mask_,maskz_}fmsubadd_round_pd: built on the
    * __builtin_ia32_vfmaddsubpd256_round_* builtins with C negated at the call
    * site; A, B and C are cast to __v4df and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fmsubadd_round_pd(A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1452 
   /* _mm256_{,mask_,mask3_,maskz_}fmaddsub_round_ph: forward A, B, C (as
    * __v16hf), a __mmask16 and (int)R to the
    * __builtin_ia32_vfmaddsubph256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the per-lane add/subtract pattern is implemented by the
    * builtins and is not visible in this header. */
   #define _mm256_fmaddsub_round_ph(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)-1, \
         (int)(R)))

   #define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1472 
   /* _mm256_{,mask_,maskz_}fmsubadd_round_ph: built on the
    * __builtin_ia32_vfmaddsubph256_round_* builtins with C negated at the call
    * site; A, B and C are cast to __v16hf and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fmsubadd_round_ph(A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)-1, \
         (int)(R)))

   #define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1487 
   /* _mm256_{,mask_,mask3_,maskz_}fmaddsub_round_ps: forward A, B, C (as
    * __v8sf), a __mmask8 and (int)R to the
    * __builtin_ia32_vfmaddsubps256_round_* builtins.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   mask3 : _mask3 builtin, mask = U
    *   maskz : _maskz builtin, mask = U
    * NOTE(review): the per-lane add/subtract pattern is implemented by the
    * builtins and is not visible in this header. */
   #define _mm256_fmaddsub_round_ps(A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1507 
   /* _mm256_{,mask_,maskz_}fmsubadd_round_ps: built on the
    * __builtin_ia32_vfmaddsubps256_round_* builtins with C negated at the call
    * site; A, B and C are cast to __v8sf and R to int.
    *   plain : _mask  builtin, mask = -1
    *   mask  : _mask  builtin, mask = U
    *   maskz : _maskz builtin, mask = U */
   #define _mm256_fmsubadd_round_ps(A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)-1, \
         (int)(R)))

   #define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   /* Remaining masked double-precision FMA wrappers taking an explicit (int)R.
    * Vector operands are cast to __v4df and U to __mmask8.
    *   mask3_fmsub    : __builtin_ia32_vfmsubpd256_round_mask3(A, B, C, U, R)
    *   mask3_fmsubadd : __builtin_ia32_vfmsubaddpd256_round_mask3(A, B, C, U, R)
    *   mask_fnmadd    : __builtin_ia32_vfmaddpd256_round_mask(A, -B, C, U, R)
    *   mask_fnmsub    : __builtin_ia32_vfmaddpd256_round_mask(A, -B, -C, U, R)
    *   mask3_fnmsub   : __builtin_ia32_vfmsubpd256_round_mask3(-A, B, C, U, R) */
   #define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \
     ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \
     ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \
         (__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         -(__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \
     ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
         (__v4df)(__m256d)(A), \
         -(__v4df)(__m256d)(B), \
         -(__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \
     ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
         -(__v4df)(__m256d)(A), \
         (__v4df)(__m256d)(B), \
         (__v4df)(__m256d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1546 
   /* Remaining masked half-precision FMA wrappers taking an explicit (int)R.
    * Vector operands are cast to __v16hf and U to __mmask16.
    *   mask3_fmsub    : __builtin_ia32_vfmsubph256_round_mask3(A, B, C, U, R)
    *   mask3_fmsubadd : __builtin_ia32_vfmsubaddph256_round_mask3(A, B, C, U, R)
    *   mask_fnmadd    : __builtin_ia32_vfmaddph256_round_mask(A, -B, C, U, R)
    *   mask_fnmsub    : __builtin_ia32_vfmaddph256_round_mask(A, -B, -C, U, R)
    *   mask3_fnmsub   : __builtin_ia32_vfmsubph256_round_mask3(-A, B, C, U, R) */
   #define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \
         (__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         -(__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \
     ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
         (__v16hf)(__m256h)(A), \
         -(__v16hf)(__m256h)(B), \
         -(__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))

   #define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \
     ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
         -(__v16hf)(__m256h)(A), \
         (__v16hf)(__m256h)(B), \
         (__v16hf)(__m256h)(C), \
         (__mmask16)(U), \
         (int)(R)))
   1571 
   /* Remaining masked single-precision FMA wrappers taking an explicit (int)R.
    * Vector operands are cast to __v8sf and U to __mmask8.
    *   mask3_fmsub    : __builtin_ia32_vfmsubps256_round_mask3(A, B, C, U, R)
    *   mask3_fmsubadd : __builtin_ia32_vfmsubaddps256_round_mask3(A, B, C, U, R)
    *   mask_fnmadd    : __builtin_ia32_vfmaddps256_round_mask(A, -B, C, U, R)
    *   mask_fnmsub    : __builtin_ia32_vfmaddps256_round_mask(A, -B, -C, U, R)
    *   mask3_fnmsub   : __builtin_ia32_vfmsubps256_round_mask3(-A, B, C, U, R) */
   #define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \
     ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \
     ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \
         (__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         -(__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \
     ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
         (__v8sf)(__m256)(A), \
         -(__v8sf)(__m256)(B), \
         -(__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))

   #define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \
     ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
         -(__v8sf)(__m256)(A), \
         (__v8sf)(__m256)(B), \
         (__v8sf)(__m256)(C), \
         (__mmask8)(U), \
         (int)(R)))
   1596 
   1597 #define _mm256_mul_round_pch(A, B, R)                                          \
   1598   ((__m256h)__builtin_ia32_vfmulcph256_round_mask(                             \
   1599       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B),                              \
   1600       (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
   1601 
   1602 #define _mm256_mask_mul_round_pch(W, U, A, B, R)                               \
   1603   ((__m256h)__builtin_ia32_vfmulcph256_round_mask(                             \
   1604       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W),        \
   1605       (__mmask8)(U), (int)(R)))
   1606 
   1607 #define _mm256_maskz_mul_round_pch(U, A, B, R)                                 \
   1608   ((__m256h)__builtin_ia32_vfmulcph256_round_mask(                             \
   1609       (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B),                              \
   1610       (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
   1611 
   1612 #define _mm256_getexp_round_pd(A, R)                                           \
   1613   ((__m256d)__builtin_ia32_vgetexppd256_round_mask(                            \
   1614       (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1,       \
   1615       (int)(R)))
   1616 
   1617 #define _mm256_mask_getexp_round_pd(W, U, A, R)                                \
   1618   ((__m256d)__builtin_ia32_vgetexppd256_round_mask(                            \
   1619       (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
   1620 
   1621 #define _mm256_maskz_getexp_round_pd(U, A, R)                                  \
   1622   ((__m256d)__builtin_ia32_vgetexppd256_round_mask(                            \
   1623       (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),        \
   1624       (int)(R)))
   1625 
   1626 #define _mm256_getexp_round_ph(A, R)                                           \
   1627   ((__m256h)__builtin_ia32_vgetexpph256_round_mask(                            \
   1628       (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1,    \
   1629       (int)(R)))
   1630 
   1631 #define _mm256_mask_getexp_round_ph(W, U, A, R)                                \
   1632   ((__m256h)__builtin_ia32_vgetexpph256_round_mask(                            \
   1633       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))
   1634 
   1635 #define _mm256_maskz_getexp_round_ph(U, A, R)                                  \
   1636   ((__m256h)__builtin_ia32_vgetexpph256_round_mask(                            \
   1637       (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U),     \
   1638       (int)(R)))
   1639 
   1640 #define _mm256_getexp_round_ps(A, R)                                           \
   1641   ((__m256)__builtin_ia32_vgetexpps256_round_mask(                             \
   1642       (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1,        \
   1643       (int)(R)))
   1644 
   1645 #define _mm256_mask_getexp_round_ps(W, U, A, R)                                \
   1646   ((__m256)__builtin_ia32_vgetexpps256_round_mask(                             \
   1647       (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
   1648 
   1649 #define _mm256_maskz_getexp_round_ps(U, A, R)                                  \
   1650   ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A),         \
   1651                                                   (__v8sf)_mm256_setzero_ps(), \
   1652                                                   (__mmask8)(U), (int)(R)))
   1653 
   1654 #define _mm256_getmant_round_pd(A, B, C, R)                                    \
   1655   ((__m256d)__builtin_ia32_vgetmantpd256_round_mask(                           \
   1656       (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)),                           \
   1657       (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
   1658 
   1659 #define _mm256_mask_getmant_round_pd(W, U, A, B, C, R)                         \
   1660   ((__m256d)__builtin_ia32_vgetmantpd256_round_mask(                           \
   1661       (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W),     \
   1662       (__mmask8)(U), (int)(R)))
   1663 
   1664 #define _mm256_maskz_getmant_round_pd(U, A, B, C, R)                           \
   1665   ((__m256d)__builtin_ia32_vgetmantpd256_round_mask(                           \
   1666       (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)),                           \
   1667       (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
   1668 
   1669 #define _mm256_getmant_round_ph(A, B, C, R)                                    \
   1670   ((__m256h)__builtin_ia32_vgetmantph256_round_mask(                           \
   1671       (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
   1672       (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
   1673 
   1674 #define _mm256_mask_getmant_round_ph(W, U, A, B, C, R)                         \
   1675   ((__m256h)__builtin_ia32_vgetmantph256_round_mask(                           \
   1676       (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
   1677       (__mmask16)(U), (int)(R)))
   1678 
   1679 #define _mm256_maskz_getmant_round_ph(U, A, B, C, R)                           \
   1680   ((__m256h)__builtin_ia32_vgetmantph256_round_mask(                           \
   1681       (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
   1682       (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
   1683 
   1684 #define _mm256_getmant_round_ps(A, B, C, R)                                    \
   1685   ((__m256)__builtin_ia32_vgetmantps256_round_mask(                            \
   1686       (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)),                            \
   1687       (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
   1688 
   1689 #define _mm256_mask_getmant_round_ps(W, U, A, B, C, R)                         \
   1690   ((__m256)__builtin_ia32_vgetmantps256_round_mask(                            \
   1691       (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W),       \
   1692       (__mmask8)(U), (int)(R)))
   1693 
   1694 #define _mm256_maskz_getmant_round_ps(U, A, B, C, R)                           \
   1695   ((__m256)__builtin_ia32_vgetmantps256_round_mask(                            \
   1696       (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)),                            \
   1697       (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
   1698 
   1699 #define _mm256_max_round_pd(A, B, R)                                           \
   1700   ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A),               \
   1701                                            (__v4df)(__m256d)(B), (int)(R)))
   1702 
   1703 #define _mm256_mask_max_round_pd(W, U, A, B, R)                                \
   1704   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1705       (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)),               \
   1706       (__v4df)(__m256d)(W)))
   1707 
   1708 #define _mm256_maskz_max_round_pd(U, A, B, R)                                  \
   1709   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1710       (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)),               \
   1711       (__v4df)_mm256_setzero_pd()))
   1712 
   1713 #define _mm256_max_round_ph(A, B, R)                                           \
   1714   ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A),              \
   1715                                            (__v16hf)(__m256h)(B), (int)(R)))
   1716 
   1717 #define _mm256_mask_max_round_ph(W, U, A, B, R)                                \
   1718   ((__m256h)__builtin_ia32_selectph_256(                                       \
   1719       (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)),             \
   1720       (__v16hf)(__m256h)(W)))
   1721 
   1722 #define _mm256_maskz_max_round_ph(U, A, B, R)                                  \
   1723   ((__m256h)__builtin_ia32_selectph_256(                                       \
   1724       (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)),             \
   1725       (__v16hf)_mm256_setzero_ph()))
   1726 
   1727 #define _mm256_max_round_ps(A, B, R)                                           \
   1728   ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A),                 \
   1729                                           (__v8sf)(__m256)(B), (int)(R)))
   1730 
   1731 #define _mm256_mask_max_round_ps(W, U, A, B, R)                                \
   1732   ((__m256)__builtin_ia32_selectps_256(                                        \
   1733       (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)),               \
   1734       (__v8sf)(__m256)(W)))
   1735 
   1736 #define _mm256_maskz_max_round_ps(U, A, B, R)                                  \
   1737   ((__m256)__builtin_ia32_selectps_256(                                        \
   1738       (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)),               \
   1739       (__v8sf)_mm256_setzero_ps()))
   1740 
   1741 #define _mm256_min_round_pd(A, B, R)                                           \
   1742   ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A),               \
   1743                                            (__v4df)(__m256d)(B), (int)(R)))
   1744 
   1745 #define _mm256_mask_min_round_pd(W, U, A, B, R)                                \
   1746   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1747       (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)),               \
   1748       (__v4df)(__m256d)(W)))
   1749 
   1750 #define _mm256_maskz_min_round_pd(U, A, B, R)                                  \
   1751   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1752       (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)),               \
   1753       (__v4df)_mm256_setzero_pd()))
   1754 
   1755 #define _mm256_min_round_ph(A, B, R)                                           \
   1756   ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A),              \
   1757                                            (__v16hf)(__m256h)(B), (int)(R)))
   1758 
   1759 #define _mm256_mask_min_round_ph(W, U, A, B, R)                                \
   1760   ((__m256h)__builtin_ia32_selectph_256(                                       \
   1761       (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)),             \
   1762       (__v16hf)(__m256h)(W)))
   1763 
   1764 #define _mm256_maskz_min_round_ph(U, A, B, R)                                  \
   1765   ((__m256h)__builtin_ia32_selectph_256(                                       \
   1766       (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)),             \
   1767       (__v16hf)_mm256_setzero_ph()))
   1768 
   1769 #define _mm256_min_round_ps(A, B, R)                                           \
   1770   ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A),                 \
   1771                                           (__v8sf)(__m256)(B), (int)(R)))
   1772 
   1773 #define _mm256_mask_min_round_ps(W, U, A, B, R)                                \
   1774   ((__m256)__builtin_ia32_selectps_256(                                        \
   1775       (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)),               \
   1776       (__v8sf)(__m256)(W)))
   1777 
   1778 #define _mm256_maskz_min_round_ps(U, A, B, R)                                  \
   1779   ((__m256)__builtin_ia32_selectps_256(                                        \
   1780       (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)),               \
   1781       (__v8sf)_mm256_setzero_ps()))
   1782 
   1783 #define _mm256_mul_round_pd(A, B, R)                                           \
   1784   ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A),               \
   1785                                            (__v4df)(__m256d)(B), (int)(R)))
   1786 
   1787 #define _mm256_mask_mul_round_pd(W, U, A, B, R)                                \
   1788   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1789       (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)),               \
   1790       (__v4df)(__m256d)(W)))
   1791 
   1792 #define _mm256_maskz_mul_round_pd(U, A, B, R)                                  \
   1793   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1794       (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)),               \
   1795       (__v4df)_mm256_setzero_pd()))
   1796 
   1797 #define _mm256_mul_round_ph(A, B, R)                                           \
   1798   ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A),              \
   1799                                            (__v16hf)(__m256h)(B), (int)(R)))
   1800 
   1801 #define _mm256_mask_mul_round_ph(W, U, A, B, R)                                \
   1802   ((__m256h)__builtin_ia32_selectph_256(                                       \
   1803       (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)),             \
   1804       (__v16hf)(__m256h)(W)))
   1805 
   1806 #define _mm256_maskz_mul_round_ph(U, A, B, R)                                  \
   1807   ((__m256h)__builtin_ia32_selectph_256(                                       \
   1808       (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)),             \
   1809       (__v16hf)_mm256_setzero_ph()))
   1810 
   1811 #define _mm256_mul_round_ps(A, B, R)                                           \
   1812   ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A),                 \
   1813                                           (__v8sf)(__m256)(B), (int)(R)))
   1814 
   1815 #define _mm256_mask_mul_round_ps(W, U, A, B, R)                                \
   1816   ((__m256)__builtin_ia32_selectps_256(                                        \
   1817       (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)),               \
   1818       (__v8sf)(__m256)(W)))
   1819 
   1820 #define _mm256_maskz_mul_round_ps(U, A, B, R)                                  \
   1821   ((__m256)__builtin_ia32_selectps_256(                                        \
   1822       (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)),               \
   1823       (__v8sf)_mm256_setzero_ps()))
   1824 
   1825 #define _mm256_range_round_pd(A, B, C, R)                                      \
   1826   ((__m256d)__builtin_ia32_vrangepd256_round_mask(                             \
   1827       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
   1828       (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))
   1829 
   1830 #define _mm256_mask_range_round_pd(W, U, A, B, C, R)                           \
   1831   ((__m256d)__builtin_ia32_vrangepd256_round_mask(                             \
   1832       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
   1833       (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
   1834 
   1835 #define _mm256_maskz_range_round_pd(U, A, B, C, R)                             \
   1836   ((__m256d)__builtin_ia32_vrangepd256_round_mask(                             \
   1837       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
   1838       (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
   1839 
   1840 #define _mm256_range_round_ps(A, B, C, R)                                      \
   1841   ((__m256)__builtin_ia32_vrangeps256_round_mask(                              \
   1842       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
   1843       (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))
   1844 
   1845 #define _mm256_mask_range_round_ps(W, U, A, B, C, R)                           \
   1846   ((__m256)__builtin_ia32_vrangeps256_round_mask(                              \
   1847       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
   1848       (__mmask8)(U), (int)(R)))
   1849 
   1850 #define _mm256_maskz_range_round_ps(U, A, B, C, R)                             \
   1851   ((__m256)__builtin_ia32_vrangeps256_round_mask(                              \
   1852       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
   1853       (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
   1854 
   1855 #define _mm256_reduce_round_pd(A, B, R)                                        \
   1856   ((__m256d)__builtin_ia32_vreducepd256_round_mask(                            \
   1857       (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(),             \
   1858       (__mmask8)-1, (int)(R)))
   1859 
   1860 #define _mm256_mask_reduce_round_pd(W, U, A, B, R)                             \
   1861   ((__m256d)__builtin_ia32_vreducepd256_round_mask(                            \
   1862       (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U),     \
   1863       (int)(R)))
   1864 
   1865 #define _mm256_maskz_reduce_round_pd(U, A, B, R)                               \
   1866   ((__m256d)__builtin_ia32_vreducepd256_round_mask(                            \
   1867       (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(),             \
   1868       (__mmask8)(U), (int)(R)))
   1869 
   1870 #define _mm256_mask_reduce_round_ph(W, U, A, imm, R)                           \
   1871   ((__m256h)__builtin_ia32_vreduceph256_round_mask(                            \
   1872       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
   1873       (__mmask16)(U), (int)(R)))
   1874 
   1875 #define _mm256_maskz_reduce_round_ph(U, A, imm, R)                             \
   1876   ((__m256h)__builtin_ia32_vreduceph256_round_mask(                            \
   1877       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
   1878       (__mmask16)(U), (int)(R)))
   1879 
   1880 #define _mm256_reduce_round_ph(A, imm, R)                                      \
   1881   ((__m256h)__builtin_ia32_vreduceph256_round_mask(                            \
   1882       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(),       \
   1883       (__mmask16)-1, (int)(R)))
   1884 
   1885 #define _mm256_reduce_round_ps(A, B, R)                                        \
   1886   ((__m256)__builtin_ia32_vreduceps256_round_mask(                             \
   1887       (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(),              \
   1888       (__mmask8)-1, (int)(R)))
   1889 
   1890 #define _mm256_mask_reduce_round_ps(W, U, A, B, R)                             \
   1891   ((__m256)__builtin_ia32_vreduceps256_round_mask(                             \
   1892       (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U),       \
   1893       (int)(R)))
   1894 
   1895 #define _mm256_maskz_reduce_round_ps(U, A, B, R)                               \
   1896   ((__m256)__builtin_ia32_vreduceps256_round_mask(                             \
   1897       (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(),              \
   1898       (__mmask8)(U), (int)(R)))
   1899 
   1900 #define _mm256_roundscale_round_pd(A, imm, R)                                  \
   1901   ((__m256d)__builtin_ia32_vrndscalepd256_round_mask(                          \
   1902       (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(),         \
   1903       (__mmask8)-1, (int)(R)))
   1904 
   1905 #define _mm256_mask_roundscale_round_pd(A, B, C, imm, R)                       \
   1906   ((__m256d)__builtin_ia32_vrndscalepd256_round_mask(                          \
   1907       (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B),   \
   1908       (int)(R)))
   1909 
   1910 #define _mm256_maskz_roundscale_round_pd(A, B, imm, R)                         \
   1911   ((__m256d)__builtin_ia32_vrndscalepd256_round_mask(                          \
   1912       (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(),           \
   1913       (__mmask8)(A), (int)(R)))
   1914 
   1915 #define _mm256_roundscale_round_ph(A, imm, R)                                  \
   1916   ((__m256h)__builtin_ia32_vrndscaleph256_round_mask(                          \
   1917       (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(),       \
   1918       (__mmask16)-1, (int)(R)))
   1919 
   1920 #define _mm256_mask_roundscale_round_ph(A, B, C, imm, R)                       \
   1921   ((__m256h)__builtin_ia32_vrndscaleph256_round_mask(                          \
   1922       (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A),                \
   1923       (__mmask16)(B), (int)(R)))
   1924 
   1925 #define _mm256_maskz_roundscale_round_ph(A, B, imm, R)                         \
   1926   ((__m256h)__builtin_ia32_vrndscaleph256_round_mask(                          \
   1927       (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
   1928       (__mmask16)(A), (int)(R)))
   1929 
   1930 #define _mm256_roundscale_round_ps(A, imm, R)                                  \
   1931   ((__m256)__builtin_ia32_vrndscaleps256_round_mask(                           \
   1932       (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(),          \
   1933       (__mmask8)-1, (int)(R)))
   1934 
   1935 #define _mm256_mask_roundscale_round_ps(A, B, C, imm, R)                       \
   1936   ((__m256)__builtin_ia32_vrndscaleps256_round_mask(                           \
   1937       (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B),     \
   1938       (int)(R)))
   1939 
   1940 #define _mm256_maskz_roundscale_round_ps(A, B, imm, R)                         \
   1941   ((__m256)__builtin_ia32_vrndscaleps256_round_mask(                           \
   1942       (__v8sf)(__m256)(B), (int)(imm), (__v8sf)_mm256_setzero_ps(),            \
   1943       (__mmask8)(A), (int)(R)))
   1944 
   1945 #define _mm256_scalef_round_pd(A, B, R)                                        \
   1946   ((__m256d)__builtin_ia32_vscalefpd256_round_mask(                            \
   1947       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B),                              \
   1948       (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
   1949 
   1950 #define _mm256_mask_scalef_round_pd(W, U, A, B, R)                             \
   1951   ((__m256d)__builtin_ia32_vscalefpd256_round_mask(                            \
   1952       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W),        \
   1953       (__mmask8)(U), (int)(R)))
   1954 
   1955 #define _mm256_maskz_scalef_round_pd(U, A, B, R)                               \
   1956   ((__m256d)__builtin_ia32_vscalefpd256_round_mask(                            \
   1957       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \
   1958       (__mmask8)(U), (int)(R)))
   1959 
   1960 #define _mm256_scalef_round_ph(A, B, R)                                        \
   1961   ((__m256h)__builtin_ia32_vscalefph256_round_mask(                            \
   1962       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B),                            \
   1963       (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
   1964 
   1965 #define _mm256_mask_scalef_round_ph(W, U, A, B, R)                             \
   1966   ((__m256h)__builtin_ia32_vscalefph256_round_mask(                            \
   1967       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W),     \
   1968       (__mmask16)(U), (int)(R)))
   1969 
   1970 #define _mm256_maskz_scalef_round_ph(U, A, B, R)                               \
   1971   ((__m256h)__builtin_ia32_vscalefph256_round_mask(                            \
   1972       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B),                            \
   1973       (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
   1974 
   1975 #define _mm256_scalef_round_ps(A, B, R)                                        \
   1976   ((__m256)__builtin_ia32_vscalefps256_round_mask(                             \
   1977       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
   1978       (__mmask8)-1, (int)(R)))
   1979 
   1980 #define _mm256_mask_scalef_round_ps(W, U, A, B, R)                             \
   1981   ((__m256)__builtin_ia32_vscalefps256_round_mask(                             \
   1982       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W),           \
   1983       (__mmask8)(U), (int)(R)))
   1984 
   1985 #define _mm256_maskz_scalef_round_ps(U, A, B, R)                               \
   1986   ((__m256)__builtin_ia32_vscalefps256_round_mask(                             \
   1987       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(),   \
   1988       (__mmask8)(U), (int)(R)))
   1989 
   1990 #define _mm256_sqrt_round_pd(A, R)                                             \
   1991   ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R)))
   1992 
   1993 #define _mm256_mask_sqrt_round_pd(W, U, A, R)                                  \
   1994   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   1995       (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)),                   \
   1996       (__v4df)(__m256d)(W)))
   1997 
   1998 #define _mm256_maskz_sqrt_round_pd(U, A, R)                                    \
   1999   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   2000       (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)),                   \
   2001       (__v4df)_mm256_setzero_pd()))
   2002 
   2003 #define _mm256_sqrt_round_ph(A, R)                                             \
   2004   ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R)))
   2005 
   2006 #define _mm256_mask_sqrt_round_ph(W, U, A, R)                                  \
   2007   ((__m256h)__builtin_ia32_selectph_256(                                       \
   2008       (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)),                 \
   2009       (__v16hf)(__m256h)(W)))
   2010 
   2011 #define _mm256_maskz_sqrt_round_ph(U, A, R)                                    \
   2012   ((__m256h)__builtin_ia32_selectph_256(                                       \
   2013       (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)),                 \
   2014       (__v16hf)_mm256_setzero_ph()))
   2015 
   2016 #define _mm256_sqrt_round_ps(A, R)                                             \
   2017   ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R)))
   2018 
   2019 #define _mm256_mask_sqrt_round_ps(W, U, A, R)                                  \
   2020   ((__m256)__builtin_ia32_selectps_256((__mmask8)(U),                          \
   2021                                        (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
   2022                                        (__v8sf)(__m256)(W)))
   2023 
   2024 #define _mm256_maskz_sqrt_round_ps(U, A, R)                                    \
   2025   ((__m256)__builtin_ia32_selectps_256((__mmask8)(U),                          \
   2026                                        (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
   2027                                        (__v8sf)_mm256_setzero_ps()))
   2028 
   2029 #define _mm256_sub_round_pd(A, B, R)                                           \
   2030   ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A),               \
   2031                                            (__v4df)(__m256d)(B), (int)(R)))
   2032 
   2033 #define _mm256_mask_sub_round_pd(W, U, A, B, R)                                \
   2034   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   2035       (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)),               \
   2036       (__v4df)(__m256d)(W)))
   2037 
   2038 #define _mm256_maskz_sub_round_pd(U, A, B, R)                                  \
   2039   ((__m256d)__builtin_ia32_selectpd_256(                                       \
   2040       (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)),               \
   2041       (__v4df)_mm256_setzero_pd()))
   2042 
   2043 #define _mm256_sub_round_ph(A, B, R)                                           \
   2044   ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A),              \
   2045                                            (__v16hf)(__m256h)(B), (int)(R)))
   2046 
   2047 #define _mm256_mask_sub_round_ph(W, U, A, B, R)                                \
   2048   ((__m256h)__builtin_ia32_selectph_256(                                       \
   2049       (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)),             \
   2050       (__v16hf)(__m256h)(W)))
   2051 
   2052 #define _mm256_maskz_sub_round_ph(U, A, B, R)                                  \
   2053   ((__m256h)__builtin_ia32_selectph_256(                                       \
   2054       (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)),             \
   2055       (__v16hf)_mm256_setzero_ph()))
   2056 
   2057 #define _mm256_sub_round_ps(A, B, R)                                           \
   2058   ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A),                 \
   2059                                           (__v8sf)(__m256)(B), (int)(R)))
   2060 
   2061 #define _mm256_mask_sub_round_ps(W, U, A, B, R)                                \
   2062   ((__m256)__builtin_ia32_selectps_256(                                        \
   2063       (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)),               \
   2064       (__v8sf)(__m256)(W)))
   2065 
   2066 #define _mm256_maskz_sub_round_ps(U, A, B, R)                                  \
   2067   ((__m256)__builtin_ia32_selectps_256(                                        \
   2068       (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)),               \
   2069       (__v8sf)_mm256_setzero_ps()))
   2070 
   2071 #undef __DEFAULT_FN_ATTRS256
   2072 #undef __DEFAULT_FN_ATTRS128
   2073 
   2074 #endif /* __AVX10_2NIINTRIN_H */
   2075 #endif /* __SSE2__ */