zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx512fp16intrin.h (160687B) - Raw


      1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
     11 #endif
     12 
     13 #ifdef __SSE2__
     14 
     15 #ifndef __AVX512FP16INTRIN_H
     16 #define __AVX512FP16INTRIN_H
     17 
     18 /* Define the default attributes for the functions in this file. */
     19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
     20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
     21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
     22 
     23 /* Define the default attributes for the functions in this file. */
     24 #define __DEFAULT_FN_ATTRS512                                                  \
     25   __attribute__((__always_inline__, __nodebug__,                               \
     26                  __target__("avx512fp16,evex512"), __min_vector_width__(512)))
     27 #define __DEFAULT_FN_ATTRS256                                                  \
     28   __attribute__((__always_inline__, __nodebug__,                               \
     29                  __target__("avx512fp16,no-evex512"),                          \
     30                  __min_vector_width__(256)))
     31 #define __DEFAULT_FN_ATTRS128                                                  \
     32   __attribute__((__always_inline__, __nodebug__,                               \
     33                  __target__("avx512fp16,no-evex512"),                          \
     34                  __min_vector_width__(128)))
     35 
     36 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
     37   return __a[0];
     38 }
     39 
     40 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
     41   return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     42 }
     43 
     44 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
     45   return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
     46                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     47 }
     48 
     49 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
     50   return (__m256h)__builtin_ia32_undef256();
     51 }
     52 
     53 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
     54   return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
     55                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
     56                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     57 }
     58 
     59 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
     60   return (__m128h)__builtin_ia32_undef128();
     61 }
     62 
     63 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
     64   return (__m512h)__builtin_ia32_undef512();
     65 }
     66 
     67 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
     68   return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
     69                             __h, __h, __h, __h, __h, __h, __h, __h,
     70                             __h, __h, __h, __h, __h, __h, __h, __h,
     71                             __h, __h, __h, __h, __h, __h, __h, __h};
     72 }
     73 
     74 static __inline __m512h __DEFAULT_FN_ATTRS512
     75 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
     76               _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
     77               _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
     78               _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
     79               _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
     80               _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
     81               _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
     82               _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
     83   return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
     84                             __h25, __h24, __h23, __h22, __h21, __h20, __h19,
     85                             __h18, __h17, __h16, __h15, __h14, __h13, __h12,
     86                             __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
     87                             __h4,  __h3,  __h2,  __h1};
     88 }
     89 
     90 #define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
     91                        h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
     92                        h25, h26, h27, h28, h29, h30, h31, h32)                 \
     93   _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
     94                 (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
     95                 (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
     96                 (h5), (h4), (h3), (h2), (h1))
     97 
     98 static __inline __m512h __DEFAULT_FN_ATTRS512
     99 _mm512_set1_pch(_Float16 _Complex __h) {
    100   return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
    101 }
    102 
    103 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
    104   return (__m128)__a;
    105 }
    106 
    107 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
    108   return (__m256)__a;
    109 }
    110 
    111 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
    112   return (__m512)__a;
    113 }
    114 
    115 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
    116   return (__m128d)__a;
    117 }
    118 
    119 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
    120   return (__m256d)__a;
    121 }
    122 
    123 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
    124   return (__m512d)__a;
    125 }
    126 
    127 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
    128   return (__m128i)__a;
    129 }
    130 
    131 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    132 _mm256_castph_si256(__m256h __a) {
    133   return (__m256i)__a;
    134 }
    135 
    136 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    137 _mm512_castph_si512(__m512h __a) {
    138   return (__m512i)__a;
    139 }
    140 
    141 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
    142   return (__m128h)__a;
    143 }
    144 
    145 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
    146   return (__m256h)__a;
    147 }
    148 
    149 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
    150   return (__m512h)__a;
    151 }
    152 
    153 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
    154   return (__m128h)__a;
    155 }
    156 
    157 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
    158   return (__m256h)__a;
    159 }
    160 
    161 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
    162   return (__m512h)__a;
    163 }
    164 
    165 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
    166   return (__m128h)__a;
    167 }
    168 
    169 static __inline__ __m256h __DEFAULT_FN_ATTRS256
    170 _mm256_castsi256_ph(__m256i __a) {
    171   return (__m256h)__a;
    172 }
    173 
    174 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    175 _mm512_castsi512_ph(__m512i __a) {
    176   return (__m512h)__a;
    177 }
    178 
    179 static __inline__ __m128h __DEFAULT_FN_ATTRS256
    180 _mm256_castph256_ph128(__m256h __a) {
    181   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
    182 }
    183 
    184 static __inline__ __m128h __DEFAULT_FN_ATTRS512
    185 _mm512_castph512_ph128(__m512h __a) {
    186   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
    187 }
    188 
    189 static __inline__ __m256h __DEFAULT_FN_ATTRS512
    190 _mm512_castph512_ph256(__m512h __a) {
    191   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
    192                                  12, 13, 14, 15);
    193 }
    194 
    195 static __inline__ __m256h __DEFAULT_FN_ATTRS256
    196 _mm256_castph128_ph256(__m128h __a) {
    197   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
    198                                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    199 }
    200 
    201 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    202 _mm512_castph128_ph512(__m128h __a) {
    203   __m256h __b = __builtin_nondeterministic_value(__b);
    204   return __builtin_shufflevector(
    205       __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
    206                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
    207       __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    208       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    209 }
    210 
    211 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    212 _mm512_castph256_ph512(__m256h __a) {
    213   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
    214                                  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    215                                  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
    216                                  27, 28, 29, 30, 31);
    217 }
    218 
    219 /// Constructs a 256-bit floating-point vector of [16 x half] from a
    220 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
    221 ///    contain the value of the source vector. The upper 384 bits are set
    222 ///    to zero.
    223 ///
    224 /// \headerfile <x86intrin.h>
    225 ///
    226 /// This intrinsic has no corresponding instruction.
    227 ///
    228 /// \param __a
    229 ///    A 128-bit vector of [8 x half].
    230 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
    231 ///    contain the value of the parameter. The upper 384 bits are set to zero.
    232 static __inline__ __m256h __DEFAULT_FN_ATTRS256
    233 _mm256_zextph128_ph256(__m128h __a) {
    234   return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
    235                                  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    236 }
    237 
    238 /// Constructs a 512-bit floating-point vector of [32 x half] from a
    239 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
    240 ///    contain the value of the source vector. The upper 384 bits are set
    241 ///    to zero.
    242 ///
    243 /// \headerfile <x86intrin.h>
    244 ///
    245 /// This intrinsic has no corresponding instruction.
    246 ///
    247 /// \param __a
    248 ///    A 128-bit vector of [8 x half].
    249 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
    250 ///    contain the value of the parameter. The upper 384 bits are set to zero.
    251 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    252 _mm512_zextph128_ph512(__m128h __a) {
    253   return __builtin_shufflevector(
    254       __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
    255       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
    256 }
    257 
    258 /// Constructs a 512-bit floating-point vector of [32 x half] from a
    259 ///    256-bit floating-point vector of [16 x half]. The lower 256 bits
    260 ///    contain the value of the source vector. The upper 256 bits are set
    261 ///    to zero.
    262 ///
    263 /// \headerfile <x86intrin.h>
    264 ///
    265 /// This intrinsic has no corresponding instruction.
    266 ///
    267 /// \param __a
    268 ///    A 256-bit vector of [16 x half].
    269 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
    270 ///    contain the value of the parameter. The upper 256 bits are set to zero.
    271 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    272 _mm512_zextph256_ph512(__m256h __a) {
    273   return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
    274                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    275                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
    276                                  29, 30, 31);
    277 }
    278 
    279 #define _mm_comi_round_sh(A, B, P, R)                                          \
    280   __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))
    281 
    282 #define _mm_comi_sh(A, B, pred)                                                \
    283   _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
    284 
    285 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
    286                                                           __m128h __B) {
    287   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
    288                                 _MM_FROUND_CUR_DIRECTION);
    289 }
    290 
    291 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
    292                                                           __m128h __B) {
    293   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
    294                                 _MM_FROUND_CUR_DIRECTION);
    295 }
    296 
    297 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
    298                                                           __m128h __B) {
    299   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
    300                                 _MM_FROUND_CUR_DIRECTION);
    301 }
    302 
    303 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
    304                                                           __m128h __B) {
    305   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
    306                                 _MM_FROUND_CUR_DIRECTION);
    307 }
    308 
    309 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
    310                                                           __m128h __B) {
    311   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
    312                                 _MM_FROUND_CUR_DIRECTION);
    313 }
    314 
    315 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
    316                                                            __m128h __B) {
    317   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
    318                                 _MM_FROUND_CUR_DIRECTION);
    319 }
    320 
    321 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
    322                                                            __m128h __B) {
    323   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
    324                                 _MM_FROUND_CUR_DIRECTION);
    325 }
    326 
    327 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
    328                                                            __m128h __B) {
    329   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
    330                                 _MM_FROUND_CUR_DIRECTION);
    331 }
    332 
    333 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
    334                                                            __m128h __B) {
    335   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
    336                                 _MM_FROUND_CUR_DIRECTION);
    337 }
    338 
    339 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
    340                                                            __m128h __B) {
    341   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
    342                                 _MM_FROUND_CUR_DIRECTION);
    343 }
    344 
    345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
    346                                                            __m128h __B) {
    347   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
    348                                 _MM_FROUND_CUR_DIRECTION);
    349 }
    350 
    351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
    352                                                             __m128h __B) {
    353   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
    354                                 _MM_FROUND_CUR_DIRECTION);
    355 }
    356 
    357 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
    358                                                               __m512h __B) {
    359   return (__m512h)((__v32hf)__A + (__v32hf)__B);
    360 }
    361 
    362 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    363 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
    364   return (__m512h)__builtin_ia32_selectph_512(
    365       (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
    366 }
    367 
    368 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    369 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
    370   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
    371                                               (__v32hf)_mm512_add_ph(__A, __B),
    372                                               (__v32hf)_mm512_setzero_ph());
    373 }
    374 
    375 #define _mm512_add_round_ph(A, B, R)                                           \
    376   ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
    377                                     (__v32hf)(__m512h)(B), (int)(R)))
    378 
    379 #define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
    380   ((__m512h)__builtin_ia32_selectph_512(                                       \
    381       (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
    382       (__v32hf)(__m512h)(W)))
    383 
    384 #define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
    385   ((__m512h)__builtin_ia32_selectph_512(                                       \
    386       (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
    387       (__v32hf)_mm512_setzero_ph()))
    388 
    389 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
    390                                                               __m512h __B) {
    391   return (__m512h)((__v32hf)__A - (__v32hf)__B);
    392 }
    393 
    394 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    395 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
    396   return (__m512h)__builtin_ia32_selectph_512(
    397       (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
    398 }
    399 
    400 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    401 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
    402   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
    403                                               (__v32hf)_mm512_sub_ph(__A, __B),
    404                                               (__v32hf)_mm512_setzero_ph());
    405 }
    406 
    407 #define _mm512_sub_round_ph(A, B, R)                                           \
    408   ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
    409                                     (__v32hf)(__m512h)(B), (int)(R)))
    410 
    411 #define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
    412   ((__m512h)__builtin_ia32_selectph_512(                                       \
    413       (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
    414       (__v32hf)(__m512h)(W)))
    415 
    416 #define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
    417   ((__m512h)__builtin_ia32_selectph_512(                                       \
    418       (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
    419       (__v32hf)_mm512_setzero_ph()))
    420 
    421 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
    422                                                               __m512h __B) {
    423   return (__m512h)((__v32hf)__A * (__v32hf)__B);
    424 }
    425 
    426 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    427 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
    428   return (__m512h)__builtin_ia32_selectph_512(
    429       (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
    430 }
    431 
    432 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    433 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
    434   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
    435                                               (__v32hf)_mm512_mul_ph(__A, __B),
    436                                               (__v32hf)_mm512_setzero_ph());
    437 }
    438 
    439 #define _mm512_mul_round_ph(A, B, R)                                           \
    440   ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
    441                                     (__v32hf)(__m512h)(B), (int)(R)))
    442 
    443 #define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
    444   ((__m512h)__builtin_ia32_selectph_512(                                       \
    445       (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
    446       (__v32hf)(__m512h)(W)))
    447 
    448 #define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
    449   ((__m512h)__builtin_ia32_selectph_512(                                       \
    450       (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
    451       (__v32hf)_mm512_setzero_ph()))
    452 
    453 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
    454                                                               __m512h __B) {
    455   return (__m512h)((__v32hf)__A / (__v32hf)__B);
    456 }
    457 
    458 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    459 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
    460   return (__m512h)__builtin_ia32_selectph_512(
    461       (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
    462 }
    463 
    464 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    465 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
    466   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
    467                                               (__v32hf)_mm512_div_ph(__A, __B),
    468                                               (__v32hf)_mm512_setzero_ph());
    469 }
    470 
    471 #define _mm512_div_round_ph(A, B, R)                                           \
    472   ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
    473                                     (__v32hf)(__m512h)(B), (int)(R)))
    474 
    475 #define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
    476   ((__m512h)__builtin_ia32_selectph_512(                                       \
    477       (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
    478       (__v32hf)(__m512h)(W)))
    479 
    480 #define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
    481   ((__m512h)__builtin_ia32_selectph_512(                                       \
    482       (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
    483       (__v32hf)_mm512_setzero_ph()))
    484 
    485 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
    486                                                               __m512h __B) {
    487   return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
    488                                           _MM_FROUND_CUR_DIRECTION);
    489 }
    490 
    491 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    492 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
    493   return (__m512h)__builtin_ia32_selectph_512(
    494       (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
    495 }
    496 
    497 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    498 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
    499   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
    500                                               (__v32hf)_mm512_min_ph(__A, __B),
    501                                               (__v32hf)_mm512_setzero_ph());
    502 }
    503 
    504 #define _mm512_min_round_ph(A, B, R)                                           \
    505   ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
    506                                     (__v32hf)(__m512h)(B), (int)(R)))
    507 
    508 #define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
    509   ((__m512h)__builtin_ia32_selectph_512(                                       \
    510       (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
    511       (__v32hf)(__m512h)(W)))
    512 
    513 #define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
    514   ((__m512h)__builtin_ia32_selectph_512(                                       \
    515       (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
    516       (__v32hf)_mm512_setzero_ph()))
    517 
    518 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
    519                                                               __m512h __B) {
    520   return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
    521                                           _MM_FROUND_CUR_DIRECTION);
    522 }
    523 
    524 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    525 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
    526   return (__m512h)__builtin_ia32_selectph_512(
    527       (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
    528 }
    529 
    530 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    531 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
    532   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
    533                                               (__v32hf)_mm512_max_ph(__A, __B),
    534                                               (__v32hf)_mm512_setzero_ph());
    535 }
    536 
    537 #define _mm512_max_round_ph(A, B, R)                                           \
    538   ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
    539                                     (__v32hf)(__m512h)(B), (int)(R)))
    540 
    541 #define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
    542   ((__m512h)__builtin_ia32_selectph_512(                                       \
    543       (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
    544       (__v32hf)(__m512h)(W)))
    545 
    546 #define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
    547   ((__m512h)__builtin_ia32_selectph_512(                                       \
    548       (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
    549       (__v32hf)_mm512_setzero_ph()))
    550 
    551 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
    552   return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
    553 }
    554 
    555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
    556   return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
    557 }
    558 
    559 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    560 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
    561   return (__m512h)__builtin_ia32_selectps_512(
    562       (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
    563 }
    564 
    565 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    566 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
    567   return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
    568                                               (__v16sf)_mm512_conj_pch(__A),
    569                                               (__v16sf)_mm512_setzero_ps());
    570 }
    571 
    572 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
    573                                                            __m128h __B) {
    574   __A[0] += __B[0];
    575   return __A;
    576 }
    577 
    578 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
    579                                                                 __mmask8 __U,
    580                                                                 __m128h __A,
    581                                                                 __m128h __B) {
    582   __A = _mm_add_sh(__A, __B);
    583   return __builtin_ia32_selectsh_128(__U, __A, __W);
    584 }
    585 
    586 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
    587                                                                  __m128h __A,
    588                                                                  __m128h __B) {
    589   __A = _mm_add_sh(__A, __B);
    590   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
    591 }
    592 
    593 #define _mm_add_round_sh(A, B, R)                                              \
    594   ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
    595       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    596       (__mmask8)-1, (int)(R)))
    597 
    598 #define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
    599   ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
    600       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
    601       (__mmask8)(U), (int)(R)))
    602 
    603 #define _mm_maskz_add_round_sh(U, A, B, R)                                     \
    604   ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
    605       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    606       (__mmask8)(U), (int)(R)))
    607 
    608 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
    609                                                            __m128h __B) {
    610   __A[0] -= __B[0];
    611   return __A;
    612 }
    613 
    /* Merge-masked scalar subtract: computes _mm_sub_sh(__A, __B) and passes it
     * to __builtin_ia32_selectsh_128 with the mask __U and the fallback __W. */
    614 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
    615                                                                 __mmask8 __U,
    616                                                                 __m128h __A,
    617                                                                 __m128h __B) {
    618   /* The difference is produced first; the select builtin then applies __U. */
    619   return __builtin_ia32_selectsh_128(__U, _mm_sub_sh(__A, __B), __W);
    620 }
    621 
    /* Zero-masked scalar subtract: the select builtin falls back to
     * _mm_setzero_ph() instead of a caller-supplied vector. */
    622 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
    623                                                                  __m128h __A,
    624                                                                  __m128h __B) {
    625   return __builtin_ia32_selectsh_128(__U, _mm_sub_sh(__A, __B),
    626                                      _mm_setzero_ph());
    627 }
    628 
    /* Scalar subtract with an explicit rounding argument R, forwarded as the
     * last operand of __builtin_ia32_subsh_round_mask. Unmasked form: zero
     * passthrough and an all-ones mask. */
    629 #define _mm_sub_round_sh(A, B, R)                                              \
    630   ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
    631       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    632       (__mmask8)-1, (int)(R)))
    633 
    /* Merge-masked form: W is the passthrough operand and U the mask. */
    634 #define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
    635   ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
    636       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
    637       (__mmask8)(U), (int)(R)))
    638 
    /* Zero-masked form: zero passthrough combined with mask U. */
    639 #define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
    640   ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
    641       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    642       (__mmask8)(U), (int)(R)))
    643 
    /* Scalar multiply: element 0 of the result is __A[0] * __B[0]; the other
     * elements are returned unchanged from __A. */
    644 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
    645                                                            __m128h __B) {
    646   __A[0] = __A[0] * __B[0];
    647   return __A;
    648 }
    649 
    /* Merge-masked scalar multiply: computes _mm_mul_sh(__A, __B) and passes it
     * to __builtin_ia32_selectsh_128 with the mask __U and the fallback __W. */
    650 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
    651                                                                 __mmask8 __U,
    652                                                                 __m128h __A,
    653                                                                 __m128h __B) {
    654   /* The product is produced first; the select builtin then applies __U. */
    655   return __builtin_ia32_selectsh_128(__U, _mm_mul_sh(__A, __B), __W);
    656 }
    657 
    /* Zero-masked scalar multiply: the select builtin falls back to
     * _mm_setzero_ph() instead of a caller-supplied vector. */
    658 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
    659                                                                  __m128h __A,
    660                                                                  __m128h __B) {
    661   return __builtin_ia32_selectsh_128(__U, _mm_mul_sh(__A, __B),
    662                                      _mm_setzero_ph());
    663 }
    664 
    /* Scalar multiply with an explicit rounding argument R, forwarded as the
     * last operand of __builtin_ia32_mulsh_round_mask. Unmasked form: zero
     * passthrough and an all-ones mask. */
    665 #define _mm_mul_round_sh(A, B, R)                                              \
    666   ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
    667       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    668       (__mmask8)-1, (int)(R)))
    669 
    /* Merge-masked form: W is the passthrough operand and U the mask. */
    670 #define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
    671   ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
    672       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
    673       (__mmask8)(U), (int)(R)))
    674 
    /* Zero-masked form: zero passthrough combined with mask U. */
    675 #define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
    676   ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
    677       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    678       (__mmask8)(U), (int)(R)))
    679 
    /* Scalar divide: element 0 of the result is __A[0] / __B[0]; the other
     * elements are returned unchanged from __A. */
    680 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
    681                                                            __m128h __B) {
    682   __A[0] = __A[0] / __B[0];
    683   return __A;
    684 }
    685 
    /* Merge-masked scalar divide: computes _mm_div_sh(__A, __B) and passes it to
     * __builtin_ia32_selectsh_128 with the mask __U and the fallback __W. */
    686 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
    687                                                                 __mmask8 __U,
    688                                                                 __m128h __A,
    689                                                                 __m128h __B) {
    690   /* The quotient is produced first; the select builtin then applies __U. */
    691   return __builtin_ia32_selectsh_128(__U, _mm_div_sh(__A, __B), __W);
    692 }
    693 
    /* Zero-masked scalar divide: the select builtin falls back to
     * _mm_setzero_ph() instead of a caller-supplied vector. */
    694 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
    695                                                                  __m128h __A,
    696                                                                  __m128h __B) {
    697   return __builtin_ia32_selectsh_128(__U, _mm_div_sh(__A, __B),
    698                                      _mm_setzero_ph());
    699 }
    700 
    /* Scalar divide with an explicit rounding argument R, forwarded as the last
     * operand of __builtin_ia32_divsh_round_mask. Unmasked form: zero
     * passthrough and an all-ones mask. */
    701 #define _mm_div_round_sh(A, B, R)                                              \
    702   ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
    703       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    704       (__mmask8)-1, (int)(R)))
    705 
    /* Merge-masked form: W is the passthrough operand and U the mask. */
    706 #define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
    707   ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
    708       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
    709       (__mmask8)(U), (int)(R)))
    710 
    /* Zero-masked form: zero passthrough combined with mask U. */
    711 #define _mm_maskz_div_round_sh(U, A, B, R)                                     \
    712   ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
    713       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    714       (__mmask8)(U), (int)(R)))
    715 
    /* Scalar half-precision minimum: forwards __A and __B to
     * __builtin_ia32_minsh_round_mask with a zero passthrough, an all-ones mask
     * and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
    716 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
    717                                                            __m128h __B) {
    718   return (__m128h)__builtin_ia32_minsh_round_mask(
    719       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
    720       _MM_FROUND_CUR_DIRECTION);
    721 }
    722 
    /* Merge-masked scalar minimum: same builtin as _mm_min_sh, but __W is passed
     * as the passthrough operand and __U as the mask. */
    723 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
    724                                                                 __mmask8 __U,
    725                                                                 __m128h __A,
    726                                                                 __m128h __B) {
    727   return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
    728                                                   (__v8hf)__W, (__mmask8)__U,
    729                                                   _MM_FROUND_CUR_DIRECTION);
    730 }
    731 
    /* Zero-masked scalar minimum: passes a zero vector as the passthrough
     * operand together with the caller's mask __U. */
    732 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
    733                                                                  __m128h __A,
    734                                                                  __m128h __B) {
    735   return (__m128h)__builtin_ia32_minsh_round_mask(
    736       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
    737       _MM_FROUND_CUR_DIRECTION);
    738 }
    739 
    /* Scalar minimum with a caller-supplied last argument R in place of
     * _MM_FROUND_CUR_DIRECTION. Unmasked form: zero passthrough and an all-ones
     * mask. */
    740 #define _mm_min_round_sh(A, B, R)                                              \
    741   ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
    742       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    743       (__mmask8)-1, (int)(R)))
    744 
    /* Merge-masked form: W is the passthrough operand and U the mask. */
    745 #define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
    746   ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
    747       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
    748       (__mmask8)(U), (int)(R)))
    749 
    /* Zero-masked form: zero passthrough combined with mask U. */
    750 #define _mm_maskz_min_round_sh(U, A, B, R)                                     \
    751   ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
    752       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    753       (__mmask8)(U), (int)(R)))
    754 
    /* Scalar half-precision maximum: forwards __A and __B to
     * __builtin_ia32_maxsh_round_mask with a zero passthrough, an all-ones mask
     * and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
    755 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
    756                                                            __m128h __B) {
    757   return (__m128h)__builtin_ia32_maxsh_round_mask(
    758       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
    759       _MM_FROUND_CUR_DIRECTION);
    760 }
    761 
    /* Merge-masked scalar maximum: same builtin as _mm_max_sh, but __W is passed
     * as the passthrough operand and __U as the mask. */
    762 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
    763                                                                 __mmask8 __U,
    764                                                                 __m128h __A,
    765                                                                 __m128h __B) {
    766   return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
    767                                                   (__v8hf)__W, (__mmask8)__U,
    768                                                   _MM_FROUND_CUR_DIRECTION);
    769 }
    770 
    /* Zero-masked scalar maximum: passes a zero vector as the passthrough
     * operand together with the caller's mask __U. */
    771 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
    772                                                                  __m128h __A,
    773                                                                  __m128h __B) {
    774   return (__m128h)__builtin_ia32_maxsh_round_mask(
    775       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
    776       _MM_FROUND_CUR_DIRECTION);
    777 }
    778 
    /* Scalar maximum with a caller-supplied last argument R in place of
     * _MM_FROUND_CUR_DIRECTION. Unmasked form: zero passthrough and an all-ones
     * mask. */
    779 #define _mm_max_round_sh(A, B, R)                                              \
    780   ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
    781       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    782       (__mmask8)-1, (int)(R)))
    783 
    /* Merge-masked form: W is the passthrough operand and U the mask. */
    784 #define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
    785   ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
    786       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
    787       (__mmask8)(U), (int)(R)))
    788 
    /* Zero-masked form: zero passthrough combined with mask U. */
    789 #define _mm_maskz_max_round_sh(U, A, B, R)                                     \
    790   ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
    791       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
    792       (__mmask8)(U), (int)(R)))
    793 
    /* Packed 512-bit half-precision compare. A and B are compared with the
     * predicate P through __builtin_ia32_cmpph512_mask; the result is cast to
     * __mmask32. R is forwarded as the builtin's last operand. Unmasked form:
     * all-ones input mask. */
    794 #define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
    795   ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
    796                                            (__v32hf)(__m512h)(B), (int)(P),    \
    797                                            (__mmask32)-1, (int)(R)))
    798 
    /* Masked form: U is passed as the builtin's input mask. */
    799 #define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
    800   ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
    801                                            (__v32hf)(__m512h)(B), (int)(P),    \
    802                                            (__mmask32)(U), (int)(R)))
    803 
    /* Convenience form: the round variant with _MM_FROUND_CUR_DIRECTION. */
    804 #define _mm512_cmp_ph_mask(A, B, P)                                            \
    805   _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
    806 
    /* Convenience form: the masked round variant with
     * _MM_FROUND_CUR_DIRECTION. */
    807 #define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
    808   _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
    809 
    /* Scalar half-precision compare. X and Y are compared with the predicate P
     * through __builtin_ia32_cmpsh_mask; the result is cast to __mmask8. R is
     * forwarded as the builtin's last operand. Unmasked form: all-ones input
     * mask. */
    810 #define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
    811   ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
    812                                        (__v8hf)(__m128h)(Y), (int)(P),         \
    813                                        (__mmask8)-1, (int)(R)))
    814 
    /* Masked form: M is passed as the builtin's input mask. */
    815 #define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
    816   ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
    817                                        (__v8hf)(__m128h)(Y), (int)(P),         \
    818                                        (__mmask8)-1 == 0 ? 0 : (__mmask8)(M), (int)(R)))
    829 // loads with vmovsh:
    /* Loads one _Float16 from __dp and returns it in element 0 of a vector whose
     * other seven elements are 0. The read goes through a local struct declared
     * packed and may_alias (presumably so __dp needs no particular alignment or
     * effective type - confirm). */
    830 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
    831   struct __mm_load_sh_struct {
    832     _Float16 __u;
    833   } __attribute__((__packed__, __may_alias__));
    834   _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
    835   return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
    836 }
    837 
    /* Merge-masked scalar load. The passthrough handed to
     * __builtin_ia32_loadsh128_mask keeps element 0 of __W and takes every other
     * element from a zero vector (shuffle index 8 is element 0 of the second
     * operand). Only bit 0 of __U is forwarded as the mask.
     * The local uses a reserved (double-underscore) name so that a user macro
     * such as `#define src ...` cannot break this system header. */
    838 static __inline__ __m128h __DEFAULT_FN_ATTRS128
    839 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
    840   __m128h __src = (__v8hf)__builtin_shufflevector(
    841       (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
    842 
    843   return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, __src, __U & 1);
    844 }
    845 
    /* Zero-masked scalar load: calls __builtin_ia32_loadsh128_mask on __A with a
     * zero passthrough vector. Only bit 0 of __U is forwarded as the mask. */
    846 static __inline__ __m128h __DEFAULT_FN_ATTRS128
    847 _mm_maskz_load_sh(__mmask8 __U, const void *__A) {
    848   return (__m128h)__builtin_ia32_loadsh128_mask(
    849       (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
    850 }
    851 
    /* Loads a __m512h by dereferencing __p as a const __m512h pointer. __m512h
     * is declared with 64-byte alignment, so __p is expected to be 64-byte
     * aligned. */
    852 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    853 _mm512_load_ph(void const *__p) {
    854   return *(const __m512h *)__p;
    855 }
    856 
    /* Loads a __m256h by dereferencing __p as a const __m256h pointer.
     * NOTE(review): presumably __p must satisfy the alignment of __m256h, whose
     * typedef is not visible here - confirm. */
    857 static __inline__ __m256h __DEFAULT_FN_ATTRS256
    858 _mm256_load_ph(void const *__p) {
    859   return *(const __m256h *)__p;
    860 }
    861 
    /* Loads a __m128h by dereferencing __p as a const __m128h pointer.
     * NOTE(review): presumably __p must satisfy the alignment of __m128h, whose
     * typedef is not visible here - confirm. */
    862 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
    863   return *(const __m128h *)__p;
    864 }
    865 
    /* Unaligned 512-bit load. __p is read through a packed, may_alias struct
     * wrapping __m512h_u, which is the 1-byte-aligned variant of __m512h. */
    866 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    867 _mm512_loadu_ph(void const *__p) {
    868   struct __loadu_ph {
    869     __m512h_u __v;
    870   } __attribute__((__packed__, __may_alias__));
    871   return ((const struct __loadu_ph *)__p)->__v;
    872 }
    873 
    /* Unaligned 256-bit load. __p is read through a packed, may_alias struct
     * wrapping __m256h_u (presumably the 1-byte-aligned variant of __m256h -
     * its typedef is not visible here). */
    874 static __inline__ __m256h __DEFAULT_FN_ATTRS256
    875 _mm256_loadu_ph(void const *__p) {
    876   struct __loadu_ph {
    877     __m256h_u __v;
    878   } __attribute__((__packed__, __may_alias__));
    879   return ((const struct __loadu_ph *)__p)->__v;
    880 }
    881 
    /* Unaligned 128-bit load. __p is read through a packed, may_alias struct
     * wrapping __m128h_u (presumably the 1-byte-aligned variant of __m128h -
     * its typedef is not visible here). */
    882 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
    883   struct __loadu_ph {
    884     __m128h_u __v;
    885   } __attribute__((__packed__, __may_alias__));
    886   return ((const struct __loadu_ph *)__p)->__v;
    887 }
    888 
    889 // stores with vmovsh:
    /* Stores element 0 of __a as a single _Float16 at __dp. The write goes
     * through a local struct declared packed and may_alias (presumably so __dp
     * needs no particular alignment or effective type - confirm). */
    890 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
    891                                                           __m128h __a) {
    892   struct __mm_store_sh_struct {
    893     _Float16 __u;
    894   } __attribute__((__packed__, __may_alias__));
    895   ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
    896 }
    897 
    /* Masked scalar store: calls __builtin_ia32_storesh128_mask with the
     * destination __W and the value __A. Only bit 0 of __U is forwarded as the
     * mask. */
    898 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
    899                                                                __mmask8 __U,
    900                                                                __m128h __A) {
    901   __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
    902 }
    903 
    /* Stores __A by assigning through __P cast to a __m512h pointer. __m512h is
     * declared with 64-byte alignment, so __P is expected to be 64-byte
     * aligned. */
    904 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
    905                                                              __m512h __A) {
    906   *(__m512h *)__P = __A;
    907 }
    908 
    /* Stores __A by assigning through __P cast to a __m256h pointer.
     * NOTE(review): presumably __P must satisfy the alignment of __m256h, whose
     * typedef is not visible here - confirm. */
    909 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
    910                                                              __m256h __A) {
    911   *(__m256h *)__P = __A;
    912 }
    913 
    /* Stores __A by assigning through __P cast to a __m128h pointer.
     * NOTE(review): presumably __P must satisfy the alignment of __m128h, whose
     * typedef is not visible here - confirm. */
    914 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
    915                                                           __m128h __A) {
    916   *(__m128h *)__P = __A;
    917 }
    918 
    /* Unaligned 512-bit store. __A is written through a packed, may_alias struct
     * wrapping __m512h_u, which is the 1-byte-aligned variant of __m512h. */
    919 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
    920                                                               __m512h __A) {
    921   struct __storeu_ph {
    922     __m512h_u __v;
    923   } __attribute__((__packed__, __may_alias__));
    924   ((struct __storeu_ph *)__P)->__v = __A;
    925 }
    926 
    /* Unaligned 256-bit store. __A is written through a packed, may_alias struct
     * wrapping __m256h_u (presumably the 1-byte-aligned variant of __m256h -
     * its typedef is not visible here). */
    927 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
    928                                                               __m256h __A) {
    929   struct __storeu_ph {
    930     __m256h_u __v;
    931   } __attribute__((__packed__, __may_alias__));
    932   ((struct __storeu_ph *)__P)->__v = __A;
    933 }
    934 
    /* Unaligned 128-bit store. __A is written through a packed, may_alias struct
     * wrapping __m128h_u (presumably the 1-byte-aligned variant of __m128h -
     * its typedef is not visible here). */
    935 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
    936                                                            __m128h __A) {
    937   struct __storeu_ph {
    938     __m128h_u __v;
    939   } __attribute__((__packed__, __may_alias__));
    940   ((struct __storeu_ph *)__P)->__v = __A;
    941 }
    942 
    943 // moves with vmovsh:
    /* Returns __a with its element 0 replaced by element 0 of __b; the other
     * elements of __a are left as they were. */
    944 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
    945                                                             __m128h __b) {
    946   __a[0] = __b[0];
    947   return __a;
    948 }
    949 
    /* Merge-masked scalar move: computes _mm_move_sh(__A, __B) and passes it to
     * __builtin_ia32_selectsh_128 with the mask __U and the fallback __W. */
    950 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
    951                                                                  __mmask8 __U,
    952                                                                  __m128h __A,
    953                                                                  __m128h __B) {
    954   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
    955 }
    956 
    /* Zero-masked scalar move: performs _mm_move_sh(__A, __B), then lets
     * __builtin_ia32_selectsh_128 choose under __U between that result and a
     * zero vector. */
    957 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
    958                                                                   __m128h __A,
    959                                                                   __m128h __B) {
    960   __A = _mm_move_sh(__A, __B);
    961   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
    962 }
    963 
    964 // vmovw:
    /* Builds a vector of eight 16-bit integers with __a in element 0 and 0 in
     * the other seven, and returns it reinterpreted as __m128i. */
    965 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
    966   return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
    967 }
    968 
    /* Reinterprets __a as eight 16-bit integers and returns element 0. */
    969 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
    970   const __v8hi __words = (__v8hi)__a;
    971   return __words[0];
    972 }
    973 
    /* Packed reciprocal: calls __builtin_ia32_rcpph512_mask on __A with an
     * undefined passthrough and an all-ones mask. */
    974 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
    975   return (__m512h)__builtin_ia32_rcpph512_mask(
    976       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
    977 }
    978 
    /* Merge-masked packed reciprocal: __W is the passthrough operand and __U the
     * mask given to __builtin_ia32_rcpph512_mask. */
    979 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    980 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
    981   return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
    982                                                (__mmask32)__U);
    983 }
    984 
    /* Zero-masked packed reciprocal: zero passthrough with the caller's mask
     * __U. */
    985 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    986 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
    987   return (__m512h)__builtin_ia32_rcpph512_mask(
    988       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
    989 }
    990 
    /* Packed reciprocal square root: calls __builtin_ia32_rsqrtph512_mask on __A
     * with an undefined passthrough and an all-ones mask. */
    991 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
    992   return (__m512h)__builtin_ia32_rsqrtph512_mask(
    993       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
    994 }
    995 
    /* Merge-masked packed reciprocal square root: __W is the passthrough operand
     * and __U the mask given to __builtin_ia32_rsqrtph512_mask. */
    996 static __inline__ __m512h __DEFAULT_FN_ATTRS512
    997 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
    998   return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
    999                                                  (__mmask32)__U);
   1000 }
   1001 
   /* Zero-masked packed reciprocal square root: zero passthrough with the
    * caller's mask __U. */
   1002 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1003 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
   1004   return (__m512h)__builtin_ia32_rsqrtph512_mask(
   1005       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
   1006 }
   1007 
   /* Packed getmant using _MM_FROUND_CUR_DIRECTION. B and C are packed into one
    * immediate as ((C) << 2) | (B) before the call to
    * __builtin_ia32_getmantph512_mask (presumably B is the normalization
    * interval and C the sign control - confirm). Unmasked form: undefined
    * passthrough and an all-ones mask. */
   1008 #define _mm512_getmant_ph(A, B, C)                                             \
   1009   ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
   1010       (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
   1011       (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
   1012       _MM_FROUND_CUR_DIRECTION))
   1013 
   /* Merge-masked form: W is the passthrough operand and U the mask. */
   1014 #define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
   1015   ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
   1016       (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
   1017       (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
   1018 
   /* Zero-masked form: zero passthrough combined with mask U. */
   1019 #define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
   1020   ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
   1021       (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
   1022       (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
   1023 
   /* Packed getmant with a caller-supplied last argument R. The immediate is
    * built as ((C) << 2) | (B), as in the non-round variants. Unmasked form:
    * undefined passthrough and an all-ones mask. */
   1024 #define _mm512_getmant_round_ph(A, B, C, R)                                    \
   1025   ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
   1026       (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
   1027       (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
   1028 
   /* Merge-masked form: W is the passthrough operand and U the mask. */
   1029 #define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
   1030   ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
   1031       (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
   1032       (__mmask32)(U), (int)(R)))
   1033 
   /* Zero-masked form: zero passthrough combined with mask U. */
   1034 #define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
   1035   ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
   1036       (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
   1037       (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
   1038 
   /* Packed getexp: calls __builtin_ia32_getexpph512_mask on __A with an
    * undefined passthrough, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
   1039 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
   1040   return (__m512h)__builtin_ia32_getexpph512_mask(
   1041       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
   1042       _MM_FROUND_CUR_DIRECTION);
   1043 }
   1044 
   /* Merge-masked packed getexp: __W is the passthrough operand and __U the
    * mask given to __builtin_ia32_getexpph512_mask. */
   1045 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1046 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
   1047   return (__m512h)__builtin_ia32_getexpph512_mask(
   1048       (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1049 }
   1050 
   /* Zero-masked packed getexp: zero passthrough with the caller's mask __U. */
   1051 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1052 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
   1053   return (__m512h)__builtin_ia32_getexpph512_mask(
   1054       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
   1055       _MM_FROUND_CUR_DIRECTION);
   1056 }
   1057 
   /* Packed getexp with a caller-supplied last argument R in place of
    * _MM_FROUND_CUR_DIRECTION. Unmasked form: undefined passthrough and an
    * all-ones mask. */
   1058 #define _mm512_getexp_round_ph(A, R)                                           \
   1059   ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
   1060                                             (__v32hf)_mm512_undefined_ph(),    \
   1061                                             (__mmask32)-1, (int)(R)))
   1062 
   /* Merge-masked form: W is the passthrough operand and U the mask. */
   1063 #define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
   1064   ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
   1065       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
   1066 
   /* Zero-masked form: zero passthrough combined with mask U. */
   1067 #define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
   1068   ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
   1069                                             (__v32hf)_mm512_setzero_ph(),      \
   1070                                             (__mmask32)(U), (int)(R)))
   1071 
   /* Packed scalef: calls __builtin_ia32_scalefph512_mask on __A and __B with an
    * undefined passthrough, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
   1072 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
   1073                                                                  __m512h __B) {
   1074   return (__m512h)__builtin_ia32_scalefph512_mask(
   1075       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
   1076       _MM_FROUND_CUR_DIRECTION);
   1077 }
   1078 
   /* Merge-masked packed scalef: __W is the passthrough operand and __U the
    * mask given to __builtin_ia32_scalefph512_mask. */
   1079 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1080 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
   1081   return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
   1082                                                   (__v32hf)__W, (__mmask32)__U,
   1083                                                   _MM_FROUND_CUR_DIRECTION);
   1084 }
   1085 
   /* Zero-masked packed scalef: zero passthrough with the caller's mask __U. */
   1086 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1087 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
   1088   return (__m512h)__builtin_ia32_scalefph512_mask(
   1089       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
   1090       _MM_FROUND_CUR_DIRECTION);
   1091 }
   1092 
   /* Packed scalef with a caller-supplied last argument R in place of
    * _MM_FROUND_CUR_DIRECTION. Unmasked form: undefined passthrough and an
    * all-ones mask. */
   1093 #define _mm512_scalef_round_ph(A, B, R)                                        \
   1094   ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
   1095       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
   1096       (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
   1097 
   /* Merge-masked form: W is the passthrough operand and U the mask. */
   1098 #define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
   1099   ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
   1100       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
   1101       (__mmask32)(U), (int)(R)))
   1102 
   /* Zero-masked form: zero passthrough combined with mask U. */
   1103 #define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
   1104   ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
   1105       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
   1106       (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
   1107 
   /* Packed roundscale of A with immediate B, using _MM_FROUND_CUR_DIRECTION.
    * The mask is all ones, so the passthrough operand is never selected; it is
    * _mm512_undefined_ph(), matching _mm512_roundscale_round_ph, so that the
    * macro argument A is expanded (and therefore evaluated) only once. */
   1108 #define _mm512_roundscale_ph(A, B)                                             \
   1109   ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
   1110       (__v32hf)(__m512h)(A), (int)(B), (__v32hf)_mm512_undefined_ph(),         \
   1111       (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
   1112 
   /* Merge-masked packed roundscale using _MM_FROUND_CUR_DIRECTION. Note the
    * parameter naming: A is the passthrough vector, B the mask, C the input
    * vector and imm the immediate handed to __builtin_ia32_rndscaleph_mask. */
   1113 #define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
   1114   ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
   1115       (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
   1116       (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
   1117 
   /* Zero-masked form: A is the mask, B the input vector; the passthrough is a
    * zero vector. */
   1118 #define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
   1119   ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
   1120       (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
   1121       (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
   1122 
   /* Merge-masked form with a caller-supplied last argument R: A is the
    * passthrough vector, B the mask, C the input vector. */
   1123 #define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
   1124   ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
   1125                                            (__v32hf)(__m512h)(A),              \
   1126                                            (__mmask32)(B), (int)(R)))
   1127 
   /* Zero-masked form with a caller-supplied last argument R: A is the mask and
    * B the input vector. */
   1128 #define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
   1129   ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
   1130                                            (__v32hf)_mm512_setzero_ph(),       \
   1131                                            (__mmask32)(A), (int)(R)))
   1132 
   /* Unmasked form with a caller-supplied last argument R: undefined
    * passthrough and an all-ones mask. */
   1133 #define _mm512_roundscale_round_ph(A, imm, R)                                  \
   1134   ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
   1135                                            (__v32hf)_mm512_undefined_ph(),     \
   1136                                            (__mmask32)-1, (int)(R)))
   1137 
   /* Packed reduce of A with immediate imm through
    * __builtin_ia32_reduceph512_mask, using _MM_FROUND_CUR_DIRECTION. Unmasked
    * form: undefined passthrough and an all-ones mask. */
   1138 #define _mm512_reduce_ph(A, imm)                                               \
   1139   ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
   1140       (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
   1141       (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
   1142 
   /* Merge-masked form: W is the passthrough operand and U the mask. */
   1143 #define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
   1144   ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
   1145       (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
   1146       (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
   1147 
   /* Zero-masked form: zero passthrough combined with mask U. */
   1148 #define _mm512_maskz_reduce_ph(U, A, imm)                                      \
   1149   ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
   1150       (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
   1151       (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
   1152 
   /* Merge-masked form with a caller-supplied last argument R. */
   1153 #define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
   1154   ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
   1155                                             (__v32hf)(__m512h)(W),             \
   1156                                             (__mmask32)(U), (int)(R)))
   1157 
   /* Zero-masked form with a caller-supplied last argument R. */
   1158 #define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
   1159   ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
   1160                                             (__v32hf)_mm512_setzero_ph(),      \
   1161                                             (__mmask32)(U), (int)(R)))
   1162 
   /* Unmasked form with a caller-supplied last argument R: undefined
    * passthrough and an all-ones mask. */
   1163 #define _mm512_reduce_round_ph(A, imm, R)                                      \
   1164   ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
   1165                                             (__v32hf)_mm512_undefined_ph(),    \
   1166                                             (__mmask32)-1, (int)(R)))
   1167 
   /* Scalar reciprocal: calls __builtin_ia32_rcpsh_mask on __A and __B with a
    * zero passthrough and an all-ones mask. */
   1168 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
   1169                                                            __m128h __B) {
   1170   return (__m128h)__builtin_ia32_rcpsh_mask(
   1171       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
   1172 }
   1173 
   /* Merge-masked scalar reciprocal: __W is the passthrough operand and __U the
    * mask given to __builtin_ia32_rcpsh_mask. */
   1174 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
   1175                                                                 __mmask8 __U,
   1176                                                                 __m128h __A,
   1177                                                                 __m128h __B) {
   1178   return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
   1179                                             (__v8hf)__W, (__mmask8)__U);
   1180 }
   1181 
   /* Zero-masked scalar reciprocal: zero passthrough with the caller's mask
    * __U. */
   1182 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
   1183                                                                  __m128h __A,
   1184                                                                  __m128h __B) {
   1185   return (__m128h)__builtin_ia32_rcpsh_mask(
   1186       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
   1187 }
   1188 
   1189 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
   1190                                                              __m128h __B) {
   1191   return (__m128h)__builtin_ia32_rsqrtsh_mask(
   1192       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
   1193 }
   1194 
   1195 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
   1196                                                                   __mmask8 __U,
   1197                                                                   __m128h __A,
   1198                                                                   __m128h __B) {
   1199   return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
   1200                                               (__v8hf)__W, (__mmask8)__U);
   1201 }
   1202 
   1203 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   1204 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
   1205   return (__m128h)__builtin_ia32_rsqrtsh_mask(
   1206       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
   1207 }
   1208 
   1209 #define _mm_getmant_round_sh(A, B, C, D, R)                                    \
   1210   ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
   1211       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
   1212       (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
   1213 
   1214 #define _mm_getmant_sh(A, B, C, D)                                             \
   1215   ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
   1216       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
   1217       (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
   1218 
   1219 #define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
   1220   ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
   1221       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
   1222       (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
   1223 
   1224 #define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
   1225   ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
   1226       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
   1227       (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
   1228 
   1229 #define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
   1230   ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
   1231       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
   1232       (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
   1233 
   1234 #define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
   1235   ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
   1236       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
   1237       (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   1238 
   1239 #define _mm_getexp_round_sh(A, B, R)                                           \
   1240   ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
   1241       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1242       (__mmask8)-1, (int)(R)))
   1243 
   1244 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
   1245                                                               __m128h __B) {
   1246   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
   1247       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
   1248       _MM_FROUND_CUR_DIRECTION);
   1249 }
   1250 
   1251 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   1252 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
   1253   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
   1254       (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
   1255       _MM_FROUND_CUR_DIRECTION);
   1256 }
   1257 
   1258 #define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
   1259   ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
   1260       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1261       (__mmask8)(U), (int)(R)))
   1262 
   1263 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   1264 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
   1265   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
   1266       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   1267       _MM_FROUND_CUR_DIRECTION);
   1268 }
   1269 
   1270 #define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
   1271   ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
   1272       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1273       (__mmask8)(U), (int)(R)))
   1274 
   1275 #define _mm_scalef_round_sh(A, B, R)                                           \
   1276   ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
   1277       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1278       (__mmask8)-1, (int)(R)))
   1279 
   1280 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
   1281                                                               __m128h __B) {
   1282   return (__m128h)__builtin_ia32_scalefsh_round_mask(
   1283       (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
   1284       _MM_FROUND_CUR_DIRECTION);
   1285 }
   1286 
   1287 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   1288 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
   1289   return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
   1290                                                      (__v8hf)__W, (__mmask8)__U,
   1291                                                      _MM_FROUND_CUR_DIRECTION);
   1292 }
   1293 
   1294 #define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
   1295   ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
   1296       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1297       (__mmask8)(U), (int)(R)))
   1298 
   1299 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   1300 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
   1301   return (__m128h)__builtin_ia32_scalefsh_round_mask(
   1302       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   1303       _MM_FROUND_CUR_DIRECTION);
   1304 }
   1305 
   1306 #define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
   1307   ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
   1308       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1309       (__mmask8)(U), (int)(R)))
   1310 
   1311 #define _mm_roundscale_round_sh(A, B, imm, R)                                  \
   1312   ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
   1313       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1314       (__mmask8)-1, (int)(imm), (int)(R)))
   1315 
   1316 #define _mm_roundscale_sh(A, B, imm)                                           \
   1317   ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
   1318       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1319       (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
   1320 
   1321 #define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
   1322   ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
   1323       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1324       (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
   1325 
   1326 #define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
   1327   ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
   1328       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1329       (__mmask8)(U), (int)(I), (int)(R)))
   1330 
   1331 #define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
   1332   ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
   1333       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1334       (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
   1335 
   1336 #define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
   1337   ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
   1338       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1339       (__mmask8)(U), (int)(I), (int)(R)))
   1340 
   1341 #define _mm_reduce_sh(A, B, C)                                                 \
   1342   ((__m128h)__builtin_ia32_reducesh_mask(                                      \
   1343       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1344       (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
   1345 
   1346 #define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
   1347   ((__m128h)__builtin_ia32_reducesh_mask(                                      \
   1348       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1349       (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
   1350 
   1351 #define _mm_maskz_reduce_sh(U, A, B, C)                                        \
   1352   ((__m128h)__builtin_ia32_reducesh_mask(                                      \
   1353       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1354       (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
   1355 
   1356 #define _mm_reduce_round_sh(A, B, C, R)                                        \
   1357   ((__m128h)__builtin_ia32_reducesh_mask(                                      \
   1358       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1359       (__mmask8)-1, (int)(C), (int)(R)))
   1360 
   1361 #define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
   1362   ((__m128h)__builtin_ia32_reducesh_mask(                                      \
   1363       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1364       (__mmask8)(U), (int)(C), (int)(R)))
   1365 
   1366 #define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
   1367   ((__m128h)__builtin_ia32_reducesh_mask(                                      \
   1368       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1369       (__mmask8)(U), (int)(C), (int)(R)))
   1370 
   1371 #define _mm512_sqrt_round_ph(A, R)                                             \
   1372   ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
   1373 
   1374 #define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
   1375   ((__m512h)__builtin_ia32_selectph_512(                                       \
   1376       (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
   1377       (__v32hf)(__m512h)(W)))
   1378 
   1379 #define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
   1380   ((__m512h)__builtin_ia32_selectph_512(                                       \
   1381       (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
   1382       (__v32hf)_mm512_setzero_ph()))
   1383 
   1384 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
   1385   return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
   1386                                            _MM_FROUND_CUR_DIRECTION);
   1387 }
   1388 
   1389 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1390 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
   1391   return (__m512h)__builtin_ia32_selectph_512(
   1392       (__mmask32)(__U),
   1393       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
   1394       (__v32hf)(__m512h)(__W));
   1395 }
   1396 
   1397 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1398 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
   1399   return (__m512h)__builtin_ia32_selectph_512(
   1400       (__mmask32)(__U),
   1401       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
   1402       (__v32hf)_mm512_setzero_ph());
   1403 }
   1404 
   1405 #define _mm_sqrt_round_sh(A, B, R)                                             \
   1406   ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
   1407       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1408       (__mmask8)-1, (int)(R)))
   1409 
   1410 #define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
   1411   ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
   1412       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
   1413       (__mmask8)(U), (int)(R)))
   1414 
   1415 #define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
   1416   ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
   1417       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
   1418       (__mmask8)(U), (int)(R)))
   1419 
   1420 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
   1421                                                             __m128h __B) {
   1422   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
   1423       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
   1424       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
   1425 }
   1426 
   1427 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
   1428                                                                  __mmask32 __U,
   1429                                                                  __m128h __A,
   1430                                                                  __m128h __B) {
   1431   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
   1432       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
   1433       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
   1434 }
   1435 
   1436 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
   1437                                                                   __m128h __A,
   1438                                                                   __m128h __B) {
   1439   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
   1440       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
   1441       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
   1442 }
   1443 
   1444 #define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
   1445   ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
   1446                                                (int)(imm), (__mmask32)(U)))
   1447 
   1448 #define _mm512_fpclass_ph_mask(A, imm)                                         \
   1449   ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
   1450                                                (int)(imm), (__mmask32)-1))
   1451 
   1452 #define _mm_fpclass_sh_mask(A, imm)                                            \
   1453   ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
   1454                                            (__mmask8)-1))
   1455 
   1456 #define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
   1457   ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
   1458                                            (__mmask8)(U)))
   1459 
   1460 #define _mm512_cvt_roundpd_ph(A, R)                                            \
   1461   ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
   1462       (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
   1463 
   1464 #define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
   1465   ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
   1466                                              (__mmask8)(U), (int)(R)))
   1467 
   1468 #define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
   1469   ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
   1470       (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   1471 
   1472 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
   1473   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
   1474       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
   1475       _MM_FROUND_CUR_DIRECTION);
   1476 }
   1477 
   1478 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   1479 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
   1480   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
   1481       (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   1482 }
   1483 
   1484 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   1485 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
   1486   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
   1487       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   1488       _MM_FROUND_CUR_DIRECTION);
   1489 }
   1490 
   1491 #define _mm512_cvt_roundph_pd(A, R)                                            \
   1492   ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
   1493       (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
   1494 
   1495 #define _mm512_mask_cvt_roundph_pd(W, U, A, R)                                 \
   1496   ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W),         \
   1497                                              (__mmask8)(U), (int)(R)))
   1498 
   1499 #define _mm512_maskz_cvt_roundph_pd(U, A, R)                                   \
   1500   ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
   1501       (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
   1502 
   1503 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
   1504   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
   1505       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
   1506       _MM_FROUND_CUR_DIRECTION);
   1507 }
   1508 
   1509 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1510 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
   1511   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
   1512       (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   1513 }
   1514 
   1515 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1516 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
   1517   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
   1518       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
   1519       _MM_FROUND_CUR_DIRECTION);
   1520 }
   1521 
   1522 #define _mm_cvt_roundsh_ss(A, B, R)                                            \
   1523   ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
   1524                                                (__v4sf)_mm_undefined_ps(),     \
   1525                                                (__mmask8)(-1), (int)(R)))
   1526 
   1527 #define _mm_mask_cvt_roundsh_ss(W, U, A, B, R)                                 \
   1528   ((__m128)__builtin_ia32_vcvtsh2ss_round_mask(                                \
   1529       (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
   1530 
   1531 #define _mm_maskz_cvt_roundsh_ss(U, A, B, R)                                   \
   1532   ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
   1533                                                (__v4sf)_mm_setzero_ps(),       \
   1534                                                (__mmask8)(U), (int)(R)))
   1535 
   1536 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
   1537                                                             __m128h __B) {
   1538   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
   1539       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
   1540       _MM_FROUND_CUR_DIRECTION);
   1541 }
   1542 
   1543 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
   1544                                                                  __mmask8 __U,
   1545                                                                  __m128 __A,
   1546                                                                  __m128h __B) {
   1547   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
   1548                                                      (__v4sf)__W, (__mmask8)__U,
   1549                                                      _MM_FROUND_CUR_DIRECTION);
   1550 }
   1551 
   1552 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
   1553                                                                   __m128 __A,
   1554                                                                   __m128h __B) {
   1555   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
   1556       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
   1557       _MM_FROUND_CUR_DIRECTION);
   1558 }
   1559 
   1560 #define _mm_cvt_roundss_sh(A, B, R)                                            \
   1561   ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
   1562                                                 (__v8hf)_mm_undefined_ph(),    \
   1563                                                 (__mmask8)(-1), (int)(R)))
   1564 
   1565 #define _mm_mask_cvt_roundss_sh(W, U, A, B, R)                                 \
   1566   ((__m128h)__builtin_ia32_vcvtss2sh_round_mask(                               \
   1567       (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
   1568 
   1569 #define _mm_maskz_cvt_roundss_sh(U, A, B, R)                                   \
   1570   ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
   1571                                                 (__v8hf)_mm_setzero_ph(),      \
   1572                                                 (__mmask8)(U), (int)(R)))
   1573 
   1574 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
   1575                                                              __m128 __B) {
   1576   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
   1577       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
   1578       _MM_FROUND_CUR_DIRECTION);
   1579 }
   1580 
   1581 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
   1582                                                                   __mmask8 __U,
   1583                                                                   __m128h __A,
   1584                                                                   __m128 __B) {
   1585   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
   1586       (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
   1587       _MM_FROUND_CUR_DIRECTION);
   1588 }
   1589 
   1590 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
   1591                                                                    __m128h __A,
   1592                                                                    __m128 __B) {
   1593   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
   1594       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   1595       _MM_FROUND_CUR_DIRECTION);
   1596 }
   1597 
   1598 #define _mm_cvt_roundsd_sh(A, B, R)                                            \
   1599   ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
   1600                                                 (__v8hf)_mm_undefined_ph(),    \
   1601                                                 (__mmask8)(-1), (int)(R)))
   1602 
   1603 #define _mm_mask_cvt_roundsd_sh(W, U, A, B, R)                                 \
   1604   ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask(                               \
   1605       (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
   1606 
   1607 #define _mm_maskz_cvt_roundsd_sh(U, A, B, R)                                   \
   1608   ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
   1609                                                 (__v8hf)_mm_setzero_ph(),      \
   1610                                                 (__mmask8)(U), (int)(R)))
   1611 
   1612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
   1613                                                              __m128d __B) {
   1614   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
   1615       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
   1616       _MM_FROUND_CUR_DIRECTION);
   1617 }
   1618 
   1619 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
   1620                                                                   __mmask8 __U,
   1621                                                                   __m128h __A,
   1622                                                                   __m128d __B) {
   1623   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
   1624       (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
   1625       _MM_FROUND_CUR_DIRECTION);
   1626 }
   1627 
   1628 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   1629 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
   1630   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
   1631       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   1632       _MM_FROUND_CUR_DIRECTION);
   1633 }
   1634 
   1635 #define _mm_cvt_roundsh_sd(A, B, R)                                            \
   1636   ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
   1637                                                 (__v2df)_mm_undefined_pd(),    \
   1638                                                 (__mmask8)(-1), (int)(R)))
   1639 
   1640 #define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
   1641   ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
   1642       (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
   1643 
   1644 #define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
   1645   ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
   1646                                                 (__v2df)_mm_setzero_pd(),      \
   1647                                                 (__mmask8)(U), (int)(R)))
   1648 
   1649 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
   1650                                                              __m128h __B) {
   1651   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
   1652       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
   1653       _MM_FROUND_CUR_DIRECTION);
   1654 }
   1655 
   1656 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
   1657                                                                   __mmask8 __U,
   1658                                                                   __m128d __A,
   1659                                                                   __m128h __B) {
   1660   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
   1661       (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
   1662       _MM_FROUND_CUR_DIRECTION);
   1663 }
   1664 
   1665 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1666 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
   1667   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
   1668       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
   1669       _MM_FROUND_CUR_DIRECTION);
   1670 }
   1671 
   1672 #define _mm512_cvt_roundph_epi16(A, R)                                         \
   1673   ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
   1674                                             (__v32hi)_mm512_undefined_epi32(), \
   1675                                             (__mmask32)(-1), (int)(R)))
   1676 
   1677 #define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
   1678   ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
   1679                                             (__mmask32)(U), (int)(R)))
   1680 
   1681 #define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
   1682   ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
   1683                                             (__v32hi)_mm512_setzero_epi32(),   \
   1684                                             (__mmask32)(U), (int)(R)))
   1685 
   1686 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1687 _mm512_cvtph_epi16(__m512h __A) {
   1688   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
   1689       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
   1690       _MM_FROUND_CUR_DIRECTION);
   1691 }
   1692 
   1693 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1694 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
   1695   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
   1696       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1697 }
   1698 
   1699 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1700 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
   1701   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
   1702       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
   1703       _MM_FROUND_CUR_DIRECTION);
   1704 }
   1705 
   1706 #define _mm512_cvtt_roundph_epi16(A, R)                                        \
   1707   ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
   1708       (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
   1709       (int)(R)))
   1710 
   1711 #define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
   1712   ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
   1713                                              (__mmask32)(U), (int)(R)))
   1714 
   1715 #define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
   1716   ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
   1717                                              (__v32hi)_mm512_setzero_epi32(),  \
   1718                                              (__mmask32)(U), (int)(R)))
   1719 
   1720 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1721 _mm512_cvttph_epi16(__m512h __A) {
   1722   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
   1723       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
   1724       _MM_FROUND_CUR_DIRECTION);
   1725 }
   1726 
   1727 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1728 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
   1729   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
   1730       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1731 }
   1732 
   1733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1734 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
   1735   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
   1736       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
   1737       _MM_FROUND_CUR_DIRECTION);
   1738 }
   1739 
   1740 #define _mm512_cvt_roundepi16_ph(A, R)                                         \
   1741   ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
   1742                                             (__v32hf)_mm512_undefined_ph(),    \
   1743                                             (__mmask32)(-1), (int)(R)))
   1744 
   1745 #define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
   1746   ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
   1747                                             (__mmask32)(U), (int)(R)))
   1748 
   1749 #define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
   1750   ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
   1751       (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
   1752 
   1753 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1754 _mm512_cvtepi16_ph(__m512i __A) {
   1755   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
   1756       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
   1757       _MM_FROUND_CUR_DIRECTION);
   1758 }
   1759 
   1760 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1761 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
   1762   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
   1763       (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1764 }
   1765 
   1766 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1767 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
   1768   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
   1769       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
   1770       _MM_FROUND_CUR_DIRECTION);
   1771 }
   1772 
   1773 #define _mm512_cvt_roundph_epu16(A, R)                                         \
   1774   ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
   1775       (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
   1776       (int)(R)))
   1777 
   1778 #define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
   1779   ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
   1780                                              (__mmask32)(U), (int)(R)))
   1781 
   1782 #define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
   1783   ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
   1784                                              (__v32hu)_mm512_setzero_epi32(),  \
   1785                                              (__mmask32)(U), (int)(R)))
   1786 
   1787 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1788 _mm512_cvtph_epu16(__m512h __A) {
   1789   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
   1790       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
   1791       _MM_FROUND_CUR_DIRECTION);
   1792 }
   1793 
   1794 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1795 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
   1796   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
   1797       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1798 }
   1799 
   1800 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1801 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
   1802   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
   1803       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
   1804       _MM_FROUND_CUR_DIRECTION);
   1805 }
   1806 
   1807 #define _mm512_cvtt_roundph_epu16(A, R)                                        \
   1808   ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
   1809       (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
   1810       (int)(R)))
   1811 
   1812 #define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
   1813   ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
   1814                                               (__mmask32)(U), (int)(R)))
   1815 
   1816 #define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
   1817   ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
   1818                                               (__v32hu)_mm512_setzero_epi32(), \
   1819                                               (__mmask32)(U), (int)(R)))
   1820 
   1821 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1822 _mm512_cvttph_epu16(__m512h __A) {
   1823   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
   1824       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
   1825       _MM_FROUND_CUR_DIRECTION);
   1826 }
   1827 
   1828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1829 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
   1830   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
   1831       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1832 }
   1833 
   1834 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1835 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
   1836   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
   1837       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
   1838       _MM_FROUND_CUR_DIRECTION);
   1839 }
   1840 
   1841 #define _mm512_cvt_roundepu16_ph(A, R)                                         \
   1842   ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
   1843                                              (__v32hf)_mm512_undefined_ph(),   \
   1844                                              (__mmask32)(-1), (int)(R)))
   1845 
   1846 #define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
   1847   ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
   1848                                              (__mmask32)(U), (int)(R)))
   1849 
   1850 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
   1851   ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
   1852       (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
   1853 
   1854 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1855 _mm512_cvtepu16_ph(__m512i __A) {
   1856   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
   1857       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
   1858       _MM_FROUND_CUR_DIRECTION);
   1859 }
   1860 
   1861 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1862 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
   1863   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
   1864       (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
   1865 }
   1866 
   1867 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   1868 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
   1869   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
   1870       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
   1871       _MM_FROUND_CUR_DIRECTION);
   1872 }
   1873 
   1874 #define _mm512_cvt_roundph_epi32(A, R)                                         \
   1875   ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
   1876       (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
   1877       (int)(R)))
   1878 
   1879 #define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
   1880   ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
   1881                                              (__mmask16)(U), (int)(R)))
   1882 
   1883 #define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
   1884   ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
   1885                                              (__v16si)_mm512_setzero_epi32(),  \
   1886                                              (__mmask16)(U), (int)(R)))
   1887 
   1888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1889 _mm512_cvtph_epi32(__m256h __A) {
   1890   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
   1891       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
   1892       _MM_FROUND_CUR_DIRECTION);
   1893 }
   1894 
   1895 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1896 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
   1897   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
   1898       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   1899 }
   1900 
   1901 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1902 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
   1903   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
   1904       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
   1905       _MM_FROUND_CUR_DIRECTION);
   1906 }
   1907 
   1908 #define _mm512_cvt_roundph_epu32(A, R)                                         \
   1909   ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
   1910       (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
   1911       (int)(R)))
   1912 
   1913 #define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
   1914   ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
   1915                                               (__mmask16)(U), (int)(R)))
   1916 
   1917 #define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
   1918   ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
   1919                                               (__v16su)_mm512_setzero_epi32(), \
   1920                                               (__mmask16)(U), (int)(R)))
   1921 
   1922 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1923 _mm512_cvtph_epu32(__m256h __A) {
   1924   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
   1925       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
   1926       _MM_FROUND_CUR_DIRECTION);
   1927 }
   1928 
   1929 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1930 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
   1931   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
   1932       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   1933 }
   1934 
   1935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1936 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
   1937   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
   1938       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
   1939       _MM_FROUND_CUR_DIRECTION);
   1940 }
   1941 
   1942 #define _mm512_cvt_roundepi32_ph(A, R)                                         \
   1943   ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
   1944                                              (__v16hf)_mm256_undefined_ph(),   \
   1945                                              (__mmask16)(-1), (int)(R)))
   1946 
   1947 #define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
   1948   ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
   1949                                              (__mmask16)(U), (int)(R)))
   1950 
   1951 #define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
   1952   ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
   1953       (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
   1954 
   1955 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   1956 _mm512_cvtepi32_ph(__m512i __A) {
   1957   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
   1958       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
   1959       _MM_FROUND_CUR_DIRECTION);
   1960 }
   1961 
   1962 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   1963 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
   1964   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
   1965       (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   1966 }
   1967 
   1968 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   1969 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
   1970   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
   1971       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
   1972       _MM_FROUND_CUR_DIRECTION);
   1973 }
   1974 
   1975 #define _mm512_cvt_roundepu32_ph(A, R)                                         \
   1976   ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
   1977                                               (__v16hf)_mm256_undefined_ph(),  \
   1978                                               (__mmask16)(-1), (int)(R)))
   1979 
   1980 #define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
   1981   ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
   1982                                               (__mmask16)(U), (int)(R)))
   1983 
   1984 #define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
   1985   ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
   1986       (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
   1987 
   1988 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   1989 _mm512_cvtepu32_ph(__m512i __A) {
   1990   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
   1991       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
   1992       _MM_FROUND_CUR_DIRECTION);
   1993 }
   1994 
   1995 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   1996 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
   1997   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
   1998       (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   1999 }
   2000 
   2001 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   2002 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
   2003   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
   2004       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
   2005       _MM_FROUND_CUR_DIRECTION);
   2006 }
   2007 
   2008 #define _mm512_cvtt_roundph_epi32(A, R)                                        \
   2009   ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
   2010       (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
   2011       (int)(R)))
   2012 
   2013 #define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
   2014   ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
   2015                                               (__mmask16)(U), (int)(R)))
   2016 
   2017 #define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
   2018   ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
   2019                                               (__v16si)_mm512_setzero_epi32(), \
   2020                                               (__mmask16)(U), (int)(R)))
   2021 
   2022 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2023 _mm512_cvttph_epi32(__m256h __A) {
   2024   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
   2025       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
   2026       _MM_FROUND_CUR_DIRECTION);
   2027 }
   2028 
   2029 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2030 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
   2031   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
   2032       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   2033 }
   2034 
   2035 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2036 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
   2037   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
   2038       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
   2039       _MM_FROUND_CUR_DIRECTION);
   2040 }
   2041 
   2042 #define _mm512_cvtt_roundph_epu32(A, R)                                        \
   2043   ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
   2044       (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
   2045       (int)(R)))
   2046 
   2047 #define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
   2048   ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
   2049                                                (__mmask16)(U), (int)(R)))
   2050 
   2051 #define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
   2052   ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
   2053       (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
   2054       (int)(R)))
   2055 
   2056 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2057 _mm512_cvttph_epu32(__m256h __A) {
   2058   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
   2059       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
   2060       _MM_FROUND_CUR_DIRECTION);
   2061 }
   2062 
   2063 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2064 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
   2065   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
   2066       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   2067 }
   2068 
   2069 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2070 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
   2071   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
   2072       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
   2073       _MM_FROUND_CUR_DIRECTION);
   2074 }
   2075 
   2076 #define _mm512_cvt_roundepi64_ph(A, R)                                         \
   2077   ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
   2078       (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
   2079 
   2080 #define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
   2081   ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
   2082                                              (__mmask8)(U), (int)(R)))
   2083 
   2084 #define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
   2085   ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
   2086       (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   2087 
   2088 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   2089 _mm512_cvtepi64_ph(__m512i __A) {
   2090   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
   2091       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
   2092       _MM_FROUND_CUR_DIRECTION);
   2093 }
   2094 
   2095 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   2096 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
   2097   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
   2098       (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2099 }
   2100 
   2101 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   2102 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
   2103   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
   2104       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   2105       _MM_FROUND_CUR_DIRECTION);
   2106 }
   2107 
   2108 #define _mm512_cvt_roundph_epi64(A, R)                                         \
   2109   ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
   2110                                              (__v8di)_mm512_undefined_epi32(), \
   2111                                              (__mmask8)(-1), (int)(R)))
   2112 
   2113 #define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
   2114   ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
   2115                                              (__mmask8)(U), (int)(R)))
   2116 
   2117 #define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
   2118   ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
   2119       (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
   2120 
   2121 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2122 _mm512_cvtph_epi64(__m128h __A) {
   2123   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
   2124       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
   2125       _MM_FROUND_CUR_DIRECTION);
   2126 }
   2127 
   2128 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2129 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
   2130   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
   2131       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2132 }
   2133 
   2134 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2135 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
   2136   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
   2137       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
   2138       _MM_FROUND_CUR_DIRECTION);
   2139 }
   2140 
   2141 #define _mm512_cvt_roundepu64_ph(A, R)                                         \
   2142   ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
   2143       (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
   2144 
   2145 #define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
   2146   ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
   2147                                               (__mmask8)(U), (int)(R)))
   2148 
   2149 #define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
   2150   ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
   2151       (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   2152 
   2153 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   2154 _mm512_cvtepu64_ph(__m512i __A) {
   2155   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
   2156       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
   2157       _MM_FROUND_CUR_DIRECTION);
   2158 }
   2159 
   2160 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   2161 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
   2162   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
   2163       (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2164 }
   2165 
   2166 static __inline__ __m128h __DEFAULT_FN_ATTRS512
   2167 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
   2168   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
   2169       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
   2170       _MM_FROUND_CUR_DIRECTION);
   2171 }
   2172 
   2173 #define _mm512_cvt_roundph_epu64(A, R)                                         \
   2174   ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
   2175       (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
   2176       (int)(R)))
   2177 
   2178 #define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
   2179   ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
   2180                                               (__mmask8)(U), (int)(R)))
   2181 
   2182 #define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
   2183   ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
   2184       (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
   2185 
   2186 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2187 _mm512_cvtph_epu64(__m128h __A) {
   2188   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
   2189       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
   2190       _MM_FROUND_CUR_DIRECTION);
   2191 }
   2192 
   2193 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2194 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
   2195   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
   2196       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2197 }
   2198 
   2199 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2200 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
   2201   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
   2202       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
   2203       _MM_FROUND_CUR_DIRECTION);
   2204 }
   2205 
   2206 #define _mm512_cvtt_roundph_epi64(A, R)                                        \
   2207   ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
   2208       (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
   2209       (int)(R)))
   2210 
   2211 #define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
   2212   ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
   2213                                               (__mmask8)(U), (int)(R)))
   2214 
   2215 #define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
   2216   ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
   2217       (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
   2218 
   2219 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2220 _mm512_cvttph_epi64(__m128h __A) {
   2221   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
   2222       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
   2223       _MM_FROUND_CUR_DIRECTION);
   2224 }
   2225 
   2226 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2227 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
   2228   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
   2229       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2230 }
   2231 
   2232 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2233 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
   2234   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
   2235       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
   2236       _MM_FROUND_CUR_DIRECTION);
   2237 }
   2238 
   2239 #define _mm512_cvtt_roundph_epu64(A, R)                                        \
   2240   ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
   2241       (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
   2242       (int)(R)))
   2243 
   2244 #define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
   2245   ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
   2246                                                (__mmask8)(U), (int)(R)))
   2247 
   2248 #define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
   2249   ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
   2250       (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
   2251 
   2252 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2253 _mm512_cvttph_epu64(__m128h __A) {
   2254   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
   2255       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
   2256       _MM_FROUND_CUR_DIRECTION);
   2257 }
   2258 
   2259 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2260 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
   2261   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
   2262       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2263 }
   2264 
   2265 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   2266 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
   2267   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
   2268       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
   2269       _MM_FROUND_CUR_DIRECTION);
   2270 }
   2271 
   2272 #define _mm_cvt_roundsh_i32(A, R)                                              \
   2273   ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
   2274 
   2275 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
   2276   return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
   2277 }
   2278 
   2279 #define _mm_cvt_roundsh_u32(A, R)                                              \
   2280   ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
   2281 
   2282 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
   2283 _mm_cvtsh_u32(__m128h __A) {
   2284   return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
   2285                                                    _MM_FROUND_CUR_DIRECTION);
   2286 }
   2287 
   2288 #ifdef __x86_64__
   2289 #define _mm_cvt_roundsh_i64(A, R)                                              \
   2290   ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
   2291 
   2292 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
   2293   return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
   2294                                                _MM_FROUND_CUR_DIRECTION);
   2295 }
   2296 
   2297 #define _mm_cvt_roundsh_u64(A, R)                                              \
   2298   ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
   2299 
   2300 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
   2301 _mm_cvtsh_u64(__m128h __A) {
   2302   return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
   2303       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
   2304 }
   2305 #endif // __x86_64__
   2306 
   2307 #define _mm_cvt_roundu32_sh(A, B, R)                                           \
   2308   ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
   2309 
   2310 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2311 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
   2312   __A[0] = __B;
   2313   return __A;
   2314 }
   2315 
   2316 #ifdef __x86_64__
   2317 #define _mm_cvt_roundu64_sh(A, B, R)                                           \
   2318   ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
   2319                                         (int)(R)))
   2320 
   2321 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2322 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
   2323   __A[0] = __B;
   2324   return __A;
   2325 }
   2326 #endif
   2327 
   /* Expands to __builtin_ia32_vcvtsi2sh(A, B, R): A is cast to __v8hf, B to int,
      R to int (rounding operand); the result is cast to __m128h. */
   2328 #define _mm_cvt_roundi32_sh(A, B, R)                                           \
   2329   ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
   2330 
   2331 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
   2332                                                               int __B) {
   2333   __A[0] = __B;
   2334   return __A;
   2335 }
   2336 
   2337 #ifdef __x86_64__
   /* Expands to __builtin_ia32_vcvtsi642sh(A, B, R): A is cast to __v8hf, B to
      long long, R to int (rounding operand); the result is cast to __m128h. */
   2338 #define _mm_cvt_roundi64_sh(A, B, R)                                           \
   2339   ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
   2340 
   2341 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
   2342                                                               long long __B) {
   2343   __A[0] = __B;
   2344   return __A;
   2345 }
   2346 #endif
   2347 
   /* Expands to __builtin_ia32_vcvttsh2si32 applied to A (cast to __v8hf) with R
      forwarded as the final int operand; the result is cast to int. */
   2348 #define _mm_cvtt_roundsh_i32(A, R)                                             \
   2349   ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
   2350 
   2351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
   2352   return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
   2353                                           _MM_FROUND_CUR_DIRECTION);
   2354 }
   2355 
   2356 #ifdef __x86_64__
   /* Expands to __builtin_ia32_vcvttsh2si64 applied to A (cast to __v8hf) with R
      forwarded as the final int operand; the result is cast to long long. */
   2357 #define _mm_cvtt_roundsh_i64(A, R)                                             \
   2358   ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
   2359 
   2360 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
   2361   return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
   2362                                                 _MM_FROUND_CUR_DIRECTION);
   2363 }
   2364 #endif
   2365 
   /* Expands to __builtin_ia32_vcvttsh2usi32 applied to A (cast to __v8hf) with R
      forwarded as the final int operand; the result is cast to unsigned int. */
   2366 #define _mm_cvtt_roundsh_u32(A, R)                                             \
   2367   ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
   2368 
   2369 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
   2370 _mm_cvttsh_u32(__m128h __A) {
   2371   return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
   2372                                                     _MM_FROUND_CUR_DIRECTION);
   2373 }
   2374 
   2375 #ifdef __x86_64__
   /* Expands to __builtin_ia32_vcvttsh2usi64 applied to A (cast to __v8hf) with R
      forwarded as the final int operand; the result is cast to
      unsigned long long. */
   2376 #define _mm_cvtt_roundsh_u64(A, R)                                             \
   2377   ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
   2378 
   2379 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
   2380 _mm_cvttsh_u64(__m128h __A) {
   2381   return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
   2382       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
   2383 }
   2384 #endif
   2385 
   /* Unmasked form: __builtin_ia32_vcvtph2psx512_mask on A (as __v16hf) with
      _mm512_undefined_ps() as second operand, an all-ones __mmask16 and R as
      the int rounding operand; result cast to __m512. */
   2386 #define _mm512_cvtx_roundph_ps(A, R)                                           \
   2387   ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
   2388                                              (__v16sf)_mm512_undefined_ps(),   \
   2389                                              (__mmask16)(-1), (int)(R)))
   2390 
   /* Masked form: same builtin, with W (as __v16sf) as second operand and U as
      the __mmask16 operand. */
   2391 #define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
   2392   ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
   2393                                              (__mmask16)(U), (int)(R)))
   2394 
   /* Zero-masked form: same builtin, with _mm512_setzero_ps() as second operand
      and U as the __mmask16 operand. */
   2395 #define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
   2396   ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
   2397       (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
   2398 
   2399 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
   2400   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
   2401       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
   2402       _MM_FROUND_CUR_DIRECTION);
   2403 }
   2404 
   2405 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2406 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
   2407   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
   2408       (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   2409 }
   2410 
   2411 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2412 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
   2413   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
   2414       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
   2415       _MM_FROUND_CUR_DIRECTION);
   2416 }
   2417 
   /* Unmasked form: __builtin_ia32_vcvtps2phx512_mask on A (as __v16sf) with
      _mm256_undefined_ph() as second operand, an all-ones __mmask16 and R as
      the int rounding operand; result cast to __m256h. */
   2418 #define _mm512_cvtx_roundps_ph(A, R)                                           \
   2419   ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
   2420                                               (__v16hf)_mm256_undefined_ph(),  \
   2421                                               (__mmask16)(-1), (int)(R)))
   2422 
   /* Masked form: same builtin, with W (as __v16hf) as second operand and U as
      the __mmask16 operand. */
   2423 #define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
   2424   ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
   2425                                               (__mmask16)(U), (int)(R)))
   2426 
   /* Zero-masked form: same builtin, with _mm256_setzero_ph() as second operand
      and U as the __mmask16 operand. */
   2427 #define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
   2428   ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
   2429       (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
   2430 
   2431 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
   2432   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
   2433       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
   2434       _MM_FROUND_CUR_DIRECTION);
   2435 }
   2436 
   2437 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   2438 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
   2439   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
   2440       (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
   2441 }
   2442 
   2443 static __inline__ __m256h __DEFAULT_FN_ATTRS512
   2444 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
   2445   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
   2446       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
   2447       _MM_FROUND_CUR_DIRECTION);
   2448 }
   2449 
   /* Packed FP16 fused multiply-add family with an explicit rounding operand R.
      Every variant below expands to one of the vfmaddph512 builtins (_mask,
      _mask3 or _maskz); the "sub" and "n" flavours are obtained purely by
      negating operands before the call, as noted per macro. */

   /* fmadd, unmasked: _mask builtin on (A, B, C) with an all-ones __mmask32. */
   2450 #define _mm512_fmadd_round_ph(A, B, C, R)                                      \
   2451   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2452       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2453       (__mmask32)-1, (int)(R)))
   2454 
   /* fmadd, masked: _mask builtin on (A, B, C) with mask U. */
   2455 #define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
   2456   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2457       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2458       (__mmask32)(U), (int)(R)))
   2459 
   /* fmadd, mask3: _mask3 builtin on (A, B, C) with mask U. */
   2460 #define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
   2461   ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
   2462       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2463       (__mmask32)(U), (int)(R)))
   2464 
   /* fmadd, maskz: _maskz builtin on (A, B, C) with mask U. */
   2465 #define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
   2466   ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
   2467       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2468       (__mmask32)(U), (int)(R)))
   2469 
   /* fmsub, unmasked: _mask builtin on (A, B, -C) with an all-ones mask. */
   2470 #define _mm512_fmsub_round_ph(A, B, C, R)                                      \
   2471   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2472       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
   2473       (__mmask32)-1, (int)(R)))
   2474 
   /* fmsub, masked: _mask builtin on (A, B, -C) with mask U. */
   2475 #define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
   2476   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2477       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
   2478       (__mmask32)(U), (int)(R)))
   2479 
   /* fmsub, maskz: _maskz builtin on (A, B, -C) with mask U. */
   2480 #define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
   2481   ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
   2482       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
   2483       (__mmask32)(U), (int)(R)))
   2484 
   /* fnmadd, unmasked: _mask builtin on (A, -B, C) with an all-ones mask. */
   2485 #define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
   2486   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2487       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
   2488       (__mmask32)-1, (int)(R)))
   2489 
   /* fnmadd, mask3: _mask3 builtin on (-A, B, C) with mask U (here the first
      operand is the one negated). */
   2490 #define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
   2491   ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
   2492       -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
   2493       (__mmask32)(U), (int)(R)))
   2494 
   /* fnmadd, maskz: _maskz builtin on (-A, B, C) with mask U. */
   2495 #define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
   2496   ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
   2497       -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
   2498       (__mmask32)(U), (int)(R)))
   2499 
   /* fnmsub, unmasked: _mask builtin on (A, -B, -C) with an all-ones mask. */
   2500 #define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
   2501   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2502       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
   2503       (__mmask32)-1, (int)(R)))
   2504 
   /* fnmsub, maskz: _maskz builtin on (-A, B, -C) with mask U. */
   2505 #define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
   2506   ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
   2507       -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
   2508       (__mmask32)(U), (int)(R)))
   2509 
   2510 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
   2511                                                                 __m512h __B,
   2512                                                                 __m512h __C) {
   2513   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
   2514                                                   (__v32hf)__C, (__mmask32)-1,
   2515                                                   _MM_FROUND_CUR_DIRECTION);
   2516 }
   2517 
   2518 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2519 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
   2520   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
   2521                                                   (__v32hf)__C, (__mmask32)__U,
   2522                                                   _MM_FROUND_CUR_DIRECTION);
   2523 }
   2524 
   2525 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2526 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
   2527   return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
   2528                                                    (__v32hf)__C, (__mmask32)__U,
   2529                                                    _MM_FROUND_CUR_DIRECTION);
   2530 }
   2531 
   2532 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2533 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
   2534   return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
   2535                                                    (__v32hf)__C, (__mmask32)__U,
   2536                                                    _MM_FROUND_CUR_DIRECTION);
   2537 }
   2538 
   2539 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
   2540                                                                 __m512h __B,
   2541                                                                 __m512h __C) {
   2542   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
   2543                                                   -(__v32hf)__C, (__mmask32)-1,
   2544                                                   _MM_FROUND_CUR_DIRECTION);
   2545 }
   2546 
   2547 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2548 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
   2549   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
   2550                                                   -(__v32hf)__C, (__mmask32)__U,
   2551                                                   _MM_FROUND_CUR_DIRECTION);
   2552 }
   2553 
   2554 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2555 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
   2556   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
   2557       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
   2558       _MM_FROUND_CUR_DIRECTION);
   2559 }
   2560 
   2561 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
   2562                                                                  __m512h __B,
   2563                                                                  __m512h __C) {
   2564   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
   2565                                                   (__v32hf)__C, (__mmask32)-1,
   2566                                                   _MM_FROUND_CUR_DIRECTION);
   2567 }
   2568 
   2569 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2570 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
   2571   return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
   2572                                                    (__v32hf)__C, (__mmask32)__U,
   2573                                                    _MM_FROUND_CUR_DIRECTION);
   2574 }
   2575 
   2576 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2577 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
   2578   return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
   2579                                                    (__v32hf)__C, (__mmask32)__U,
   2580                                                    _MM_FROUND_CUR_DIRECTION);
   2581 }
   2582 
   2583 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
   2584                                                                  __m512h __B,
   2585                                                                  __m512h __C) {
   2586   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
   2587                                                   -(__v32hf)__C, (__mmask32)-1,
   2588                                                   _MM_FROUND_CUR_DIRECTION);
   2589 }
   2590 
   2591 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2592 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
   2593   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
   2594       -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
   2595       _MM_FROUND_CUR_DIRECTION);
   2596 }
   2597 
   /* Packed FP16 fmaddsub / fmsubadd family with an explicit rounding operand R.
      All variants expand to a vfmaddsubph512 builtin (_mask, _mask3 or _maskz);
      the fmsubadd flavours reuse the same builtins with C negated. */

   /* fmaddsub, unmasked: _mask builtin on (A, B, C) with an all-ones mask. */
   2598 #define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
   2599   ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
   2600       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2601       (__mmask32)-1, (int)(R)))
   2602 
   /* fmaddsub, masked: _mask builtin on (A, B, C) with mask U. */
   2603 #define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
   2604   ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
   2605       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2606       (__mmask32)(U), (int)(R)))
   2607 
   /* fmaddsub, mask3: _mask3 builtin on (A, B, C) with mask U. */
   2608 #define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
   2609   ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
   2610       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2611       (__mmask32)(U), (int)(R)))
   2612 
   /* fmaddsub, maskz: _maskz builtin on (A, B, C) with mask U. */
   2613 #define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
   2614   ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
   2615       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2616       (__mmask32)(U), (int)(R)))
   2617 
   /* fmsubadd, unmasked: _mask builtin on (A, B, -C) with an all-ones mask. */
   2618 #define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
   2619   ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
   2620       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
   2621       (__mmask32)-1, (int)(R)))
   2622 
   /* fmsubadd, masked: _mask builtin on (A, B, -C) with mask U. */
   2623 #define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
   2624   ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
   2625       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
   2626       (__mmask32)(U), (int)(R)))
   2627 
   /* fmsubadd, maskz: _maskz builtin on (A, B, -C) with mask U. */
   2628 #define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
   2629   ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
   2630       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
   2631       (__mmask32)(U), (int)(R)))
   2632 
   2633 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2634 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
   2635   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
   2636       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
   2637       _MM_FROUND_CUR_DIRECTION);
   2638 }
   2639 
   2640 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2641 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
   2642   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
   2643       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
   2644       _MM_FROUND_CUR_DIRECTION);
   2645 }
   2646 
   2647 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2648 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
   2649   return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
   2650       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
   2651       _MM_FROUND_CUR_DIRECTION);
   2652 }
   2653 
   2654 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2655 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
   2656   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
   2657       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
   2658       _MM_FROUND_CUR_DIRECTION);
   2659 }
   2660 
   2661 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2662 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
   2663   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
   2664       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
   2665       _MM_FROUND_CUR_DIRECTION);
   2666 }
   2667 
   2668 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2669 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
   2670   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
   2671       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
   2672       _MM_FROUND_CUR_DIRECTION);
   2673 }
   2674 
   2675 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2676 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
   2677   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
   2678       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
   2679       _MM_FROUND_CUR_DIRECTION);
   2680 }
   2681 
   /* mask3 packed fmsub with rounding operand R: uses the dedicated
      __builtin_ia32_vfmsubph512_mask3 builtin on (A, B, C) with mask U, rather
      than the fmadd builtin with a negated C. */
   2682 #define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
   2683   ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
   2684       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2685       (__mmask32)(U), (int)(R)))
   2686 
   2687 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2688 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
   2689   return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
   2690                                                    (__v32hf)__C, (__mmask32)__U,
   2691                                                    _MM_FROUND_CUR_DIRECTION);
   2692 }
   2693 
   /* mask3 packed fmsubadd with rounding operand R: uses the dedicated
      __builtin_ia32_vfmsubaddph512_mask3 builtin on (A, B, C) with mask U. */
   2694 #define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
   2695   ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
   2696       (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
   2697       (__mmask32)(U), (int)(R)))
   2698 
   2699 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2700 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
   2701   return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
   2702       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
   2703       _MM_FROUND_CUR_DIRECTION);
   2704 }
   2705 
   /* Masked packed fnmadd with rounding operand R: the fmadd _mask builtin on
      (A, -B, C) with mask U. */
   2706 #define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
   2707   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2708       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
   2709       (__mmask32)(U), (int)(R)))
   2710 
   2711 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2712 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
   2713   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
   2714                                                   (__v32hf)__C, (__mmask32)__U,
   2715                                                   _MM_FROUND_CUR_DIRECTION);
   2716 }
   2717 
   /* Masked packed fnmsub with rounding operand R: the fmadd _mask builtin on
      (A, -B, -C) with mask U. */
   2718 #define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
   2719   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
   2720       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
   2721       (__mmask32)(U), (int)(R)))
   2722 
   /* mask3 packed fnmsub with rounding operand R: the fmsub _mask3 builtin on
      (-A, B, C) with mask U. */
   2723 #define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
   2724   ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
   2725       -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
   2726       (__mmask32)(U), (int)(R)))
   2727 
   2728 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2729 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
   2730   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
   2731                                                   -(__v32hf)__C, (__mmask32)__U,
   2732                                                   _MM_FROUND_CUR_DIRECTION);
   2733 }
   2734 
   2735 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   2736 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
   2737   return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
   2738                                                    (__v32hf)__C, (__mmask32)__U,
   2739                                                    _MM_FROUND_CUR_DIRECTION);
   2740 }
   2741 
   2742 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
   2743                                                              __m128h __A,
   2744                                                              __m128h __B) {
   2745   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
   2746                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
   2747 }
   2748 
   2749 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
   2750                                                                   __mmask8 __U,
   2751                                                                   __m128h __A,
   2752                                                                   __m128h __B) {
   2753   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
   2754                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2755 }
   2756 
   /* Unmasked scalar fmadd with rounding operand R:
      __builtin_ia32_vfmaddsh3_mask on (A, B, C) with an all-ones __mmask8. */
   2757 #define _mm_fmadd_round_sh(A, B, C, R)                                         \
   2758   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2759       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
   2760       (__mmask8)-1, (int)(R)))
   2761 
   /* Masked scalar fmadd with rounding operand R: same builtin on (W, A, B)
      with mask U. */
   2762 #define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
   2763   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2764       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
   2765       (__mmask8)(U), (int)(R)))
   2766 
   2767 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2768 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   2769   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
   2770                                         (__mmask8)__U,
   2771                                         _MM_FROUND_CUR_DIRECTION);
   2772 }
   2773 
   /* maskz scalar fmadd with rounding operand R:
      __builtin_ia32_vfmaddsh3_maskz on (A, B, C) with mask U. */
   2774 #define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
   2775   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
   2776       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
   2777       (__mmask8)(U), (int)(R)))
   2778 
   2779 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2780 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
   2781   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
   2782                                         (__mmask8)__U,
   2783                                         _MM_FROUND_CUR_DIRECTION);
   2784 }
   2785 
   /* mask3 scalar fmadd with rounding operand R:
      __builtin_ia32_vfmaddsh3_mask3 on (W, X, Y) with mask U. */
   2786 #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
   2787   ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
   2788       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
   2789       (__mmask8)(U), (int)(R)))
   2790 
   2791 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
   2792                                                              __m128h __A,
   2793                                                              __m128h __B) {
   2794   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
   2795                                                 -(__v8hf)__B, (__mmask8)-1,
   2796                                                 _MM_FROUND_CUR_DIRECTION);
   2797 }
   2798 
   2799 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
   2800                                                                   __mmask8 __U,
   2801                                                                   __m128h __A,
   2802                                                                   __m128h __B) {
   2803   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
   2804                                                 -(__v8hf)__B, (__mmask8)__U,
   2805                                                 _MM_FROUND_CUR_DIRECTION);
   2806 }
   2807 
   /* Unmasked scalar fmsub with rounding operand R: the fmadd _mask builtin on
      (A, B, -C) with an all-ones __mmask8. */
   2808 #define _mm_fmsub_round_sh(A, B, C, R)                                         \
   2809   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2810       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
   2811       (__mmask8)-1, (int)(R)))
   2812 
   /* Masked scalar fmsub with rounding operand R: the fmadd _mask builtin on
      (W, A, -B) with mask U. */
   2813 #define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
   2814   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2815       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
   2816       (__mmask8)(U), (int)(R)))
   2817 
   2818 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2819 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   2820   return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
   2821                                                  -(__v8hf)__C, (__mmask8)__U,
   2822                                                  _MM_FROUND_CUR_DIRECTION);
   2823 }
   2824 
   /* maskz scalar fmsub with rounding operand R: expands to
      __builtin_ia32_vfmaddsh3_maskz on (A, B, -C) with mask U, i.e. the fmadd
      builtin with the C operand negated.
      Every macro argument, including R, is parenthesized before its cast so
      that an expression argument is converted as a whole, matching the other
      *_round_sh macros in this header. */
   #define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                             \
     ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                 \
         (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),    \
         (__mmask8)(U), (int)(R)))
   2829 
   2830 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2831 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
   2832   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
   2833                                         (__mmask8)__U,
   2834                                         _MM_FROUND_CUR_DIRECTION);
   2835 }
   2836 
   /* mask3 scalar fmsub with rounding operand R: the dedicated
      __builtin_ia32_vfmsubsh3_mask3 builtin on (W, X, Y) with mask U. */
   2837 #define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
   2838   ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
   2839       (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
   2840       (__mmask8)(U), (int)(R)))
   2841 
   2842 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
   2843                                                               __m128h __A,
   2844                                                               __m128h __B) {
   2845   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
   2846                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
   2847 }
   2848 
   2849 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2850 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
   2851   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
   2852                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2853 }
   2854 
   /* Unmasked scalar fnmadd with rounding operand R: the fmadd _mask builtin on
      (A, -B, C) with an all-ones __mmask8. */
   2855 #define _mm_fnmadd_round_sh(A, B, C, R)                                        \
   2856   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2857       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
   2858       (__mmask8)-1, (int)(R)))
   2859 
   /* Masked scalar fnmadd with rounding operand R: the fmadd _mask builtin on
      (W, -A, B) with mask U. */
   2860 #define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
   2861   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2862       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
   2863       (__mmask8)(U), (int)(R)))
   2864 
   2865 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2866 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   2867   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
   2868                                         (__mmask8)__U,
   2869                                         _MM_FROUND_CUR_DIRECTION);
   2870 }
   2871 
   /* maskz scalar fnmadd with rounding operand R: the fmadd _maskz builtin on
      (A, -B, C) with mask U. */
   2872 #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
   2873   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
   2874       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
   2875       (__mmask8)(U), (int)(R)))
   2876 
   2877 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2878 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
   2879   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
   2880                                         (__mmask8)__U,
   2881                                         _MM_FROUND_CUR_DIRECTION);
   2882 }
   2883 
   /* mask3 negated multiply-add with a caller-supplied rounding argument R.
    * Expands to __builtin_ia32_vfmaddsh3_mask3 with operand X negated, mask U,
    * and R cast to int as the last argument. */
   2884 #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
   2885   ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
   2886       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
   2887       (__mmask8)(U), (int)(R)))
   2888 
   2889 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
   2890                                                               __m128h __A,
   2891                                                               __m128h __B) {
   2892   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
   2893                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
   2894 }
   2895 
   2896 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2897 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
   2898   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
   2899                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   2900 }
   2901 
   /* Unmasked negated multiply-subtract with a caller-supplied rounding
    * argument R. Expands to __builtin_ia32_vfmaddsh3_mask with operands B and C
    * both negated and an all-ones mask; R is cast to int and passed last. */
   2902 #define _mm_fnmsub_round_sh(A, B, C, R)                                        \
   2903   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2904       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
   2905       (__mmask8)-1, (int)(R)))
   2906 
   /* Mask form of the above: W is passed unchanged, A and B are negated, U is
    * the mask and R the rounding argument. */
   2907 #define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
   2908   ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
   2909       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
   2910       (__mmask8)(U), (int)(R)))
   2911 
   2912 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2913 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   2914   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
   2915                                         (__mmask8)__U,
   2916                                         _MM_FROUND_CUR_DIRECTION);
   2917 }
   2918 
   /* maskz negated multiply-subtract with a caller-supplied rounding argument
    * R. Expands to __builtin_ia32_vfmaddsh3_maskz with operands B and C both
    * negated, mask U, and R cast to int as the last argument. */
   2919 #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
   2920   ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
   2921       (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
   2922       (__mmask8)(U), (int)(R)))
   2923 
   2924 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2925 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
   2926   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
   2927                                         (__mmask8)__U,
   2928                                         _MM_FROUND_CUR_DIRECTION);
   2929 }
   2930 
   /* mask3 negated multiply-subtract with a caller-supplied rounding argument
    * R. Expands to the fmsub builtin (__builtin_ia32_vfmsubsh3_mask3) with only
    * operand X negated; Y is passed unchanged, U is the mask and R is cast to
    * int as the last argument. */
   2931 #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
   2932   ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
   2933       (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
   2934       (__mmask8)(U), (int)(R)))
   2935 
   2936 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
   2937                                                                __m128h __B,
   2938                                                                __m128h __C) {
   2939   return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
   2940                                                  (__v4sf)__C, (__mmask8)-1,
   2941                                                  _MM_FROUND_CUR_DIRECTION);
   2942 }
   2943 
   2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2945 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
   2946   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
   2947       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
   2948 }
   2949 
   2950 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2951 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   2952   return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
   2953                                                   (__v4sf)__C, (__mmask8)__U,
   2954                                                   _MM_FROUND_CUR_DIRECTION);
   2955 }
   2956 
   2957 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2958 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
   2959   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
   2960       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
   2961 }
   2962 
   /* Explicit-rounding variants of the scalar "fcmadd" wrappers. Each macro
    * casts A, B and C to __v4sf and passes R, cast to int, as the builtin's
    * last argument. */
   /* Unmasked: __builtin_ia32_vfcmaddcsh_mask with an all-ones mask. */
   2963 #define _mm_fcmadd_round_sch(A, B, C, R)                                       \
   2964   ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
   2965       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   2966       (__mmask8)-1, (int)(R)))
   2967 
   /* Mask form: __builtin_ia32_vfcmaddcsh_round_mask with mask U. */
   2968 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
   2969   ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
   2970       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   2971       (__mmask8)(U), (int)(R)))
   2972 
   /* maskz form: __builtin_ia32_vfcmaddcsh_maskz with mask U. */
   2973 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
   2974   ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
   2975       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   2976       (__mmask8)(U), (int)(R)))
   2977 
   /* mask3 form: __builtin_ia32_vfcmaddcsh_round_mask3 with mask U. */
   2978 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
   2979   ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
   2980       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   2981       (__mmask8)(U), (int)(R)))
   2982 
   2983 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
   2984                                                               __m128h __B,
   2985                                                               __m128h __C) {
   2986   return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
   2987                                                 (__v4sf)__C, (__mmask8)-1,
   2988                                                 _MM_FROUND_CUR_DIRECTION);
   2989 }
   2990 
   2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2992 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
   2993   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
   2994       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
   2995 }
   2996 
   2997 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   2998 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
   2999   return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
   3000                                                  (__v4sf)__C, (__mmask8)__U,
   3001                                                  _MM_FROUND_CUR_DIRECTION);
   3002 }
   3003 
   3004 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   3005 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
   3006   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
   3007       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
   3008 }
   3009 
   /* Explicit-rounding variants of the scalar "fmadd" wrappers. Each macro
    * casts A, B and C to __v4sf and passes R, cast to int, as the builtin's
    * last argument. */
   /* Unmasked: __builtin_ia32_vfmaddcsh_mask with an all-ones mask. */
   3010 #define _mm_fmadd_round_sch(A, B, C, R)                                        \
   3011   ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
   3012       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   3013       (__mmask8)-1, (int)(R)))
   3014 
   /* Mask form: __builtin_ia32_vfmaddcsh_round_mask with mask U. */
   3015 #define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
   3016   ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
   3017       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   3018       (__mmask8)(U), (int)(R)))
   3019 
   /* maskz form: __builtin_ia32_vfmaddcsh_maskz with mask U. */
   3020 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
   3021   ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
   3022       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   3023       (__mmask8)(U), (int)(R)))
   3024 
   /* mask3 form: __builtin_ia32_vfmaddcsh_round_mask3 with mask U. */
   3025 #define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
   3026   ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
   3027       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
   3028       (__mmask8)(U), (int)(R)))
   3029 
   3030 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
   3031                                                               __m128h __B) {
   3032   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
   3033       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
   3034       _MM_FROUND_CUR_DIRECTION);
   3035 }
   3036 
   3037 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   3038 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
   3039   return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
   3040                                                 (__v4sf)__W, (__mmask8)__U,
   3041                                                 _MM_FROUND_CUR_DIRECTION);
   3042 }
   3043 
   3044 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   3045 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
   3046   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
   3047       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
   3048       _MM_FROUND_CUR_DIRECTION);
   3049 }
   3050 
   /* Explicit-rounding variants of the scalar "fcmul" wrappers. All three
    * expand to __builtin_ia32_vfcmulcsh_mask and differ only in the third
    * operand and the mask; R is cast to int and passed last. */
   /* Unmasked: third operand from _mm_undefined_ph(), all-ones mask. */
   3051 #define _mm_fcmul_round_sch(A, B, R)                                           \
   3052   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
   3053       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
   3054       (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
   3055 
   /* Mask form: third operand W, mask U. */
   3056 #define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
   3057   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
   3058       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
   3059       (__mmask8)(U), (int)(R)))
   3060 
   /* maskz form: third operand from _mm_setzero_ph(), mask U. */
   3061 #define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
   3062   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
   3063       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
   3064       (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   3065 
   3066 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
   3067                                                              __m128h __B) {
   3068   return (__m128h)__builtin_ia32_vfmulcsh_mask(
   3069       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
   3070       _MM_FROUND_CUR_DIRECTION);
   3071 }
   3072 
   3073 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
   3074                                                                   __mmask8 __U,
   3075                                                                   __m128h __A,
   3076                                                                   __m128h __B) {
   3077   return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
   3078                                                (__v4sf)__W, (__mmask8)__U,
   3079                                                _MM_FROUND_CUR_DIRECTION);
   3080 }
   3081 
   3082 static __inline__ __m128h __DEFAULT_FN_ATTRS128
   3083 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
   3084   return (__m128h)__builtin_ia32_vfmulcsh_mask(
   3085       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
   3086       _MM_FROUND_CUR_DIRECTION);
   3087 }
   3088 
   /* Explicit-rounding variants of the scalar "fmul" wrappers. All three
    * expand to __builtin_ia32_vfmulcsh_mask and differ only in the third
    * operand and the mask; R is cast to int and passed last. */
   /* Unmasked: third operand from _mm_undefined_ph(), all-ones mask. */
   3089 #define _mm_fmul_round_sch(A, B, R)                                            \
   3090   ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
   3091       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
   3092       (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
   3093 
   /* Mask form: third operand W, mask U. */
   3094 #define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
   3095   ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
   3096       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
   3097       (__mmask8)(U), (int)(R)))
   3098 
   /* maskz form: third operand from _mm_setzero_ph(), mask U. */
   3099 #define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
   3100   ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
   3101       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
   3102       (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
   3103 
   3104 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
   3105                                                                  __m512h __B) {
   3106   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
   3107       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
   3108       _MM_FROUND_CUR_DIRECTION);
   3109 }
   3110 
   3111 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3112 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
   3113   return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
   3114                                                    (__v16sf)__W, (__mmask16)__U,
   3115                                                    _MM_FROUND_CUR_DIRECTION);
   3116 }
   3117 
   3118 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3119 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
   3120   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
   3121       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
   3122       _MM_FROUND_CUR_DIRECTION);
   3123 }
   3124 
   /* Explicit-rounding variants of the 512-bit "fcmul" wrappers. All three
    * expand to __builtin_ia32_vfcmulcph512_mask and differ only in the third
    * operand and the mask; R is cast to int and passed last. */
   /* Unmasked: third operand from _mm512_undefined_ph(), all-ones mask. */
   3125 #define _mm512_fcmul_round_pch(A, B, R)                                        \
   3126   ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
   3127       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
   3128       (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
   3129 
   /* Mask form: third operand W, mask U. */
   3130 #define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
   3131   ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
   3132       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
   3133       (__mmask16)(U), (int)(R)))
   3134 
   /* maskz form: third operand from _mm512_setzero_ph(), mask U. */
   3135 #define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
   3136   ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
   3137       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
   3138       (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
   3139 
   3140 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
   3141                                                                 __m512h __B) {
   3142   return (__m512h)__builtin_ia32_vfmulcph512_mask(
   3143       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
   3144       _MM_FROUND_CUR_DIRECTION);
   3145 }
   3146 
   3147 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3148 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
   3149   return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
   3150                                                   (__v16sf)__W, (__mmask16)__U,
   3151                                                   _MM_FROUND_CUR_DIRECTION);
   3152 }
   3153 
   3154 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3155 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
   3156   return (__m512h)__builtin_ia32_vfmulcph512_mask(
   3157       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
   3158       _MM_FROUND_CUR_DIRECTION);
   3159 }
   3160 
   /* Explicit-rounding variants of the 512-bit "fmul" wrappers. All three
    * expand to __builtin_ia32_vfmulcph512_mask and differ only in the third
    * operand and the mask; R is cast to int and passed last. */
   /* Unmasked: third operand from _mm512_undefined_ph(), all-ones mask. */
   3161 #define _mm512_fmul_round_pch(A, B, R)                                         \
   3162   ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
   3163       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
   3164       (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
   3165 
   /* Mask form: third operand W, mask U. */
   3166 #define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
   3167   ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
   3168       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
   3169       (__mmask16)(U), (int)(R)))
   3170 
   /* maskz form: third operand from _mm512_setzero_ph(), mask U. */
   3171 #define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
   3172   ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
   3173       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
   3174       (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
   3175 
   3176 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
   3177                                                                   __m512h __B,
   3178                                                                   __m512h __C) {
   3179   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
   3180       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
   3181       _MM_FROUND_CUR_DIRECTION);
   3182 }
   3183 
   3184 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3185 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
   3186   return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
   3187       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
   3188       _MM_FROUND_CUR_DIRECTION);
   3189 }
   3190 
   3191 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3192 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
   3193   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
   3194       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
   3195       _MM_FROUND_CUR_DIRECTION);
   3196 }
   3197 
   3198 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3199 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
   3200   return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
   3201       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
   3202       _MM_FROUND_CUR_DIRECTION);
   3203 }
   3204 
   /* Explicit-rounding variants of the 512-bit "fcmadd" wrappers. Each macro
    * casts A, B and C to __v16sf and passes R, cast to int, as the builtin's
    * last argument. */
   /* Unmasked: uses the mask3 builtin with an all-ones mask. */
   3205 #define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
   3206   ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
   3207       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3208       (__mmask16)-1, (int)(R)))
   3209 
   /* Mask form: __builtin_ia32_vfcmaddcph512_mask with mask U. */
   3210 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
   3211   ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
   3212       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3213       (__mmask16)(U), (int)(R)))
   3214 
   /* mask3 form: __builtin_ia32_vfcmaddcph512_mask3 with mask U. */
   3215 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
   3216   ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
   3217       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3218       (__mmask16)(U), (int)(R)))
   3219 
   /* maskz form: __builtin_ia32_vfcmaddcph512_maskz with mask U. */
   3220 #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
   3221   ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
   3222       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3223       (__mmask16)(U), (int)(R)))
   3224 
   3225 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
   3226                                                                  __m512h __B,
   3227                                                                  __m512h __C) {
   3228   return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
   3229                                                     (__v16sf)__C, (__mmask16)-1,
   3230                                                     _MM_FROUND_CUR_DIRECTION);
   3231 }
   3232 
   3233 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3234 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
   3235   return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
   3236                                                    (__v16sf)__C, (__mmask16)__U,
   3237                                                    _MM_FROUND_CUR_DIRECTION);
   3238 }
   3239 
   3240 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3241 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
   3242   return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
   3243       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
   3244       _MM_FROUND_CUR_DIRECTION);
   3245 }
   3246 
   3247 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3248 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
   3249   return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
   3250       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
   3251       _MM_FROUND_CUR_DIRECTION);
   3252 }
   3253 
   /* Explicit-rounding variants of the 512-bit "fmadd" wrappers. Each macro
    * casts A, B and C to __v16sf and passes R, cast to int, as the builtin's
    * last argument. */
   /* Unmasked: uses the mask3 builtin with an all-ones mask. */
   3254 #define _mm512_fmadd_round_pch(A, B, C, R)                                     \
   3255   ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
   3256       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3257       (__mmask16)-1, (int)(R)))
   3258 
   /* Mask form: __builtin_ia32_vfmaddcph512_mask with mask U. */
   3259 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
   3260   ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
   3261       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3262       (__mmask16)(U), (int)(R)))
   3263 
   /* mask3 form: __builtin_ia32_vfmaddcph512_mask3 with mask U. */
   3264 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
   3265   ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
   3266       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3267       (__mmask16)(U), (int)(R)))
   3268 
   /* maskz form: __builtin_ia32_vfmaddcph512_maskz with mask U. */
   3269 #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
   3270   ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
   3271       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
   3272       (__mmask16)(U), (int)(R)))
   3273 
   3274 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
   3275 _mm512_reduce_add_ph(__m512h __W) {
   3276   return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
   3277 }
   3278 
   3279 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
   3280 _mm512_reduce_mul_ph(__m512h __W) {
   3281   return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
   3282 }
   3283 
   3284 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
   3285 _mm512_reduce_max_ph(__m512h __V) {
   3286   return __builtin_ia32_reduce_fmax_ph512(__V);
   3287 }
   3288 
   3289 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
   3290 _mm512_reduce_min_ph(__m512h __V) {
   3291   return __builtin_ia32_reduce_fmin_ph512(__V);
   3292 }
   3293 
   3294 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3295 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
   3296   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
   3297                                               (__v32hf)__A);
   3298 }
   3299 
   3300 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3301 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
   3302   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
   3303                                                  (__v32hi)__B);
   3304 }
   3305 
   3306 static __inline__ __m512h __DEFAULT_FN_ATTRS512
   3307 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
   3308   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
   3309 }
   3310 
   3311 // The intrinsics below are aliases for the f*mul_*ch intrinsics.
   /* _mm512_*mul_pch: each name forwards its arguments unchanged to the
    * matching _mm512_*fmul_pch wrapper or macro. */
   3312 #define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
   3313 #define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
   3314 #define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
   3315 #define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
   3316 #define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
   3317   _mm512_mask_fmul_round_pch(W, U, A, B, R)
   3318 #define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
   3319   _mm512_maskz_fmul_round_pch(U, A, B, R)
   3320 
   /* _mm512_*cmul_pch: each name forwards its arguments unchanged to the
    * matching _mm512_*fcmul_pch wrapper or macro. */
   3321 #define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
   3322 #define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
   3323 #define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
   3324 #define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
   3325 #define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
   3326   _mm512_mask_fcmul_round_pch(W, U, A, B, R)
   3327 #define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
   3328   _mm512_maskz_fcmul_round_pch(U, A, B, R)
   3329 
   /* _mm_*mul_sch: each name forwards its arguments unchanged to the matching
    * _mm_*fmul_sch wrapper or macro. */
   3330 #define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
   3331 #define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
   3332 #define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
   3333 #define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
   3334 #define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
   3335   _mm_mask_fmul_round_sch(W, U, A, B, R)
   3336 #define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
   3337 
   /* _mm_*cmul_sch: each name forwards its arguments unchanged to the matching
    * _mm_*fcmul_sch wrapper or macro. */
   3338 #define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
   3339 #define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
   3340 #define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
   3341 #define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
   3342 #define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
   3343   _mm_mask_fcmul_round_sch(W, U, A, B, R)
   3344 #define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
   3345   _mm_maskz_fcmul_round_sch(U, A, B, R)
   3346 
   /* Drop the file-local attribute helper macros defined at the top of this
    * header so they are not visible after it. */
   3347 #undef __DEFAULT_FN_ATTRS128
   3348 #undef __DEFAULT_FN_ATTRS256
   3349 #undef __DEFAULT_FN_ATTRS512
   3350 
   /* NOTE(review): the two #endif lines presumably pair with the
    * `#ifndef __AVX512FP16INTRIN_H` guard and the outer `#ifdef __SSE2__`
    * opened at the top of the file -- confirm no other conditional is left
    * open in between. */
   3351 #endif
   3352 #endif