zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx10_2_512bf16intrin.h (23418B) - Raw


      1 /*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error                                                                         \
     11     "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
     12 #endif
     13 
     14 #ifdef __SSE2__
     15 
     16 #ifndef __AVX10_2_512BF16INTRIN_H
     17 #define __AVX10_2_512BF16INTRIN_H
     18 
/* 64-byte vector of __bf16 declared with 1-byte alignment.  It is the member
 * type of the packed structs used by _mm512_loadu_pbh and _mm512_storeu_pbh
 * in this file. */
typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
     21 
/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, target feature "avx10.2-512", and a minimum vector
 * width of 512.  The macro is #undef'd again at the end of this header. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"),    \
                 __min_vector_width__(512)))
     26 
     27 static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
     28   return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
     29 }
     30 
     31 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
     32   return (__m512bh)__builtin_ia32_undef512();
     33 }
     34 
     35 static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
     36   return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
     37                              bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
     38                              bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
     39 }
     40 
     41 static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
     42     __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
     43     __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
     44     __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
     45     __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
     46     __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
     47     __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
     48   return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
     49                              bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
     50                              bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
     51                              bf8,  bf7,  bf6,  bf5,  bf4,  bf3,  bf2,  bf1};
     52 }
     53 
     54 #define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10,     \
     55                         bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19,  \
     56                         bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28,  \
     57                         bf29, bf30, bf31, bf32)                                \
     58   _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26),       \
     59                  (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19),       \
     60                  (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12),       \
     61                  (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4),     \
     62                  (bf3), (bf2), (bf1))
     63 
     64 static __inline__ __m512 __DEFAULT_FN_ATTRS512
     65 _mm512_castbf16_ps(__m512bh __a) {
     66   return (__m512)__a;
     67 }
     68 
     69 static __inline__ __m512d __DEFAULT_FN_ATTRS512
     70 _mm512_castbf16_pd(__m512bh __a) {
     71   return (__m512d)__a;
     72 }
     73 
     74 static __inline__ __m512i __DEFAULT_FN_ATTRS512
     75 _mm512_castbf16_si512(__m512bh __a) {
     76   return (__m512i)__a;
     77 }
     78 
     79 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
     80   return (__m512bh)__a;
     81 }
     82 
     83 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
     84 _mm512_castpd_pbh(__m512d __a) {
     85   return (__m512bh)__a;
     86 }
     87 
     88 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
     89 _mm512_castsi512_pbh(__m512i __a) {
     90   return (__m512bh)__a;
     91 }
     92 
     93 static __inline__ __m128bh __DEFAULT_FN_ATTRS512
     94 _mm512_castbf16512_pbh128(__m512bh __a) {
     95   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
     96 }
     97 
     98 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
     99 _mm512_castbf16512_pbh256(__m512bh __a) {
    100   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
    101                                  12, 13, 14, 15);
    102 }
    103 
    104 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    105 _mm512_castbf16128_pbh512(__m128bh __a) {
    106   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
    107                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    108                                  -1, -1, -1, -1, -1, -1, -1, -1, -1);
    109 }
    110 
    111 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    112 _mm512_castbf16256_pbh512(__m256bh __a) {
    113   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
    114                                  12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
    115                                  -1, -1, -1, -1, -1, -1, -1, -1);
    116 }
    117 
    118 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    119 _mm512_zextbf16128_pbh512(__m128bh __a) {
    120   return __builtin_shufflevector(
    121       __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
    122       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
    123 }
    124 
    125 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    126 _mm512_zextbf16256_pbh512(__m256bh __a) {
    127   return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
    128                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    129                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
    130                                  29, 30, 31);
    131 }
    132 
    133 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
    134   return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
    135                                     (__m512i)__A);
    136 }
    137 
    138 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    139 _mm512_load_pbh(void const *__p) {
    140   return *(const __m512bh *)__p;
    141 }
    142 
    143 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    144 _mm512_loadu_pbh(void const *__p) {
    145   struct __loadu_pbh {
    146     __m512bh_u __v;
    147   } __attribute__((__packed__, __may_alias__));
    148   return ((const struct __loadu_pbh *)__p)->__v;
    149 }
    150 
    151 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
    152                                                               __m512bh __A) {
    153   *(__m512bh *)__P = __A;
    154 }
    155 
    156 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
    157                                                                __m512bh __A) {
    158   struct __storeu_pbh {
    159     __m512bh_u __v;
    160   } __attribute__((__packed__, __may_alias__));
    161   ((struct __storeu_pbh *)__P)->__v = __A;
    162 }
    163 
    164 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    165 _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
    166   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
    167                                                 (__v32bf)__A);
    168 }
    169 
    170 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    171 _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
    172   return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
    173                                                   (__v32hi)__B);
    174 }
    175 
    176 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    177 _mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
    178   return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
    179 }
    180 
    181 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_add_pbh(__m512bh __A,
    182                                                                 __m512bh __B) {
    183   return (__m512bh)((__v32bf)__A + (__v32bf)__B);
    184 }
    185 
    186 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    187 _mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    188   return (__m512bh)__builtin_ia32_selectpbf_512(
    189       (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W);
    190 }
    191 
    192 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    193 _mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    194   return (__m512bh)__builtin_ia32_selectpbf_512(
    195       (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B),
    196       (__v32bf)_mm512_setzero_pbh());
    197 }
    198 
    199 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sub_pbh(__m512bh __A,
    200                                                                 __m512bh __B) {
    201   return (__m512bh)((__v32bf)__A - (__v32bf)__B);
    202 }
    203 
    204 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    205 _mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    206   return (__m512bh)__builtin_ia32_selectpbf_512(
    207       (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W);
    208 }
    209 
    210 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    211 _mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    212   return (__m512bh)__builtin_ia32_selectpbf_512(
    213       (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B),
    214       (__v32bf)_mm512_setzero_pbh());
    215 }
    216 
    217 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mul_pbh(__m512bh __A,
    218                                                                 __m512bh __B) {
    219   return (__m512bh)((__v32bf)__A * (__v32bf)__B);
    220 }
    221 
    222 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    223 _mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    224   return (__m512bh)__builtin_ia32_selectpbf_512(
    225       (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W);
    226 }
    227 
    228 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    229 _mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    230   return (__m512bh)__builtin_ia32_selectpbf_512(
    231       (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B),
    232       (__v32bf)_mm512_setzero_pbh());
    233 }
    234 
    235 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_div_pbh(__m512bh __A,
    236                                                                 __m512bh __B) {
    237   return (__m512bh)((__v32bf)__A / (__v32bf)__B);
    238 }
    239 
    240 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    241 _mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    242   return (__m512bh)__builtin_ia32_selectpbf_512(
    243       (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W);
    244 }
    245 
    246 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    247 _mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    248   return (__m512bh)__builtin_ia32_selectpbf_512(
    249       (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B),
    250       (__v32bf)_mm512_setzero_pbh());
    251 }
    252 
    253 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A,
    254                                                                 __m512bh __B) {
    255   return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B);
    256 }
    257 
    258 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    259 _mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    260   return (__m512bh)__builtin_ia32_selectpbf_512(
    261       (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
    262 }
    263 
    264 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    265 _mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    266   return (__m512bh)__builtin_ia32_selectpbf_512(
    267       (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
    268       (__v32bf)_mm512_setzero_pbh());
    269 }
    270 
    271 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A,
    272                                                                 __m512bh __B) {
    273   return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B);
    274 }
    275 
    276 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    277 _mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    278   return (__m512bh)__builtin_ia32_selectpbf_512(
    279       (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
    280 }
    281 
    282 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    283 _mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    284   return (__m512bh)__builtin_ia32_selectpbf_512(
    285       (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
    286       (__v32bf)_mm512_setzero_pbh());
    287 }
    288 
    289 #define _mm512_cmp_pbh_mask(__A, __B, __P)                                     \
    290   ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A),        \
    291                                               (__v32bf)(__m512bh)(__B),        \
    292                                               (int)(__P), (__mmask32) - 1))
    293 
    294 #define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P)                           \
    295   ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A),        \
    296                                               (__v32bf)(__m512bh)(__B),        \
    297                                               (int)(__P), (__mmask32)(__U)))
    298 
    299 #define _mm512_mask_fpclass_pbh_mask(__U, __A, imm)                            \
    300   ((__mmask32)__builtin_ia32_vfpclassbf16512_mask(                             \
    301       (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U)))
    302 
    303 #define _mm512_fpclass_pbh_mask(__A, imm)                                      \
    304   ((__mmask32)__builtin_ia32_vfpclassbf16512_mask(                             \
    305       (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32) - 1))
    306 
    307 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    308 _mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
    309   return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
    310       (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
    311       (__mmask32)-1);
    312 }
    313 
    314 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
    315     __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
    316   return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
    317       (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
    318 }
    319 
    320 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    321 _mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
    322   return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
    323       (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
    324       (__mmask32)__U);
    325 }
    326 
    327 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
    328   return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
    329       (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
    330 }
    331 
    332 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    333 _mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
    334   return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W,
    335                                                    (__mmask32)__U);
    336 }
    337 
    338 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    339 _mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
    340   return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
    341       (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
    342 }
    343 
    344 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    345 _mm512_getexp_pbh(__m512bh __A) {
    346   return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
    347       (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
    348 }
    349 
    350 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    351 _mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
    352   return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
    353       (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
    354 }
    355 
    356 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    357 _mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
    358   return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
    359       (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
    360 }
    361 
    362 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    363 _mm512_rsqrt_pbh(__m512bh __A) {
    364   return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
    365       (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
    366 }
    367 
    368 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    369 _mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
    370   return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W,
    371                                                      (__mmask32)__U);
    372 }
    373 
    374 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    375 _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
    376   return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
    377       (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
    378 }
    379 
    380 #define _mm512_reduce_pbh(__A, imm)                                            \
    381   ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
    382       (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(),   \
    383       (__mmask32) - 1))
    384 
    385 #define _mm512_mask_reduce_pbh(__W, __U, __A, imm)                             \
    386   ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
    387       (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W),          \
    388       (__mmask32)(__U)))
    389 
    390 #define _mm512_maskz_reduce_pbh(__U, __A, imm)                                 \
    391   ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
    392       (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
    393       (__mmask32)(__U)))
    394 
    395 #define _mm512_roundscale_pbh(__A, imm)                                        \
    396   ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
    397       (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
    398       (__mmask32) - 1))
    399 
    400 #define _mm512_mask_roundscale_pbh(__W, __U, __A, imm)                         \
    401   ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
    402       (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W),          \
    403       (__mmask32)(__U)))
    404 
    405 #define _mm512_maskz_roundscale_pbh(__U, __A, imm)                             \
    406   ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
    407       (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
    408       (__mmask32)(__U)))
    409 
    410 #define _mm512_getmant_pbh(__A, __B, __C)                                      \
    411   ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
    412       (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
    413       (__v32bf)_mm512_undefined_pbh(), (__mmask32) - 1))
    414 
    415 #define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C)                       \
    416   ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
    417       (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
    418       (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))
    419 
    420 #define _mm512_maskz_getmant_pbh(__U, __A, __B, __C)                           \
    421   ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
    422       (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
    423       (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
    424 
    425 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
    426   return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
    427 }
    428 
    429 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    430 _mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
    431   return (__m512bh)__builtin_ia32_selectpbf_512(
    432       (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
    433 }
    434 
    435 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    436 _mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
    437   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
    438                                                 (__v32bf)_mm512_sqrt_pbh(__A),
    439                                                 (__v32bf)_mm512_setzero_pbh());
    440 }
    441 
    442 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    443 _mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
    444   return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
    445                                                  (__v32bf)__C);
    446 }
    447 
    448 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    449 _mm512_mask_fmadd_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
    450   return (__m512bh)__builtin_ia32_selectpbf_512(
    451       (__mmask32)__U,
    452       _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
    453 }
    454 
    455 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh(
    456     __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
    457   return (__m512bh)__builtin_ia32_selectpbf_512(
    458       (__mmask32)__U,
    459       _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
    460 }
    461 
    462 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
    463     __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
    464   return (__m512bh)__builtin_ia32_selectpbf_512(
    465       (__mmask32)__U,
    466       _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    467       (__v32bf)_mm512_setzero_pbh());
    468 }
    469 
    470 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    471 _mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
    472   return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
    473                                                  -(__v32bf)__C);
    474 }
    475 
    476 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    477 _mm512_mask_fmsub_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
    478   return (__m512bh)__builtin_ia32_selectpbf_512(
    479       (__mmask32)__U,
    480       _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
    481 }
    482 
    483 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh(
    484     __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
    485   return (__m512bh)__builtin_ia32_selectpbf_512(
    486       (__mmask32)__U,
    487       _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
    488 }
    489 
    490 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
    491     __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
    492   return (__m512bh)__builtin_ia32_selectpbf_512(
    493       (__mmask32)__U,
    494       _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    495       (__v32bf)_mm512_setzero_pbh());
    496 }
    497 
    498 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    499 _mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
    500   return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
    501                                                  (__v32bf)__C);
    502 }
    503 
    504 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
    505     __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
    506   return (__m512bh)__builtin_ia32_selectpbf_512(
    507       (__mmask32)__U,
    508       _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    509       (__v32bf)__A);
    510 }
    511 
    512 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh(
    513     __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
    514   return (__m512bh)__builtin_ia32_selectpbf_512(
    515       (__mmask32)__U,
    516       _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    517       (__v32bf)__C);
    518 }
    519 
    520 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
    521     __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
    522   return (__m512bh)__builtin_ia32_selectpbf_512(
    523       (__mmask32)__U,
    524       _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    525       (__v32bf)_mm512_setzero_pbh());
    526 }
    527 
    528 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    529 _mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
    530   return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
    531                                                  -(__v32bf)__C);
    532 }
    533 
    534 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
    535     __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
    536   return (__m512bh)__builtin_ia32_selectpbf_512(
    537       (__mmask32)__U,
    538       _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    539       (__v32bf)__A);
    540 }
    541 
    542 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh(
    543     __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
    544   return (__m512bh)__builtin_ia32_selectpbf_512(
    545       (__mmask32)__U,
    546       _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    547       (__v32bf)__C);
    548 }
    549 
    550 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
    551     __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
    552   return (__m512bh)__builtin_ia32_selectpbf_512(
    553       (__mmask32)__U,
    554       _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
    555       (__v32bf)_mm512_setzero_pbh());
    556 }
    557 
    558 #undef __DEFAULT_FN_ATTRS512
    559 
    560 #endif
    561 #endif