zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx512bf16intrin.h (10832B) - Raw


      1 /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
     11 #endif
     12 
     13 #ifdef __SSE2__
     14 
     15 #ifndef __AVX512BF16INTRIN_H
     16 #define __AVX512BF16INTRIN_H
     17 
// 64-byte, 64-byte-aligned vector of __bf16 elements; used below as the
// element-typed operand for the compiler builtins.
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
// Public 512-bit BF16 vector type taken and returned by the _mm512_*
// intrinsics in this header (same layout as __v32bf).
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
// Deprecated alias of __bf16; using it triggers the "use __bf16 instead"
// deprecation diagnostic.
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
     21 
// Attribute set for the 512-bit (_mm512_*) intrinsics in this header:
// __always_inline__, __nodebug__, the "avx512bf16,evex512" target features
// and a __min_vector_width__ of 512. Undefined again at the end of the header.
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
                 __min_vector_width__(512)))
// Attribute set for the scalar intrinsic in this header (_mm_cvtsbh_ss):
// __always_inline__, __nodebug__ and the "avx512bf16,no-evex512" target
// features. Undefined again at the end of the header.
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,no-evex512")))
     28 
     29 /// Convert One BF16 Data to One Single Float Data.
     30 ///
     31 /// \headerfile <x86intrin.h>
     32 ///
     33 /// This intrinsic does not correspond to a specific instruction.
     34 ///
     35 /// \param __A
     36 ///    A bfloat data.
     37 /// \returns A float data whose sign field and exponent field keep unchanged,
     38 ///    and fraction field is extended to 23 bits.
     39 static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
     40   return __builtin_ia32_cvtsbf162ss_32(__A);
     41 }
     42 
     43 /// Convert Two Packed Single Data to One Packed BF16 Data.
     44 ///
     45 /// \headerfile <x86intrin.h>
     46 ///
     47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
     48 ///
     49 /// \param __A
     50 ///    A 512-bit vector of [16 x float].
     51 /// \param __B
     52 ///    A 512-bit vector of [16 x float].
     53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
     54 ///    conversion of __B, and higher 256 bits come from conversion of __A.
     55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
     56 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
     57   return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
     58                                                     (__v16sf) __B);
     59 }
     60 
     61 /// Convert Two Packed Single Data to One Packed BF16 Data.
     62 ///
     63 /// \headerfile <x86intrin.h>
     64 ///
     65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
     66 ///
     67 /// \param __A
     68 ///    A 512-bit vector of [16 x float].
     69 /// \param __B
     70 ///    A 512-bit vector of [16 x float].
     71 /// \param __W
     72 ///    A 512-bit vector of [32 x bfloat].
     73 /// \param __U
     74 ///    A 32-bit mask value specifying what is chosen for each element.
     75 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
     76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
     77 ///    conversion of __B, and higher 256 bits come from conversion of __A.
     78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
     79 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
     80   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
     81                                         (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
     82                                         (__v32bf)__W);
     83 }
     84 
     85 /// Convert Two Packed Single Data to One Packed BF16 Data.
     86 ///
     87 /// \headerfile <x86intrin.h>
     88 ///
     89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
     90 ///
     91 /// \param __A
     92 ///    A 512-bit vector of [16 x float].
     93 /// \param __B
     94 ///    A 512-bit vector of [16 x float].
     95 /// \param __U
     96 ///    A 32-bit mask value specifying what is chosen for each element.
     97 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
     98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
     99 ///    conversion of __B, and higher 256 bits come from conversion of __A.
    100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
    102   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
    103                                         (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
    104                                         (__v32bf)_mm512_setzero_si512());
    105 }
    106 
    107 /// Convert Packed Single Data to Packed BF16 Data.
    108 ///
    109 /// \headerfile <x86intrin.h>
    110 ///
    111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
    112 ///
    113 /// \param __A
    114 ///    A 512-bit vector of [16 x float].
    115 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
    116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
    117 _mm512_cvtneps_pbh(__m512 __A) {
    118   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
    119                                               (__v16bf)_mm256_undefined_si256(),
    120                                               (__mmask16)-1);
    121 }
    122 
    123 /// Convert Packed Single Data to Packed BF16 Data.
    124 ///
    125 /// \headerfile <x86intrin.h>
    126 ///
    127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
    128 ///
    129 /// \param __A
    130 ///    A 512-bit vector of [16 x float].
    131 /// \param __W
    132 ///    A 256-bit vector of [16 x bfloat].
    133 /// \param __U
    134 ///    A 16-bit mask value specifying what is chosen for each element.
    135 ///    A 1 means conversion of __A. A 0 means element from __W.
    136 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
    137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
    138 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
    139   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
    140                                                         (__v16bf)__W,
    141                                                         (__mmask16)__U);
    142 }
    143 
    144 /// Convert Packed Single Data to Packed BF16 Data.
    145 ///
    146 /// \headerfile <x86intrin.h>
    147 ///
    148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
    149 ///
    150 /// \param __A
    151 ///    A 512-bit vector of [16 x float].
    152 /// \param __U
    153 ///    A 16-bit mask value specifying what is chosen for each element.
    154 ///    A 1 means conversion of __A. A 0 means element is zero.
    155 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
    156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
    157 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
    158   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
    159                                                 (__v16bf)_mm256_setzero_si256(),
    160                                                 (__mmask16)__U);
    161 }
    162 
    163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
    164 ///
    165 /// \headerfile <x86intrin.h>
    166 ///
    167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
    168 ///
    169 /// \param __A
    170 ///    A 512-bit vector of [32 x bfloat].
    171 /// \param __B
    172 ///    A 512-bit vector of [32 x bfloat].
    173 /// \param __D
    174 ///    A 512-bit vector of [16 x float].
    175 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
    176 ///  __A, __B and __D
    177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    178 _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
    179   return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
    180                                              (__v32bf) __A,
    181                                              (__v32bf) __B);
    182 }
    183 
    184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
    185 ///
    186 /// \headerfile <x86intrin.h>
    187 ///
    188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
    189 ///
    190 /// \param __A
    191 ///    A 512-bit vector of [32 x bfloat].
    192 /// \param __B
    193 ///    A 512-bit vector of [32 x bfloat].
    194 /// \param __D
    195 ///    A 512-bit vector of [16 x float].
    196 /// \param __U
    197 ///    A 16-bit mask value specifying what is chosen for each element.
    198 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
    199 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
    200 ///  __A, __B and __D
    201 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    202 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
    203   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
    204                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
    205                                        (__v16sf)__D);
    206 }
    207 
    208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
    209 ///
    210 /// \headerfile <x86intrin.h>
    211 ///
    212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
    213 ///
    214 /// \param __A
    215 ///    A 512-bit vector of [32 x bfloat].
    216 /// \param __B
    217 ///    A 512-bit vector of [32 x bfloat].
    218 /// \param __D
    219 ///    A 512-bit vector of [16 x float].
    220 /// \param __U
    221 ///    A 16-bit mask value specifying what is chosen for each element.
    222 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
    223 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
    224 ///  __A, __B and __D
    225 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    226 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
    227   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
    228                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
    229                                        (__v16sf)_mm512_setzero_si512());
    230 }
    231 
    232 /// Convert Packed BF16 Data to Packed float Data.
    233 ///
    234 /// \headerfile <x86intrin.h>
    235 ///
    236 /// \param __A
    237 ///    A 256-bit vector of [16 x bfloat].
    238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
    239 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
    240   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
    241       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
    242 }
    243 
    244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
    245 ///
    246 /// \headerfile <x86intrin.h>
    247 ///
    248 /// \param __U
    249 ///    A 16-bit mask. Elements are zeroed out when the corresponding mask
    250 ///    bit is not set.
    251 /// \param __A
    252 ///    A 256-bit vector of [16 x bfloat].
    253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
    254 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    255 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
    256   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
    257       (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
    258 }
    259 
    260 /// Convert Packed BF16 Data to Packed float Data using merging mask.
    261 ///
    262 /// \headerfile <x86intrin.h>
    263 ///
    264 /// \param __S
    265 ///    A 512-bit vector of [16 x float]. Elements are copied from __S when
    266 ///     the corresponding mask bit is not set.
    267 /// \param __U
    268 ///    A 16-bit mask.
    269 /// \param __A
    270 ///    A 256-bit vector of [16 x bfloat].
    271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
    272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    273 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
    274   return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
    275       (__m512i)__S, (__mmask16)__U,
    276       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
    277 }
    278 
    279 #undef __DEFAULT_FN_ATTRS
    280 #undef __DEFAULT_FN_ATTRS512
    281 
    282 #endif
    283 #endif