zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avxneconvertintrin.h (14432B) - Raw


      1 /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error                                                                         \
     12     "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
     13 #endif // __IMMINTRIN_H
     14 
     15 #ifdef __SSE2__
     16 
     17 #ifndef __AVXNECONVERTINTRIN_H
     18 #define __AVXNECONVERTINTRIN_H
     19 
     20 /* Default attributes for this file's functions; 128/256 = min vector width. */
     21 #define __DEFAULT_FN_ATTRS128                                                  \
     22   __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
     23                  __min_vector_width__(128)))
     24 #define __DEFAULT_FN_ATTRS256                                                  \
     25   __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
     26                  __min_vector_width__(256)))
     27 
     28 /// Convert the scalar BF16 (16-bit) floating-point element stored at the
     29 /// memory location \a __A to a single-precision (32-bit) floating-point
     30 /// value, broadcast it to all four packed single-precision (32-bit)
     31 /// floating-point elements, and return the result (named \a dst in the
     32 /// pseudocode below).
     33 ///
     34 /// \headerfile <x86intrin.h>
     35 ///
     36 /// \code
     37 /// __m128 _mm_bcstnebf16_ps(const void *__A);
     38 /// \endcode
     39 ///
     40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
     41 ///
     42 /// \param __A
     43 ///    A pointer to a 16-bit memory location holding the BF16 value.
     44 ///    The address of the memory location does not have to be aligned.
     45 /// \returns
     46 ///    A 128-bit vector of [4 x float]; all elements hold the converted value.
     47 ///
     48 /// \code{.operation}
     49 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
     50 /// FOR j := 0 to 3
     51 ///   m := j*32
     52 ///   dst[m+31:m] := b
     53 /// ENDFOR
     54 /// dst[MAX:128] := 0
     55 /// \endcode
     56 static __inline__ __m128 __DEFAULT_FN_ATTRS128
     57 _mm_bcstnebf16_ps(const void *__A) {
     58   return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
     59 }
     60 
     61 /// Convert the scalar BF16 (16-bit) floating-point element stored at the
     62 /// memory location \a __A to a single-precision (32-bit) floating-point
     63 /// value, broadcast it to all eight packed single-precision (32-bit)
     64 /// floating-point elements, and return the result (named \a dst in the
     65 /// pseudocode below).
     66 ///
     67 /// \headerfile <x86intrin.h>
     68 ///
     69 /// \code
     70 /// __m256 _mm256_bcstnebf16_ps(const void *__A);
     71 /// \endcode
     72 ///
     73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
     74 ///
     75 /// \param __A
     76 ///    A pointer to a 16-bit memory location holding the BF16 value.
     77 ///    The address of the memory location does not have to be aligned.
     78 /// \returns
     79 ///    A 256-bit vector of [8 x float]; all elements hold the converted value.
     80 ///
     81 /// \code{.operation}
     82 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
     83 /// FOR j := 0 to 7
     84 ///   m := j*32
     85 ///   dst[m+31:m] := b
     86 /// ENDFOR
     87 /// dst[MAX:256] := 0
     88 /// \endcode
     89 static __inline__ __m256 __DEFAULT_FN_ATTRS256
     90 _mm256_bcstnebf16_ps(const void *__A) {
     91   return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
     92 }
     93 
     94 /// Convert the scalar half-precision (16-bit) floating-point element stored
     95 /// at the memory location \a __A to a single-precision (32-bit)
     96 /// floating-point value, broadcast it to all four packed single-precision
     97 /// (32-bit) floating-point elements, and return the result (named \a dst in
     98 /// the pseudocode below).
     99 ///
    100 /// \headerfile <x86intrin.h>
    101 ///
    102 /// \code
    103 /// __m128 _mm_bcstnesh_ps(const void *__A);
    104 /// \endcode
    105 ///
    106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
    107 ///
    108 /// \param __A
    109 ///    A pointer to a 16-bit memory location holding the half-precision value.
    110 ///    The address of the memory location does not have to be aligned.
    111 /// \returns
    112 ///    A 128-bit vector of [4 x float]; all elements hold the converted value.
    113 ///
    114 /// \code{.operation}
    115 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
    116 /// FOR j := 0 to 3
    117 ///   m := j*32
    118 ///   dst[m+31:m] := b
    119 /// ENDFOR
    120 /// dst[MAX:128] := 0
    121 /// \endcode
    122 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    123 _mm_bcstnesh_ps(const void *__A) {
    124   return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
    125 }
    126 
    127 /// Convert the scalar half-precision (16-bit) floating-point element stored
    128 /// at the memory location \a __A to a single-precision (32-bit)
    129 /// floating-point value, broadcast it to all eight packed single-precision
    130 /// (32-bit) floating-point elements, and return the result (named \a dst in
    131 /// the pseudocode below).
    132 ///
    133 /// \headerfile <x86intrin.h>
    134 ///
    135 /// \code
    136 /// __m256 _mm256_bcstnesh_ps(const void *__A);
    137 /// \endcode
    138 ///
    139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
    140 ///
    141 /// \param __A
    142 ///    A pointer to a 16-bit memory location holding the half-precision value.
    143 ///    The address of the memory location does not have to be aligned.
    144 /// \returns
    145 ///    A 256-bit vector of [8 x float]; all elements hold the converted value.
    146 ///
    147 /// \code{.operation}
    148 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
    149 /// FOR j := 0 to 7
    150 ///   m := j*32
    151 ///   dst[m+31:m] := b
    152 /// ENDFOR
    153 /// dst[MAX:256] := 0
    154 /// \endcode
    155 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    156 _mm256_bcstnesh_ps(const void *__A) {
    157   return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
    158 }
    159 
    160 /// Convert the even-indexed BF16 (16-bit) floating-point elements
    161 /// (0, 2, 4, 6) of the 128-bit block stored at memory location \a __A to
    162 /// packed single-precision (32-bit) floating-point elements, and return the
    163 /// result (named \a dst in the pseudocode below).
    164 ///
    165 /// \headerfile <x86intrin.h>
    166 ///
    167 /// \code
    168 /// __m128 _mm_cvtneebf16_ps(const __m128bh *__A);
    169 /// \endcode
    170 ///
    171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
    172 ///
    173 /// \param __A
    174 ///    A pointer to a 128-bit memory location containing 8 consecutive
    175 ///    BF16 (16-bit) floating-point values.
    176 /// \returns
    177 ///    A 128-bit vector of [4 x float].
    178 ///
    179 /// \code{.operation}
    180 /// FOR j := 0 to 3
    181 ///   k := j*2
    182 ///   i := k*16
    183 ///   m := j*32
    184 ///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
    185 /// ENDFOR
    186 /// dst[MAX:128] := 0
    187 /// \endcode
    188 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    189 _mm_cvtneebf16_ps(const __m128bh *__A) {
    190   return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
    191 }
    192 
    193 /// Convert the even-indexed BF16 (16-bit) floating-point elements
    194 /// (0, 2, ..., 14) of the 256-bit block stored at memory location \a __A to
    195 /// packed single-precision (32-bit) floating-point elements, and return the
    196 /// result (named \a dst in the pseudocode below).
    197 ///
    198 /// \headerfile <x86intrin.h>
    199 ///
    200 /// \code
    201 /// __m256 _mm256_cvtneebf16_ps(const __m256bh *__A);
    202 /// \endcode
    203 ///
    204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
    205 ///
    206 /// \param __A
    207 ///    A pointer to a 256-bit memory location containing 16 consecutive
    208 ///    BF16 (16-bit) floating-point values.
    209 /// \returns
    210 ///    A 256-bit vector of [8 x float].
    211 ///
    212 /// \code{.operation}
    213 /// FOR j := 0 to 7
    214 ///   k := j*2
    215 ///   i := k*16
    216 ///   m := j*32
    217 ///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
    218 /// ENDFOR
    219 /// dst[MAX:256] := 0
    220 /// \endcode
    221 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    222 _mm256_cvtneebf16_ps(const __m256bh *__A) {
    223   return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
    224 }
    225 
    226 /// Convert the even-indexed half-precision (16-bit) floating-point elements
    227 /// (0, 2, 4, 6) of the 128-bit block stored at memory location \a __A to
    228 /// packed single-precision (32-bit) floating-point elements, and return the
    229 /// result (named \a dst in the pseudocode below).
    230 ///
    231 /// \headerfile <x86intrin.h>
    232 ///
    233 /// \code
    234 /// __m128 _mm_cvtneeph_ps(const __m128h *__A);
    235 /// \endcode
    236 ///
    237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
    238 ///
    239 /// \param __A
    240 ///    A pointer to a 128-bit memory location containing 8 consecutive
    241 ///    half-precision (16-bit) floating-point values.
    242 /// \returns
    243 ///    A 128-bit vector of [4 x float].
    244 ///
    245 /// \code{.operation}
    246 /// FOR j := 0 to 3
    247 ///   k := j*2
    248 ///   i := k*16
    249 ///   m := j*32
    250 ///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
    251 /// ENDFOR
    252 /// dst[MAX:128] := 0
    253 /// \endcode
    254 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    255 _mm_cvtneeph_ps(const __m128h *__A) {
    256   return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
    257 }
    258 
    259 /// Convert the even-indexed half-precision (16-bit) floating-point elements
    260 /// (0, 2, ..., 14) of the 256-bit block stored at memory location \a __A to
    261 /// packed single-precision (32-bit) floating-point elements, and return the
    262 /// result (named \a dst in the pseudocode below).
    263 ///
    264 /// \headerfile <x86intrin.h>
    265 ///
    266 /// \code
    267 /// __m256 _mm256_cvtneeph_ps(const __m256h *__A);
    268 /// \endcode
    269 ///
    270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
    271 ///
    272 /// \param __A
    273 ///    A pointer to a 256-bit memory location containing 16 consecutive
    274 ///    half-precision (16-bit) floating-point values.
    275 /// \returns
    276 ///    A 256-bit vector of [8 x float].
    277 ///
    278 /// \code{.operation}
    279 /// FOR j := 0 to 7
    280 ///   k := j*2
    281 ///   i := k*16
    282 ///   m := j*32
    283 ///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
    284 /// ENDFOR
    285 /// dst[MAX:256] := 0
    286 /// \endcode
    287 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    288 _mm256_cvtneeph_ps(const __m256h *__A) {
    289   return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
    290 }
    291 
    292 /// Convert the odd-indexed BF16 (16-bit) floating-point elements
    293 /// (1, 3, 5, 7) of the 128-bit block stored at memory location \a __A to
    294 /// packed single-precision (32-bit) floating-point elements, and return the
    295 /// result (named \a dst in the pseudocode below).
    296 ///
    297 /// \headerfile <x86intrin.h>
    298 ///
    299 /// \code
    300 /// __m128 _mm_cvtneobf16_ps(const __m128bh *__A);
    301 /// \endcode
    302 ///
    303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
    304 ///
    305 /// \param __A
    306 ///    A pointer to a 128-bit memory location containing 8 consecutive
    307 ///    BF16 (16-bit) floating-point values.
    308 /// \returns
    309 ///    A 128-bit vector of [4 x float].
    310 ///
    311 /// \code{.operation}
    312 /// FOR j := 0 to 3
    313 ///   k := j*2+1
    314 ///   i := k*16
    315 ///   m := j*32
    316 ///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
    317 /// ENDFOR
    318 /// dst[MAX:128] := 0
    319 /// \endcode
    320 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    321 _mm_cvtneobf16_ps(const __m128bh *__A) {
    322   return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
    323 }
    324 
    325 /// Convert the odd-indexed BF16 (16-bit) floating-point elements
    326 /// (1, 3, ..., 15) of the 256-bit block stored at memory location \a __A to
    327 /// packed single-precision (32-bit) floating-point elements, and return the
    328 /// result (named \a dst in the pseudocode below).
    329 ///
    330 /// \headerfile <x86intrin.h>
    331 ///
    332 /// \code
    333 /// __m256 _mm256_cvtneobf16_ps(const __m256bh *__A);
    334 /// \endcode
    335 ///
    336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
    337 ///
    338 /// \param __A
    339 ///    A pointer to a 256-bit memory location containing 16 consecutive
    340 ///    BF16 (16-bit) floating-point values.
    341 /// \returns
    342 ///    A 256-bit vector of [8 x float].
    343 ///
    344 /// \code{.operation}
    345 /// FOR j := 0 to 7
    346 ///   k := j*2+1
    347 ///   i := k*16
    348 ///   m := j*32
    349 ///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
    350 /// ENDFOR
    351 /// dst[MAX:256] := 0
    352 /// \endcode
    353 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    354 _mm256_cvtneobf16_ps(const __m256bh *__A) {
    355   return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
    356 }
    357 
    358 /// Convert the odd-indexed half-precision (16-bit) floating-point elements
    359 /// (1, 3, 5, 7) of the 128-bit block stored at memory location \a __A to
    360 /// packed single-precision (32-bit) floating-point elements, and return the
    361 /// result (named \a dst in the pseudocode below).
    362 ///
    363 /// \headerfile <x86intrin.h>
    364 ///
    365 /// \code
    366 /// __m128 _mm_cvtneoph_ps(const __m128h *__A);
    367 /// \endcode
    368 ///
    369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
    370 ///
    371 /// \param __A
    372 ///    A pointer to a 128-bit memory location containing 8 consecutive
    373 ///    half-precision (16-bit) floating-point values.
    374 /// \returns
    375 ///    A 128-bit vector of [4 x float].
    376 ///
    377 /// \code{.operation}
    378 /// FOR j := 0 to 3
    379 ///   k := j*2+1
    380 ///   i := k*16
    381 ///   m := j*32
    382 ///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
    383 /// ENDFOR
    384 /// dst[MAX:128] := 0
    385 /// \endcode
    386 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    387 _mm_cvtneoph_ps(const __m128h *__A) {
    388   return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
    389 }
    390 
    391 /// Convert the odd-indexed half-precision (16-bit) floating-point elements
    392 /// (1, 3, ..., 15) of the 256-bit block stored at memory location \a __A to
    393 /// packed single-precision (32-bit) floating-point elements, and return the
    394 /// result (named \a dst in the pseudocode below).
    395 ///
    396 /// \headerfile <x86intrin.h>
    397 ///
    398 /// \code
    399 /// __m256 _mm256_cvtneoph_ps(const __m256h *__A);
    400 /// \endcode
    401 ///
    402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
    403 ///
    404 /// \param __A
    405 ///    A pointer to a 256-bit memory location containing 16 consecutive
    406 ///    half-precision (16-bit) floating-point values.
    407 /// \returns
    408 ///    A 256-bit vector of [8 x float].
    409 ///
    410 /// \code{.operation}
    411 /// FOR j := 0 to 7
    412 ///   k := j*2+1
    413 ///   i := k*16
    414 ///   m := j*32
    415 ///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
    416 /// ENDFOR
    417 /// dst[MAX:256] := 0
    418 /// \endcode
    419 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    420 _mm256_cvtneoph_ps(const __m256h *__A) {
    421   return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
    422 }
    423 
    424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
    425 /// to packed BF16 (16-bit) floating-point elements, and return the result
    426 /// (named \a dst in the pseudocode below).
    427 ///
    428 /// \headerfile <x86intrin.h>
    429 ///
    430 /// \code
    431 /// __m128bh _mm_cvtneps_avx_pbh(__m128 __A);
    432 /// \endcode
    433 ///
    434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
    435 ///
    436 /// \param __A
    437 ///    A 128-bit vector of [4 x float].
    438 /// \returns
    439 ///    A 128-bit vector of [8 x bfloat]; the four results occupy elements 0 to 3.
    440 ///
    441 /// \code{.operation}
    442 /// FOR j := 0 to 3
    443 ///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
    444 /// ENDFOR
    445 /// dst[MAX:128] := 0
    446 /// \endcode
    447 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
    448 _mm_cvtneps_avx_pbh(__m128 __A) {
    449   return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
    450 }
    451 
    452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
    453 /// to packed BF16 (16-bit) floating-point elements, and return the result
    454 /// (named \a dst in the pseudocode below).
    455 ///
    456 /// \headerfile <x86intrin.h>
    457 ///
    458 /// \code
    459 /// __m128bh _mm256_cvtneps_avx_pbh(__m256 __A);
    460 /// \endcode
    461 ///
    462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
    463 ///
    464 /// \param __A
    465 ///    A 256-bit vector of [8 x float].
    466 /// \returns
    467 ///    A 128-bit vector of [8 x bfloat].
    468 ///
    469 /// \code{.operation}
    470 /// FOR j := 0 to 7
    471 ///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
    472 /// ENDFOR
    473 /// dst[MAX:128] := 0
    474 /// \endcode
    475 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
    476 _mm256_cvtneps_avx_pbh(__m256 __A) {
    477   return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
    478 }
    479 
    480 #undef __DEFAULT_FN_ATTRS128
    481 #undef __DEFAULT_FN_ATTRS256
    482 
    483 #endif // __AVXNECONVERTINTRIN_H
    484 #endif // __SSE2__