zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

amxavx512intrin.h (12975B) - Raw


      1 /*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===------------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
     11 #endif // __IMMINTRIN_H
     12 
     13 #ifndef __AMX_AVX512INTRIN_H
     14 #define __AMX_AVX512INTRIN_H
     15 #if defined(__x86_64__) && defined(__SSE2__)
     16 // Function attributes: always_inline, nodebug, target amx-avx512,avx10.2-512.
     17 #define __DEFAULT_FN_ATTRS_AVX512                                              \
     18   __attribute__((__always_inline__, __nodebug__,                               \
     19                  __target__("amx-avx512,avx10.2-512")))
     20 
     21 /// Moves a row from a tile register to a zmm destination register, converting
     22 ///    the int32 source elements to fp32. The row of the tile is selected by a
     23 ///    32b GPR.
     24 ///
     25 /// \headerfile <immintrin.h>
     26 ///
     27 /// \code
     28 /// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
     29 /// \endcode
     30 ///
     31 /// \code{.operation}
     32 /// VL := 512
     33 /// VL_bytes := VL >> 3
     34 /// row_index := row & 0xffff
     35 /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
     36 /// FOR i := 0 TO (VL_bytes / 4) - 1
     37 ///     IF i + row_chunk / 4 >= tsrc.colsb / 4
     38 ///         dst.dword[i] := 0
     39 ///     ELSE
     40 ///         dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
     41 ///     FI
     42 /// ENDFOR
     43 /// dst[MAX_VL-1:VL] := 0
     44 /// zero_tileconfig_start()
     45 /// \endcode
     46 ///
     47 /// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
     48 ///
     49 /// \param tsrc
     50 ///    The source tile. Max size is 1024 Bytes.
     51 /// \param row
     52 ///    Bits 15:0 select the tile row; bits 31:16 select the 64-byte chunk.
     53 #define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
     54 
     55 /// Moves a row from a tile register to a zmm destination register, converting
     56 ///    the fp32 source elements to bf16. It places the resulting bf16 elements
     57 ///    in the high 16 bits within each dword. The row of the tile is selected
     58 ///    by a 32b GPR.
     59 ///
     60 /// \headerfile <immintrin.h>
     61 ///
     62 /// \code
     63 /// __m512bh _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
     64 /// \endcode
     65 ///
     66 /// \code{.operation}
     67 /// VL := 512
     68 /// VL_bytes := VL >> 3
     69 /// row_index := row & 0xffff
     70 /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
     71 /// FOR i := 0 TO (VL_bytes / 4) - 1
     72 ///     IF i + row_chunk / 4 >= tsrc.colsb / 4
     73 ///         dst.dword[i] := 0
     74 ///     ELSE
     75 ///         dst.word[2*i+0] := 0
     76 ///         dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
     77 ///     FI
     78 /// ENDFOR
     79 /// dst[MAX_VL-1:VL] := 0
     80 /// zero_tileconfig_start()
     81 /// \endcode
     82 ///
     83 /// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
     84 ///
     85 /// \param tsrc
     86 ///    The source tile. Max size is 1024 Bytes.
     87 /// \param row
     88 ///    Bits 15:0 select the tile row; bits 31:16 select the 64-byte chunk.
     89 #define _tile_cvtrowps2bf16h(tsrc, row)                                        \
     90   __builtin_ia32_tcvtrowps2bf16h(tsrc, row)
     91 
     92 /// Moves a row from a tile register to a zmm destination register, converting
     93 ///    the fp32 source elements to bf16. It places the resulting bf16 elements
     94 ///    in the low 16 bits within each dword. The row of the tile is selected
     95 ///    by a 32b GPR.
     96 ///
     97 /// \headerfile <immintrin.h>
     98 ///
     99 /// \code
    100 /// __m512bh _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
    101 /// \endcode
    102 ///
    103 /// \code{.operation}
    104 /// VL := 512
    105 /// VL_bytes := VL >> 3
    106 /// row_index := row & 0xffff
    107 /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
    108 /// FOR i := 0 TO (VL_bytes / 4) - 1
    109 ///     IF i + row_chunk / 4 >= tsrc.colsb / 4
    110 ///         dst.dword[i] := 0
    111 ///     ELSE
    112 ///         dst.word[2*i+1] := 0
    113 ///         dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
    114 ///     FI
    115 /// ENDFOR
    116 /// dst[MAX_VL-1:VL] := 0
    117 /// zero_tileconfig_start()
    118 /// \endcode
    119 ///
    120 /// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
    121 ///
    122 /// \param tsrc
    123 ///    The source tile. Max size is 1024 Bytes.
    124 /// \param row
    125 ///    Bits 15:0 select the tile row; bits 31:16 select the 64-byte chunk.
    126 #define _tile_cvtrowps2bf16l(tsrc, row)                                        \
    127   __builtin_ia32_tcvtrowps2bf16l(tsrc, row)
    128 
    129 /// Moves a row from a tile register to a zmm destination register, converting
    130 ///    the fp32 source elements to fp16. It places the resulting fp16 elements
    131 ///    in the high 16 bits within each dword. The row of the tile is selected
    132 ///    by a 32b GPR.
    133 ///
    134 /// \headerfile <immintrin.h>
    135 ///
    136 /// \code
    137 /// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
    138 /// \endcode
    139 ///
    140 /// \code{.operation}
    141 /// VL := 512
    142 /// VL_bytes := VL >> 3
    143 /// row_index := row & 0xffff
    144 /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
    145 /// FOR i := 0 TO (VL_bytes / 4) - 1
    146 ///     IF i + row_chunk / 4 >= tsrc.colsb / 4
    147 ///         dst.dword[i] := 0
    148 ///     ELSE
    149 ///         dst.word[2*i+0] := 0
    150 ///         dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
    151 ///     FI
    152 /// ENDFOR
    153 /// dst[MAX_VL-1:VL] := 0
    154 /// zero_tileconfig_start()
    155 /// \endcode
    156 ///
    157 /// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
    158 ///
    159 /// \param tsrc
    160 ///    The source tile. Max size is 1024 Bytes.
    161 /// \param row
    162 ///    Bits 15:0 select the tile row; bits 31:16 select the 64-byte chunk.
    163 #define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
    164 
    165 /// Moves a row from a tile register to a zmm destination register, converting
    166 ///    the fp32 source elements to fp16. It places the resulting fp16 elements
    167 ///    in the low 16 bits within each dword. The row of the tile is selected
    168 ///    by a 32b GPR.
    169 ///
    170 /// \headerfile <immintrin.h>
    171 ///
    172 /// \code
    173 /// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
    174 /// \endcode
    175 ///
    176 /// \code{.operation}
    177 /// VL := 512
    178 /// VL_bytes := VL >> 3
    179 /// row_index := row & 0xffff
    180 /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
    181 /// FOR i := 0 TO (VL_bytes / 4) - 1
    182 ///     IF i + row_chunk / 4 >= tsrc.colsb / 4
    183 ///         dst.dword[i] := 0
    184 ///     ELSE
    185 ///         dst.word[2*i+1] := 0
    186 ///         dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
    187 ///     FI
    188 /// ENDFOR
    189 /// dst[MAX_VL-1:VL] := 0
    190 /// zero_tileconfig_start()
    191 /// \endcode
    192 ///
    193 /// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
    194 ///
    195 /// \param tsrc
    196 ///    The source tile. Max size is 1024 Bytes.
    197 /// \param row
    198 ///    Bits 15:0 select the tile row; bits 31:16 select the 64-byte chunk.
    199 #define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
    200 
    201 /// Moves one row of tile data to a v16i32 vector.
    202 /// The row of the tile is selected by a 32b GPR.
    203 ///
    204 /// \headerfile <immintrin.h>
    205 ///
    206 /// \code
    207 /// __m512i _tile_movrow(__tile a, unsigned b);
    208 /// \endcode
    209 ///
    210 /// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
    211 ///
    212 /// \param a
    213 ///     The 1st source tile. Max size is 1024 Bytes.
    214 /// \param b
    215 ///     The 2nd source r32. Size is 4 Bytes.
    216 /// \returns
    217 ///     The destination v16i32 data (as in __tile_movrow). Size is 64 Bytes.
    218 ///
    219 /// \code{.operation}
    220 /// VL := 512
    221 /// VL_bytes := VL>>3
    222 /// row_index := b&0xffff
    223 /// row_chunk := ((b>>16)&0xffff) * VL_bytes
    224 /// FOR i := 0 TO (VL_bytes-1)
    225 ///     IF (row_chunk + i >= a.colsb)
    226 ///             dst.byte[i] := 0
    227 ///     ELSE
    228 ///             dst.byte[i] := a.row[row_index].byte[row_chunk+i]
    229 /// ENDFOR
    230 /// \endcode
    231 #define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)
    232 
    233 /// These are internal intrinsics. C/C++ users should avoid calling them directly.
    234 /// TCVTROWD2PS helper: passes m, n, src and u straight to the internal builtin.
    235 static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
    236     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
    237   return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
    238 }
    239 /// TCVTROWPS2BF16H helper: passes m, n, src and u to the internal builtin.
    240 static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
    241 _tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
    242                               _tile1024i src, unsigned u) {
    243   return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
    244 }
    245 /// TCVTROWPS2BF16L helper: passes m, n, src and u to the internal builtin.
    246 static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
    247 _tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
    248                               _tile1024i src, unsigned u) {
    249   return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
    250 }
    251 /// TCVTROWPS2PHH helper: passes m, n, src and u to the internal builtin.
    252 static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
    253     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
    254   return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
    255 }
    256 /// TCVTROWPS2PHL helper: passes m, n, src and u to the internal builtin.
    257 static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
    258     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
    259   return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
    260 }
    261 /// TILEMOVROW helper: calls the internal builtin and casts its result to __m512i.
    262 static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
    263     unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
    264   return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
    265 }
    266 
    267 /// Converts one row of int32 elements from tile \p src0 to fp32 and returns
    268 /// it as a v16f32 vector. The conversion raises no SIMD exceptions, rounds
    269 /// as if MXCSR.RC=RNE, and does not support embedded rounding.
    270 /// The row and chunk of the tile to read are both taken from 32-bit \p src1.
    271 ///
    272 /// \headerfile <immintrin.h>
    273 ///
    274 /// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
    275 ///
    276 /// \param src0
    277 ///    Source tile; its row, col and tile fields are used. Max 1024 Bytes.
    278 /// \param src1
    279 ///    32-bit row/chunk selector (the r32 operand). Size is 4 Bytes.
    280 /// \returns
    281 ///    The converted row as v16f32 data. Size is 64 Bytes.
    282 static __m512 __DEFAULT_FN_ATTRS_AVX512
    283 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
    284   return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
    285 }
    286 
    287 /// Converts one row of fp32 elements from tile \p src0 to bf16 and returns it
    288 /// as a v32bf16 vector, with each result in the high 16 bits of its dword.
    289 /// The row and chunk of the tile to read are both taken from 32-bit \p src1.
    290 ///
    291 /// \headerfile <immintrin.h>
    292 ///
    293 /// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
    294 ///
    295 /// \param src0
    296 ///    Source tile; its row, col and tile fields are used. Max 1024 Bytes.
    297 /// \param src1
    298 ///    32-bit row/chunk selector (the r32 operand). Size is 4 Bytes.
    299 /// \returns
    300 ///    The converted row as v32bf16 data. Size is 64 Bytes.
    301 static __m512bh __DEFAULT_FN_ATTRS_AVX512
    302 __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
    303   return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
    304 }
    305 
    306 /// Converts one row of fp32 elements from tile \p src0 to bf16 and returns it
    307 /// as a v32bf16 vector, with each result in the low 16 bits of its dword.
    308 /// The row and chunk of the tile to read are both taken from 32-bit \p src1.
    309 ///
    310 /// \headerfile <immintrin.h>
    311 ///
    312 /// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
    313 ///
    314 /// \param src0
    315 ///    Source tile; its row, col and tile fields are used. Max 1024 Bytes.
    316 /// \param src1
    317 ///    32-bit row/chunk selector (the r32 operand). Size is 4 Bytes.
    318 /// \returns
    319 ///    The converted row as v32bf16 data. Size is 64 Bytes.
    320 static __m512bh __DEFAULT_FN_ATTRS_AVX512
    321 __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
    322   return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
    323 }
    324 
    325 /// Converts one row of fp32 elements from tile \p src0 to fp16 and returns it
    326 /// as a v32fp16 vector, with each result in the high 16 bits of its dword.
    327 /// The row and chunk of the tile to read are both taken from 32-bit \p src1.
    328 ///
    329 /// \headerfile <immintrin.h>
    330 ///
    331 /// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
    332 ///
    333 /// \param src0
    334 ///    Source tile; its row, col and tile fields are used. Max 1024 Bytes.
    335 /// \param src1
    336 ///    32-bit row/chunk selector (the r32 operand). Size is 4 Bytes.
    337 /// \returns
    338 ///    The converted row as v32fp16 data. Size is 64 Bytes.
    339 static __m512h __DEFAULT_FN_ATTRS_AVX512
    340 __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
    341   return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
    342 }
    343 
    344 /// Converts one row of fp32 elements from tile \p src0 to fp16 and returns it
    345 /// as a v32fp16 vector, with each result in the low 16 bits of its dword.
    346 /// The row and chunk of the tile to read are both taken from 32-bit \p src1.
    347 ///
    348 /// \headerfile <immintrin.h>
    349 ///
    350 /// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
    351 ///
    352 /// \param src0
    353 ///    Source tile; its row, col and tile fields are used. Max 1024 Bytes.
    354 /// \param src1
    355 ///    32-bit row/chunk selector (the r32 operand). Size is 4 Bytes.
    356 /// \returns
    357 ///    The converted row as v32fp16 data. Size is 64 Bytes.
    358 static __m512h __DEFAULT_FN_ATTRS_AVX512
    359 __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
    360   return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
    361 }
    362 
    363 /// Moves one row of tile data from \p src0 into a v16i32 vector.
    364 /// The row of the tile is selected by a 32b GPR (\p src1).
    365 ///
    366 /// \headerfile <immintrin.h>
    367 ///
    368 /// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
    369 ///
    370 /// \param src0
    371 ///    The 1st source tile. Max size is 1024 Bytes.
    372 /// \param src1
    373 ///    The 2nd source r32. Size is 4 Bytes.
    374 /// \returns
    375 ///    The destination v16i32 data. Size is 64 Bytes.
    376 __DEFAULT_FN_ATTRS_AVX512
    377 static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
    378   return _tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
    379 }
    380 
    381 #endif // __x86_64__ && __SSE2__
    382 #endif // __AMX_AVX512INTRIN_H