zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

amxtransposeintrin.h (10853B) - Raw


      1 /* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  * ===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
     12 #endif /* __IMMINTRIN_H */
     13 
     14 #ifndef __AMX_TRANSPOSEINTRIN_H
     15 #define __AMX_TRANSPOSEINTRIN_H
     16 #ifdef __x86_64__
     17 
     18 #define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
     19   __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
     20 
     21 #define _tile_2rpntlvwz0(tdst, base, stride)                                   \
     22   __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
     23 #define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
     24   __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
     25 #define _tile_2rpntlvwz1(tdst, base, stride)                                   \
     26   __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
     27 #define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
     28   __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
     29 
     30 /// Transpose 32-bit elements from \a src and write the result to \a dst.
     31 ///
     32 /// \headerfile <immintrin.h>
     33 ///
     34 /// \code
     35 /// void _tile_transposed(__tile dst, __tile src);
     36 /// \endcode
     37 ///
     38 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
     39 ///
     40 /// \param dst
     41 /// 	The destination tile. Max size is 1024 Bytes.
     42 /// \param src
     43 /// 	The source tile. Max size is 1024 Bytes.
     44 ///
     45 /// \code{.operation}
     46 ///
     47 /// FOR i := 0 TO (dst.rows-1)
     48 /// 	tmp[511:0] := 0
     49 /// 	FOR j := 0 TO (dst.colsb/4-1)
     50 /// 		tmp.dword[j] := src.row[j].dword[i]
     51 /// 	ENDFOR
     52 /// 	dst.row[i] := tmp
     53 /// ENDFOR
     54 ///
     55 /// zero_upper_rows(dst, dst.rows)
     56 /// zero_tileconfig_start()
     57 /// \endcode
     58 #define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
     59 
     60 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
     61     unsigned short row, unsigned short col0, unsigned short col1,
     62     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     63     __SIZE_TYPE__ stride) {
     64   // Use __tile1024i_1024a* to escape the alignment check in
     65   // clang/test/Headers/x86-intrinsics-headers-clean.cpp
     66   __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
     67                                       (_tile1024i_1024a *)dst1, base,
     68                                       (__SIZE_TYPE__)(stride));
     69 }
     70 
     71 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
     72     unsigned short row, unsigned short col0, unsigned short col1,
     73     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     74     __SIZE_TYPE__ stride) {
     75   __builtin_ia32_t2rpntlvwz0t1_internal(
     76       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
     77       (__SIZE_TYPE__)(stride));
     78 }
     79 
     80 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
     81     unsigned short row, unsigned short col0, unsigned short col1,
     82     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     83     __SIZE_TYPE__ stride) {
     84   __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
     85                                       (_tile1024i_1024a *)dst1, base,
     86                                       (__SIZE_TYPE__)(stride));
     87 }
     88 
     89 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
     90     unsigned short row, unsigned short col0, unsigned short col1,
     91     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     92     __SIZE_TYPE__ stride) {
     93   __builtin_ia32_t2rpntlvwz1t1_internal(
     94       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
     95       (__SIZE_TYPE__)(stride));
     96 }
     97 
     98 // This is internal intrinsic. C/C++ user should avoid calling it directly.
     99 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
    100 _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
    101   return __builtin_ia32_ttransposed_internal(m, n, src);
    102 }
    103 
    104 /// Converts a pair of tiles from memory into VNNI format, and places the
    105 /// results in a pair of destinations specified by dst. The pair of tiles
    106 /// in memory is specified via a tsib; the second tile is after the first
    107 /// one, separated by the same stride that separates each row.
    108 /// The tile configuration for the destination tiles indicates the amount
    109 /// of data to read from memory. The instruction will load a number of rows
    110 /// that is equal to twice the number of rows in tmm1. The size of each row
    111 /// is equal to the average width of the destination tiles. If the second
    112 /// tile is configured with zero rows and columns, only the first tile will
    113 /// be written.
    114 /// Provides a hint to the implementation that the data will likely not be
    115 /// reused in the near future and the data caching can be optimized.
    116 ///
    117 /// \headerfile <immintrin.h>
    118 ///
    119 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
    120 ///
    121 /// \param dst0
    122 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    123 /// \param dst1
    124 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    125 /// \param base
    126 ///    A pointer to base address.
    127 /// \param stride
    128 ///    The stride between the rows' data to be loaded in memory.
    129 __DEFAULT_FN_ATTRS_TRANSPOSE
    130 static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
    131                               const void *base, __SIZE_TYPE__ stride) {
    132   _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    133                             &dst1->tile, base, stride);
    134 }
    135 
    136 /// Converts a pair of tiles from memory into VNNI format, and places the
    137 /// results in a pair of destinations specified by dst. The pair of tiles
    138 /// in memory is specified via a tsib; the second tile is after the first
    139 /// one, separated by the same stride that separates each row.
    140 /// The tile configuration for the destination tiles indicates the amount
    141 /// of data to read from memory. The instruction will load a number of rows
    142 /// that is equal to twice the number of rows in tmm1. The size of each row
    143 /// is equal to the average width of the destination tiles. If the second
    144 /// tile is configured with zero rows and columns, only the first tile will
    145 /// be written.
    146 ///
    147 /// \headerfile <immintrin.h>
    148 ///
    149 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
    150 ///
    151 /// \param dst0
    152 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    153 /// \param dst1
    154 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    155 /// \param base
    156 ///    A pointer to base address.
    157 /// \param stride
    158 ///    The stride between the rows' data to be loaded in memory.
    159 __DEFAULT_FN_ATTRS_TRANSPOSE
    160 static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
    161                                 const void *base, __SIZE_TYPE__ stride) {
    162   _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    163                               &dst1->tile, base, stride);
    164 }
    165 
    166 /// Converts a pair of tiles from memory into VNNI format, and places the
    167 /// results in a pair of destinations specified by dst. The pair of tiles
    168 /// in memory is specified via a tsib; the second tile is after the first
    169 /// one, separated by the same stride that separates each row.
    170 /// The tile configuration for the destination tiles indicates the amount
    171 /// of data to read from memory. The instruction will load a number of rows
    172 /// that is equal to twice the number of rows in tmm1. The size of each row
    173 /// is equal to the average width of the destination tiles. If the second
    174 /// tile is configured with zero rows and columns, only the first tile will
    175 /// be written. The last row will be not be read from memory but instead
    176 /// filled with zeros.
    177 /// Provides a hint to the implementation that the data will likely not be
    178 /// reused in the near future and the data caching can be optimized.
    179 ///
    180 /// \headerfile <immintrin.h>
    181 ///
    182 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
    183 ///
    184 /// \param dst0
    185 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    186 /// \param dst1
    187 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    188 /// \param base
    189 ///    A pointer to base address.
    190 /// \param stride
    191 ///    The stride between the rows' data to be loaded in memory.
    192 __DEFAULT_FN_ATTRS_TRANSPOSE
    193 static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
    194                               const void *base, __SIZE_TYPE__ stride) {
    195   _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    196                             &dst1->tile, base, stride);
    197 }
    198 
    199 /// Converts a pair of tiles from memory into VNNI format, and places the
    200 /// results in a pair of destinations specified by dst. The pair of tiles
    201 /// in memory is specified via a tsib; the second tile is after the first
    202 /// one, separated by the same stride that separates each row.
    203 /// The tile configuration for the destination tiles indicates the amount
    204 /// of data to read from memory. The instruction will load a number of rows
    205 /// that is equal to twice the number of rows in tmm1. The size of each row
    206 /// is equal to the average width of the destination tiles. If the second
    207 /// tile is configured with zero rows and columns, only the first tile will
    208 /// be written. The last row will be not be read from memory but instead
    209 /// filled with zeros.
    210 /// Provides a hint to the implementation that the data will likely not be
    211 /// reused in the near future and the data caching can be optimized.
    212 ///
    213 /// \headerfile <immintrin.h>
    214 ///
    215 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
    216 ///
    217 /// \param dst0
    218 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    219 /// \param dst1
    220 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    221 /// \param base
    222 ///    A pointer to base address.
    223 /// \param stride
    224 ///    The stride between the rows' data to be loaded in memory.
    225 __DEFAULT_FN_ATTRS_TRANSPOSE
    226 static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
    227                                 const void *base, __SIZE_TYPE__ stride) {
    228   _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    229                               &dst1->tile, base, stride);
    230 }
    231 
    232 /// Transpose 32-bit elements from src and write the result to dst.
    233 ///
    234 /// \headerfile <immintrin.h>
    235 ///
    236 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
    237 ///
    238 /// \param dst
    239 ///    The destination tile. Max size is 1024 Bytes.
    240 /// \param src
    241 ///    The source tile. Max size is 1024 Bytes.
    242 __DEFAULT_FN_ATTRS_TRANSPOSE
    243 static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
    244   dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
    245 }
    246 
    247 #endif /* __x86_64__ */
    248 #endif /* __AMX_TRANSPOSEINTRIN_H */