amxtransposeintrin.h (10853B) - Raw
1 /* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 * ===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead." 12 #endif /* __IMMINTRIN_H */ 13 14 #ifndef __AMX_TRANSPOSEINTRIN_H 15 #define __AMX_TRANSPOSEINTRIN_H 16 #ifdef __x86_64__ 17 18 #define __DEFAULT_FN_ATTRS_TRANSPOSE \ 19 __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) 20 21 #define _tile_2rpntlvwz0(tdst, base, stride) \ 22 __builtin_ia32_t2rpntlvwz0(tdst, base, stride) 23 #define _tile_2rpntlvwz0t1(tdst, base, stride) \ 24 __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) 25 #define _tile_2rpntlvwz1(tdst, base, stride) \ 26 __builtin_ia32_t2rpntlvwz1(tdst, base, stride) 27 #define _tile_2rpntlvwz1t1(tdst, base, stride) \ 28 __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) 29 30 /// Transpose 32-bit elements from \a src and write the result to \a dst. 31 /// 32 /// \headerfile <immintrin.h> 33 /// 34 /// \code 35 /// void _tile_transposed(__tile dst, __tile src); 36 /// \endcode 37 /// 38 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. 39 /// 40 /// \param dst 41 /// The destination tile. Max size is 1024 Bytes. 42 /// \param src 43 /// The source tile. Max size is 1024 Bytes. 44 /// 45 /// \code{.operation} 46 /// 47 /// FOR i := 0 TO (dst.rows-1) 48 /// tmp[511:0] := 0 49 /// FOR j := 0 TO (dst.colsb/4-1) 50 /// tmp.dword[j] := src.row[j].dword[i] 51 /// ENDFOR 52 /// dst.row[i] := tmp 53 /// ENDFOR 54 /// 55 /// zero_upper_rows(dst, dst.rows) 56 /// zero_tileconfig_start() 57 /// \endcode 58 #define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) 59 60 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( 61 unsigned short row, unsigned short col0, unsigned short col1, 62 _tile1024i *dst0, _tile1024i *dst1, const void *base, 63 __SIZE_TYPE__ stride) { 64 // Use __tile1024i_1024a* to escape the alignment check in 65 // clang/test/Headers/x86-intrinsics-headers-clean.cpp 66 __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, 67 (_tile1024i_1024a *)dst1, base, 68 (__SIZE_TYPE__)(stride)); 69 } 70 71 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( 72 unsigned short row, unsigned short col0, unsigned short col1, 73 _tile1024i *dst0, _tile1024i *dst1, const void *base, 74 __SIZE_TYPE__ stride) { 75 __builtin_ia32_t2rpntlvwz0t1_internal( 76 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, 77 (__SIZE_TYPE__)(stride)); 78 } 79 80 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( 81 unsigned short row, unsigned short col0, unsigned short col1, 82 _tile1024i *dst0, _tile1024i *dst1, const void *base, 83 __SIZE_TYPE__ stride) { 84 __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, 85 (_tile1024i_1024a *)dst1, base, 86 (__SIZE_TYPE__)(stride)); 87 } 88 89 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( 90 unsigned short row, unsigned short col0, unsigned short col1, 91 _tile1024i *dst0, _tile1024i *dst1, const void *base, 92 __SIZE_TYPE__ stride) { 93 __builtin_ia32_t2rpntlvwz1t1_internal( 94 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, 95 (__SIZE_TYPE__)(stride)); 96 } 97 98 // This is internal intrinsic. C/C++ user should avoid calling it directly. 99 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE 100 _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { 101 return __builtin_ia32_ttransposed_internal(m, n, src); 102 } 103 104 /// Converts a pair of tiles from memory into VNNI format, and places the 105 /// results in a pair of destinations specified by dst. The pair of tiles 106 /// in memory is specified via a tsib; the second tile is after the first 107 /// one, separated by the same stride that separates each row. 108 /// The tile configuration for the destination tiles indicates the amount 109 /// of data to read from memory. The instruction will load a number of rows 110 /// that is equal to twice the number of rows in tmm1. The size of each row 111 /// is equal to the average width of the destination tiles. If the second 112 /// tile is configured with zero rows and columns, only the first tile will 113 /// be written. 114 /// Provides a hint to the implementation that the data will likely not be 115 /// reused in the near future and the data caching can be optimized. 116 /// 117 /// \headerfile <immintrin.h> 118 /// 119 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction. 120 /// 121 /// \param dst0 122 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 123 /// \param dst1 124 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 125 /// \param base 126 /// A pointer to base address. 127 /// \param stride 128 /// The stride between the rows' data to be loaded in memory. 129 __DEFAULT_FN_ATTRS_TRANSPOSE 130 static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, 131 const void *base, __SIZE_TYPE__ stride) { 132 _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 133 &dst1->tile, base, stride); 134 } 135 136 /// Converts a pair of tiles from memory into VNNI format, and places the 137 /// results in a pair of destinations specified by dst. The pair of tiles 138 /// in memory is specified via a tsib; the second tile is after the first 139 /// one, separated by the same stride that separates each row. 140 /// The tile configuration for the destination tiles indicates the amount 141 /// of data to read from memory. The instruction will load a number of rows 142 /// that is equal to twice the number of rows in tmm1. The size of each row 143 /// is equal to the average width of the destination tiles. If the second 144 /// tile is configured with zero rows and columns, only the first tile will 145 /// be written. 146 /// 147 /// \headerfile <immintrin.h> 148 /// 149 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction. 150 /// 151 /// \param dst0 152 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 153 /// \param dst1 154 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 155 /// \param base 156 /// A pointer to base address. 157 /// \param stride 158 /// The stride between the rows' data to be loaded in memory. 159 __DEFAULT_FN_ATTRS_TRANSPOSE 160 static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, 161 const void *base, __SIZE_TYPE__ stride) { 162 _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 163 &dst1->tile, base, stride); 164 } 165 166 /// Converts a pair of tiles from memory into VNNI format, and places the 167 /// results in a pair of destinations specified by dst. The pair of tiles 168 /// in memory is specified via a tsib; the second tile is after the first 169 /// one, separated by the same stride that separates each row. 170 /// The tile configuration for the destination tiles indicates the amount 171 /// of data to read from memory. The instruction will load a number of rows 172 /// that is equal to twice the number of rows in tmm1. The size of each row 173 /// is equal to the average width of the destination tiles. If the second 174 /// tile is configured with zero rows and columns, only the first tile will 175 /// be written. The last row will be not be read from memory but instead 176 /// filled with zeros. 177 /// Provides a hint to the implementation that the data will likely not be 178 /// reused in the near future and the data caching can be optimized. 179 /// 180 /// \headerfile <immintrin.h> 181 /// 182 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. 183 /// 184 /// \param dst0 185 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 186 /// \param dst1 187 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 188 /// \param base 189 /// A pointer to base address. 190 /// \param stride 191 /// The stride between the rows' data to be loaded in memory. 192 __DEFAULT_FN_ATTRS_TRANSPOSE 193 static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, 194 const void *base, __SIZE_TYPE__ stride) { 195 _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 196 &dst1->tile, base, stride); 197 } 198 199 /// Converts a pair of tiles from memory into VNNI format, and places the 200 /// results in a pair of destinations specified by dst. The pair of tiles 201 /// in memory is specified via a tsib; the second tile is after the first 202 /// one, separated by the same stride that separates each row. 203 /// The tile configuration for the destination tiles indicates the amount 204 /// of data to read from memory. The instruction will load a number of rows 205 /// that is equal to twice the number of rows in tmm1. The size of each row 206 /// is equal to the average width of the destination tiles. If the second 207 /// tile is configured with zero rows and columns, only the first tile will 208 /// be written. The last row will be not be read from memory but instead 209 /// filled with zeros. 210 /// Provides a hint to the implementation that the data will likely not be 211 /// reused in the near future and the data caching can be optimized. 212 /// 213 /// \headerfile <immintrin.h> 214 /// 215 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction. 216 /// 217 /// \param dst0 218 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 219 /// \param dst1 220 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 221 /// \param base 222 /// A pointer to base address. 223 /// \param stride 224 /// The stride between the rows' data to be loaded in memory. 225 __DEFAULT_FN_ATTRS_TRANSPOSE 226 static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, 227 const void *base, __SIZE_TYPE__ stride) { 228 _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 229 &dst1->tile, base, stride); 230 } 231 232 /// Transpose 32-bit elements from src and write the result to dst. 233 /// 234 /// \headerfile <immintrin.h> 235 /// 236 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. 237 /// 238 /// \param dst 239 /// The destination tile. Max size is 1024 Bytes. 240 /// \param src 241 /// The source tile. Max size is 1024 Bytes. 242 __DEFAULT_FN_ATTRS_TRANSPOSE 243 static void __tile_transposed(__tile1024i *dst, __tile1024i src) { 244 dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); 245 } 246 247 #endif /* __x86_64__ */ 248 #endif /* __AMX_TRANSPOSEINTRIN_H */