amxmovrstransposeintrin.h (9401B) - Raw
1 /* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 * ===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error \ 12 "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead." 13 #endif /* __IMMINTRIN_H */ 14 15 #ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H 16 #define __AMX_MOVRS_TRANSPOSEINTRIN_H 17 #ifdef __x86_64__ 18 19 #define __DEFAULT_FN_ATTRS \ 20 __attribute__((__always_inline__, __nodebug__, \ 21 __target__("amx-transpose,amx-movrs"))) 22 23 #define _tile_2rpntlvwz0rs(tdst, base, stride) \ 24 __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) 25 #define _tile_2rpntlvwz0rst1(tdst, base, stride) \ 26 __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) 27 #define _tile_2rpntlvwz1rs(tdst, base, stride) \ 28 __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) 29 #define _tile_2rpntlvwz1rst1(tdst, base, stride) \ 30 __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) 31 32 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( 33 unsigned short row, unsigned short col0, unsigned short col1, 34 _tile1024i *dst0, _tile1024i *dst1, const void *base, 35 __SIZE_TYPE__ stride) { 36 // Use __tile1024i_1024a* to escape the alignment check in 37 // clang/test/Headers/x86-intrinsics-headers-clean.cpp 38 __builtin_ia32_t2rpntlvwz0rs_internal( 39 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, 40 (__SIZE_TYPE__)(stride)); 41 } 42 43 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( 44 unsigned short row, unsigned short col0, unsigned short col1, 45 _tile1024i *dst0, _tile1024i *dst1, const void *base, 46 __SIZE_TYPE__ stride) { 47 __builtin_ia32_t2rpntlvwz0rst1_internal( 48 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, 49 (__SIZE_TYPE__)(stride)); 50 } 51 52 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( 53 unsigned short row, unsigned short col0, unsigned short col1, 54 _tile1024i *dst0, _tile1024i *dst1, const void *base, 55 __SIZE_TYPE__ stride) { 56 __builtin_ia32_t2rpntlvwz1rs_internal( 57 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, 58 (__SIZE_TYPE__)(stride)); 59 } 60 61 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( 62 unsigned short row, unsigned short col0, unsigned short col1, 63 _tile1024i *dst0, _tile1024i *dst1, const void *base, 64 __SIZE_TYPE__ stride) { 65 __builtin_ia32_t2rpntlvwz1rst1_internal( 66 row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, 67 (__SIZE_TYPE__)(stride)); 68 } 69 70 /// Converts a pair of tiles from memory into VNNI format, and places the 71 /// results in a pair of destinations specified by dst. The pair of tiles 72 /// in memory is specified via a tsib; the second tile is after the first 73 /// one, separated by the same stride that separates each row. 74 /// The tile configuration for the destination tiles indicates the amount 75 /// of data to read from memory. The instruction will load a number of rows 76 /// that is equal to twice the number of rows in tmm1. The size of each row 77 /// is equal to the average width of the destination tiles. If the second 78 /// tile is configured with zero rows and columns, only the first tile will 79 /// be written. 80 /// Provides a hint to the implementation that the data will likely become 81 /// read shared in the near future and the data caching can be optimized. 82 /// 83 /// \headerfile <immintrin.h> 84 /// 85 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction. 86 /// 87 /// \param dst0 88 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 89 /// \param dst1 90 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 91 /// \param base 92 /// A pointer to base address. 93 /// \param stride 94 /// The stride between the rows' data to be loaded in memory. 95 __DEFAULT_FN_ATTRS 96 static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, 97 const void *base, __SIZE_TYPE__ stride) { 98 _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 99 &dst1->tile, base, stride); 100 } 101 102 /// Converts a pair of tiles from memory into VNNI format, and places the 103 /// results in a pair of destinations specified by dst. The pair of tiles 104 /// in memory is specified via a tsib; the second tile is after the first 105 /// one, separated by the same stride that separates each row. 106 /// The tile configuration for the destination tiles indicates the amount 107 /// of data to read from memory. The instruction will load a number of rows 108 /// that is equal to twice the number of rows in tmm1. The size of each row 109 /// is equal to the average width of the destination tiles. If the second 110 /// tile is configured with zero rows and columns, only the first tile will 111 /// be written. 112 /// 113 /// \headerfile <immintrin.h> 114 /// 115 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction. 116 /// 117 /// \param dst0 118 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 119 /// \param dst1 120 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 121 /// \param base 122 /// A pointer to base address. 123 /// \param stride 124 /// The stride between the rows' data to be loaded in memory. 125 __DEFAULT_FN_ATTRS 126 static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, 127 const void *base, __SIZE_TYPE__ stride) { 128 _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 129 &dst1->tile, base, stride); 130 } 131 132 /// Converts a pair of tiles from memory into VNNI format, and places the 133 /// results in a pair of destinations specified by dst. The pair of tiles 134 /// in memory is specified via a tsib; the second tile is after the first 135 /// one, separated by the same stride that separates each row. 136 /// The tile configuration for the destination tiles indicates the amount 137 /// of data to read from memory. The instruction will load a number of rows 138 /// that is equal to twice the number of rows in tmm1. The size of each row 139 /// is equal to the average width of the destination tiles. If the second 140 /// tile is configured with zero rows and columns, only the first tile will 141 /// be written. The last row will be not be read from memory but instead 142 /// filled with zeros. 143 /// Provides a hint to the implementation that the data will likely become 144 /// read shared in the near future and the data caching can be optimized. 145 /// 146 /// \headerfile <immintrin.h> 147 /// 148 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. 149 /// 150 /// \param dst0 151 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 152 /// \param dst1 153 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 154 /// \param base 155 /// A pointer to base address. 156 /// \param stride 157 /// The stride between the rows' data to be loaded in memory. 158 __DEFAULT_FN_ATTRS 159 static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, 160 const void *base, __SIZE_TYPE__ stride) { 161 _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 162 &dst1->tile, base, stride); 163 } 164 165 /// Converts a pair of tiles from memory into VNNI format, and places the 166 /// results in a pair of destinations specified by dst. The pair of tiles 167 /// in memory is specified via a tsib; the second tile is after the first 168 /// one, separated by the same stride that separates each row. 169 /// The tile configuration for the destination tiles indicates the amount 170 /// of data to read from memory. The instruction will load a number of rows 171 /// that is equal to twice the number of rows in tmm1. The size of each row 172 /// is equal to the average width of the destination tiles. If the second 173 /// tile is configured with zero rows and columns, only the first tile will 174 /// be written. The last row will be not be read from memory but instead 175 /// filled with zeros. 176 /// Provides a hint to the implementation that the data will likely become 177 /// read shared in the near future and the data caching can be optimized. 178 /// 179 /// \headerfile <immintrin.h> 180 /// 181 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction. 182 /// 183 /// \param dst0 184 /// First tile of destination tile pair. Max size is 1024i*2 Bytes. 185 /// \param dst1 186 /// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 187 /// \param base 188 /// A pointer to base address. 189 /// \param stride 190 /// The stride between the rows' data to be loaded in memory. 191 __DEFAULT_FN_ATTRS 192 static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, 193 const void *base, __SIZE_TYPE__ stride) { 194 _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, 195 &dst1->tile, base, stride); 196 } 197 198 #undef __DEFAULT_FN_ATTRS 199 #endif /* __x86_64__ */ 200 #endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */