zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

amxmovrstransposeintrin.h (9401B) - Raw


      1 /* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  * ===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error                                                                         \
     12     "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
     13 #endif /* __IMMINTRIN_H */
     14 
     15 #ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
     16 #define __AMX_MOVRS_TRANSPOSEINTRIN_H
     17 #ifdef __x86_64__
     18 
     19 #define __DEFAULT_FN_ATTRS                                                     \
     20   __attribute__((__always_inline__, __nodebug__,                               \
     21                  __target__("amx-transpose,amx-movrs")))
     22 
        // __DEFAULT_FN_ATTRS (above) is attached to every function defined in this
        // header: always_inline, nodebug, and the "amx-transpose,amx-movrs" target
        // features. It is #undef'd again before the header ends.
        //
        // Each macro below forwards its three arguments, unchanged and uncast, to
        // the __builtin_ia32_* builtin of the same name.
     23 #define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
     24   __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
     25 #define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
     26   __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
     27 #define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
     28   __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
     29 #define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
     30   __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
     31 
     32 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
     33     unsigned short row, unsigned short col0, unsigned short col1,
     34     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     35     __SIZE_TYPE__ stride) {
          // Thin wrapper: hands row, col0, col1, both destination pointers, base
          // and stride to __builtin_ia32_t2rpntlvwz0rs_internal and returns nothing.
     36   // The destinations are cast to _tile1024i_1024a * to escape the alignment
     37   // check in clang/test/Headers/x86-intrinsics-headers-clean.cpp
     38   __builtin_ia32_t2rpntlvwz0rs_internal(
     39       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
     40       (__SIZE_TYPE__)(stride));
     41 }
     42 
     43 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
     44     unsigned short row, unsigned short col0, unsigned short col1,
     45     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     46     __SIZE_TYPE__ stride) {
          // Thin wrapper: hands row, col0, col1, both destination pointers, base
          // and stride to __builtin_ia32_t2rpntlvwz0rst1_internal.
          // The _tile1024i_1024a * casts mirror _tile_2rpntlvwz0rs_internal
          // (presumably for the same alignment-check reason noted there).
     47   __builtin_ia32_t2rpntlvwz0rst1_internal(
     48       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
     49       (__SIZE_TYPE__)(stride));
     50 }
     51 
     52 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
     53     unsigned short row, unsigned short col0, unsigned short col1,
     54     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     55     __SIZE_TYPE__ stride) {
          // Thin wrapper: hands row, col0, col1, both destination pointers, base
          // and stride to __builtin_ia32_t2rpntlvwz1rs_internal.
          // The _tile1024i_1024a * casts mirror _tile_2rpntlvwz0rs_internal
          // (presumably for the same alignment-check reason noted there).
     56   __builtin_ia32_t2rpntlvwz1rs_internal(
     57       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
     58       (__SIZE_TYPE__)(stride));
     59 }
     60 
     61 static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
     62     unsigned short row, unsigned short col0, unsigned short col1,
     63     _tile1024i *dst0, _tile1024i *dst1, const void *base,
     64     __SIZE_TYPE__ stride) {
          // Thin wrapper: hands row, col0, col1, both destination pointers, base
          // and stride to __builtin_ia32_t2rpntlvwz1rst1_internal.
          // The _tile1024i_1024a * casts mirror _tile_2rpntlvwz0rs_internal
          // (presumably for the same alignment-check reason noted there).
     65   __builtin_ia32_t2rpntlvwz1rst1_internal(
     66       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
     67       (__SIZE_TYPE__)(stride));
     68 }
     69 
     70 /// Converts a pair of tiles from memory into VNNI format, and places the
     71 /// results in a pair of destinations specified by dst. The pair of tiles
     72 /// in memory is specified via a tsib; the second tile is after the first
     73 /// one, separated by the same stride that separates each row.
     74 /// The tile configuration for the destination tiles indicates the amount
     75 /// of data to read from memory. The instruction will load a number of rows
     76 /// that is equal to twice the number of rows in tmm1. The size of each row
     77 /// is equal to the average width of the destination tiles. If the second
     78 /// tile is configured with zero rows and columns, only the first tile will
     79 /// be written.
     80 /// Provides a hint to the implementation that the data will likely become
     81 /// read shared in the near future and the data caching can be optimized.
     82 ///
     83 /// \headerfile <immintrin.h>
     84 ///
     85 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
     86 ///
     87 /// \param dst0
     88 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
     89 /// \param dst1
     90 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
     91 /// \param base
     92 ///    A pointer to base address.
     93 /// \param stride
     94 ///    The stride between the rows' data to be loaded in memory.
     95 __DEFAULT_FN_ATTRS
     96 static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
     97                                 const void *base, __SIZE_TYPE__ stride) {
          // Only dst0->row is passed as the row argument (dst1->row is not read);
          // dst0->col and dst1->col are passed as col0 and col1 respectively.
     98   _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
     99                               &dst1->tile, base, stride);
    100 }
    101 
    102 /// Converts a pair of tiles from memory into VNNI format, and places the
    103 /// results in a pair of destinations specified by dst. The pair of tiles
    104 /// in memory is specified via a tsib; the second tile is after the first
    105 /// one, separated by the same stride that separates each row.
    106 /// The tile configuration for the destination tiles indicates the amount
    107 /// of data to read from memory. The instruction will load a number of rows
    108 /// that is equal to twice the number of rows in tmm1. The size of each row
    109 /// is equal to the average width of the destination tiles. If the second
    110 /// tile is configured with zero rows and columns, only the first tile will
    111 /// be written.
    112 ///
    113 /// \headerfile <immintrin.h>
    114 ///
    115 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0RST1 </c> instruction.
    116 ///
    117 /// \param dst0
    118 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    119 /// \param dst1
    120 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    121 /// \param base
    122 ///    A pointer to base address.
    123 /// \param stride
    124 ///    The stride between the rows' data to be loaded in memory.
    125 __DEFAULT_FN_ATTRS
    126 static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
    127                                   const void *base, __SIZE_TYPE__ stride) {
          // Only dst0->row is passed as the row argument (dst1->row is not read);
          // dst0->col and dst1->col are passed as col0 and col1 respectively.
    128   _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    129                                 &dst1->tile, base, stride);
    130 }
    131 
    132 /// Converts a pair of tiles from memory into VNNI format, and places the
    133 /// results in a pair of destinations specified by dst. The pair of tiles
    134 /// in memory is specified via a tsib; the second tile is after the first
    135 /// one, separated by the same stride that separates each row.
    136 /// The tile configuration for the destination tiles indicates the amount
    137 /// of data to read from memory. The instruction will load a number of rows
    138 /// that is equal to twice the number of rows in tmm1. The size of each row
    139 /// is equal to the average width of the destination tiles. If the second
    140 /// tile is configured with zero rows and columns, only the first tile will
    141 /// be written. The last row will not be read from memory but instead
    142 /// filled with zeros.
    143 /// Provides a hint to the implementation that the data will likely become
    144 /// read shared in the near future and the data caching can be optimized.
    145 ///
    146 /// \headerfile <immintrin.h>
    147 ///
    148 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1RS </c> instruction.
    149 ///
    150 /// \param dst0
    151 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    152 /// \param dst1
    153 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    154 /// \param base
    155 ///    A pointer to base address.
    156 /// \param stride
    157 ///    The stride between the rows' data to be loaded in memory.
    158 __DEFAULT_FN_ATTRS
    159 static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
    160                                 const void *base, __SIZE_TYPE__ stride) {
          // Only dst0->row is passed as the row argument (dst1->row is not read);
          // dst0->col and dst1->col are passed as col0 and col1 respectively.
    161   _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    162                               &dst1->tile, base, stride);
    163 }
    164 
    165 /// Converts a pair of tiles from memory into VNNI format, and places the
    166 /// results in a pair of destinations specified by dst. The pair of tiles
    167 /// in memory is specified via a tsib; the second tile is after the first
    168 /// one, separated by the same stride that separates each row.
    169 /// The tile configuration for the destination tiles indicates the amount
    170 /// of data to read from memory. The instruction will load a number of rows
    171 /// that is equal to twice the number of rows in tmm1. The size of each row
    172 /// is equal to the average width of the destination tiles. If the second
    173 /// tile is configured with zero rows and columns, only the first tile will
    174 /// be written. The last row will not be read from memory but instead
    175 /// filled with zeros.
    176 /// Provides a hint to the implementation that the data will likely become
    177 /// read shared in the near future and the data caching can be optimized.
    178 ///
    179 /// \headerfile <immintrin.h>
    180 ///
    181 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1RST1 </c> instruction.
    182 ///
    183 /// \param dst0
    184 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    185 /// \param dst1
    186 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    187 /// \param base
    188 ///    A pointer to base address.
    189 /// \param stride
    190 ///    The stride between the rows' data to be loaded in memory.
    191 __DEFAULT_FN_ATTRS
    192 static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
    193                                   const void *base, __SIZE_TYPE__ stride) {
          // Only dst0->row is passed as the row argument (dst1->row is not read);
          // dst0->col and dst1->col are passed as col0 and col1 respectively.
    194   _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    195                                 &dst1->tile, base, stride);
    196 }
    197 
    198 #undef __DEFAULT_FN_ATTRS
    199 #endif /* __x86_64__ */
    200 #endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */