zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avxifmaintrin.h (5886B) - Raw


      1 /*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
     12 #endif
     13 
     14 #ifndef __AVXIFMAINTRIN_H
     15 #define __AVXIFMAINTRIN_H
     16 
     17 /* Define the default attributes for the functions in this file. */
     18 #define __DEFAULT_FN_ATTRS128                                                  \
     19   __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
     20                  __min_vector_width__(128)))
     21 #define __DEFAULT_FN_ATTRS256                                                  \
     22   __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
     23                  __min_vector_width__(256)))
     24 
// The intrinsics below must use VEX encoding.
     26 
     27 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
     28 /// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
     29 /// unsigned integer from the intermediate result with the corresponding
     30 /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
     31 ///
     32 /// \headerfile <immintrin.h>
     33 ///
     34 /// \code
     35 /// __m128i
     36 /// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
     37 /// \endcode
     38 ///
     39 /// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
     40 ///
     41 /// \return
     42 /// 	return __m128i dst.
     43 /// \param __X
     44 /// 	A 128-bit vector of [2 x i64]
     45 /// \param __Y
     46 /// 	A 128-bit vector of [2 x i64]
     47 /// \param __Z
     48 /// 	A 128-bit vector of [2 x i64]
     49 ///
     50 /// \code{.operation}
     51 /// FOR j := 0 to 1
     52 /// 	i := j*64
     53 /// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
     54 /// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
     55 /// ENDFOR
     56 /// dst[MAX:128] := 0
     57 /// \endcode
     58 static __inline__ __m128i __DEFAULT_FN_ATTRS128
     59 _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
     60   return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
     61                                                 (__v2di)__Z);
     62 }
     63 
     64 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
     65 /// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
     66 /// unsigned integer from the intermediate result with the corresponding
     67 /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
     68 ///
     69 /// \headerfile <immintrin.h>
     70 ///
     71 /// \code
     72 /// __m256i
     73 /// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
     74 /// \endcode
     75 ///
     76 /// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
     77 ///
     78 /// \return
     79 /// 	return __m256i dst.
     80 /// \param __X
     81 /// 	A 256-bit vector of [4 x i64]
     82 /// \param __Y
     83 /// 	A 256-bit vector of [4 x i64]
     84 /// \param __Z
     85 /// 	A 256-bit vector of [4 x i64]
     86 ///
     87 /// \code{.operation}
     88 /// FOR j := 0 to 3
     89 /// 	i := j*64
     90 /// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
     91 /// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
     92 /// ENDFOR
     93 /// dst[MAX:256] := 0
     94 /// \endcode
     95 static __inline__ __m256i __DEFAULT_FN_ATTRS256
     96 _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
     97   return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
     98                                                 (__v4di)__Z);
     99 }
    100 
    101 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
    102 /// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
    103 /// unsigned integer from the intermediate result with the corresponding
    104 /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
    105 ///
    106 /// \headerfile <immintrin.h>
    107 ///
    108 /// \code
    109 /// __m128i
    110 /// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
    111 /// \endcode
    112 ///
    113 /// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
    114 ///
    115 /// \return
    116 /// 	return __m128i dst.
    117 /// \param __X
    118 /// 	A 128-bit vector of [2 x i64]
    119 /// \param __Y
    120 /// 	A 128-bit vector of [2 x i64]
    121 /// \param __Z
    122 /// 	A 128-bit vector of [2 x i64]
    123 ///
    124 /// \code{.operation}
    125 /// FOR j := 0 to 1
    126 /// 	i := j*64
    127 /// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
    128 /// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
    129 /// ENDFOR
    130 /// dst[MAX:128] := 0
    131 /// \endcode
    132 static __inline__ __m128i __DEFAULT_FN_ATTRS128
    133 _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
    134   return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
    135                                                 (__v2di)__Z);
    136 }
    137 
    138 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
    139 /// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
    140 /// unsigned integer from the intermediate result with the corresponding
    141 /// unsigned 64-bit integer in \a __X, and store the results in \a dst.
    142 ///
    143 /// \headerfile <immintrin.h>
    144 ///
    145 /// \code
    146 /// __m256i
    147 /// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
    148 /// \endcode
    149 ///
    150 /// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
    151 ///
    152 /// \return
    153 /// 	return __m256i dst.
    154 /// \param __X
    155 /// 	A 256-bit vector of [4 x i64]
    156 /// \param __Y
    157 /// 	A 256-bit vector of [4 x i64]
    158 /// \param __Z
    159 /// 	A 256-bit vector of [4 x i64]
    160 ///
    161 /// \code{.operation}
    162 /// FOR j := 0 to 3
    163 /// 	i := j*64
    164 /// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
    165 /// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
    166 /// ENDFOR
    167 /// dst[MAX:256] := 0
    168 /// \endcode
    169 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    170 _mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
    171   return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
    172                                                 (__v4di)__Z);
    173 }
    174 #undef __DEFAULT_FN_ATTRS128
    175 #undef __DEFAULT_FN_ATTRS256
    176 
    177 #endif // __AVXIFMAINTRIN_H