zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

tmmintrin.h (31927B) - Raw


      1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __TMMINTRIN_H
     11 #define __TMMINTRIN_H
     12 
     13 #if !defined(__i386__) && !defined(__x86_64__)
     14 #error "This header is only meant to be used on x86 and x64 architecture"
     15 #endif
     16 
     17 #include <pmmintrin.h>
     18 
     19 /* Define the default attributes for the functions in this file. */
     20 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
     21 #define __DEFAULT_FN_ATTRS                                                     \
     22   __attribute__((__always_inline__, __nodebug__,                               \
     23                  __target__("ssse3,no-evex512"), __min_vector_width__(128)))
     24 #else
     25 #define __DEFAULT_FN_ATTRS                                                     \
     26   __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),          \
     27                  __min_vector_width__(128)))
     28 #endif
     29 
     30 #define __trunc64(x)                                                           \
     31   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
     32 #define __anyext128(x)                                                         \
     33   (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
     34                                     1, -1, -1)
     35 
     36 /// Computes the absolute value of each of the packed 8-bit signed
     37 ///    integers in the source operand and stores the 8-bit unsigned integer
     38 ///    results in the destination.
     39 ///
     40 /// \headerfile <x86intrin.h>
     41 ///
     42 /// This intrinsic corresponds to the \c PABSB instruction.
     43 ///
     44 /// \param __a
     45 ///    A 64-bit vector of [8 x i8].
     46 /// \returns A 64-bit integer vector containing the absolute values of the
     47 ///    elements in the operand.
     48 static __inline__ __m64 __DEFAULT_FN_ATTRS
     49 _mm_abs_pi8(__m64 __a)
     50 {
     51   return (__m64)__builtin_elementwise_abs((__v8qs)__a);
     52 }
     53 
     54 /// Computes the absolute value of each of the packed 8-bit signed
     55 ///    integers in the source operand and stores the 8-bit unsigned integer
     56 ///    results in the destination.
     57 ///
     58 /// \headerfile <x86intrin.h>
     59 ///
     60 /// This intrinsic corresponds to the \c VPABSB instruction.
     61 ///
     62 /// \param __a
     63 ///    A 128-bit vector of [16 x i8].
     64 /// \returns A 128-bit integer vector containing the absolute values of the
     65 ///    elements in the operand.
     66 static __inline__ __m128i __DEFAULT_FN_ATTRS
     67 _mm_abs_epi8(__m128i __a)
     68 {
     69     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
     70 }
     71 
     72 /// Computes the absolute value of each of the packed 16-bit signed
     73 ///    integers in the source operand and stores the 16-bit unsigned integer
     74 ///    results in the destination.
     75 ///
     76 /// \headerfile <x86intrin.h>
     77 ///
     78 /// This intrinsic corresponds to the \c PABSW instruction.
     79 ///
     80 /// \param __a
     81 ///    A 64-bit vector of [4 x i16].
     82 /// \returns A 64-bit integer vector containing the absolute values of the
     83 ///    elements in the operand.
     84 static __inline__ __m64 __DEFAULT_FN_ATTRS
     85 _mm_abs_pi16(__m64 __a)
     86 {
     87     return (__m64)__builtin_elementwise_abs((__v4hi)__a);
     88 }
     89 
     90 /// Computes the absolute value of each of the packed 16-bit signed
     91 ///    integers in the source operand and stores the 16-bit unsigned integer
     92 ///    results in the destination.
     93 ///
     94 /// \headerfile <x86intrin.h>
     95 ///
     96 /// This intrinsic corresponds to the \c VPABSW instruction.
     97 ///
     98 /// \param __a
     99 ///    A 128-bit vector of [8 x i16].
    100 /// \returns A 128-bit integer vector containing the absolute values of the
    101 ///    elements in the operand.
    102 static __inline__ __m128i __DEFAULT_FN_ATTRS
    103 _mm_abs_epi16(__m128i __a)
    104 {
    105     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
    106 }
    107 
    108 /// Computes the absolute value of each of the packed 32-bit signed
    109 ///    integers in the source operand and stores the 32-bit unsigned integer
    110 ///    results in the destination.
    111 ///
    112 /// \headerfile <x86intrin.h>
    113 ///
    114 /// This intrinsic corresponds to the \c PABSD instruction.
    115 ///
    116 /// \param __a
    117 ///    A 64-bit vector of [2 x i32].
    118 /// \returns A 64-bit integer vector containing the absolute values of the
    119 ///    elements in the operand.
    120 static __inline__ __m64 __DEFAULT_FN_ATTRS
    121 _mm_abs_pi32(__m64 __a)
    122 {
    123     return (__m64)__builtin_elementwise_abs((__v2si)__a);
    124 }
    125 
    126 /// Computes the absolute value of each of the packed 32-bit signed
    127 ///    integers in the source operand and stores the 32-bit unsigned integer
    128 ///    results in the destination.
    129 ///
    130 /// \headerfile <x86intrin.h>
    131 ///
    132 /// This intrinsic corresponds to the \c VPABSD instruction.
    133 ///
    134 /// \param __a
    135 ///    A 128-bit vector of [4 x i32].
    136 /// \returns A 128-bit integer vector containing the absolute values of the
    137 ///    elements in the operand.
    138 static __inline__ __m128i __DEFAULT_FN_ATTRS
    139 _mm_abs_epi32(__m128i __a)
    140 {
    141     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
    142 }
    143 
    144 /// Concatenates the two 128-bit integer vector operands, and
    145 ///    right-shifts the result by the number of bytes specified in the immediate
    146 ///    operand.
    147 ///
    148 /// \headerfile <x86intrin.h>
    149 ///
    150 /// \code
    151 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
    152 /// \endcode
    153 ///
    154 /// This intrinsic corresponds to the \c PALIGNR instruction.
    155 ///
    156 /// \param a
    157 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
    158 /// \param b
    159 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
    160 /// \param n
    161 ///    An immediate operand specifying how many bytes to right-shift the result.
    162 /// \returns A 128-bit integer vector containing the concatenated right-shifted
    163 ///    value.
    164 #define _mm_alignr_epi8(a, b, n) \
    165   ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
    166                                       (__v16qi)(__m128i)(b), (n)))
    167 
    168 /// Concatenates the two 64-bit integer vector operands, and right-shifts
    169 ///    the result by the number of bytes specified in the immediate operand.
    170 ///
    171 /// \headerfile <x86intrin.h>
    172 ///
    173 /// \code
    174 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
    175 /// \endcode
    176 ///
    177 /// This intrinsic corresponds to the \c PALIGNR instruction.
    178 ///
    179 /// \param a
    180 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
    181 /// \param b
    182 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
    183 /// \param n
    184 ///    An immediate operand specifying how many bytes to right-shift the result.
    185 /// \returns A 64-bit integer vector containing the concatenated right-shifted
    186 ///    value.
    187 #define _mm_alignr_pi8(a, b, n) \
    188   ((__m64)__builtin_shufflevector(                                       \
    189        __builtin_ia32_psrldqi128_byteshift(                              \
    190            __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0),      \
    191            (n)), __extension__ (__v2di){}, 0))
    192 
    193 /// Horizontally adds the adjacent pairs of values contained in 2 packed
    194 ///    128-bit vectors of [8 x i16].
    195 ///
    196 /// \headerfile <x86intrin.h>
    197 ///
    198 /// This intrinsic corresponds to the \c VPHADDW instruction.
    199 ///
    200 /// \param __a
    201 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    202 ///    horizontal sums of the values are stored in the lower bits of the
    203 ///    destination.
    204 /// \param __b
    205 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    206 ///    horizontal sums of the values are stored in the upper bits of the
    207 ///    destination.
    208 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
    209 ///    both operands.
    210 static __inline__ __m128i __DEFAULT_FN_ATTRS
    211 _mm_hadd_epi16(__m128i __a, __m128i __b)
    212 {
    213     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
    214 }
    215 
    216 /// Horizontally adds the adjacent pairs of values contained in 2 packed
    217 ///    128-bit vectors of [4 x i32].
    218 ///
    219 /// \headerfile <x86intrin.h>
    220 ///
    221 /// This intrinsic corresponds to the \c VPHADDD instruction.
    222 ///
    223 /// \param __a
    224 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    225 ///    horizontal sums of the values are stored in the lower bits of the
    226 ///    destination.
    227 /// \param __b
    228 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    229 ///    horizontal sums of the values are stored in the upper bits of the
    230 ///    destination.
    231 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
    232 ///    both operands.
    233 static __inline__ __m128i __DEFAULT_FN_ATTRS
    234 _mm_hadd_epi32(__m128i __a, __m128i __b)
    235 {
    236     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
    237 }
    238 
    239 /// Horizontally adds the adjacent pairs of values contained in 2 packed
    240 ///    64-bit vectors of [4 x i16].
    241 ///
    242 /// \headerfile <x86intrin.h>
    243 ///
    244 /// This intrinsic corresponds to the \c PHADDW instruction.
    245 ///
    246 /// \param __a
    247 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    248 ///    horizontal sums of the values are stored in the lower bits of the
    249 ///    destination.
    250 /// \param __b
    251 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    252 ///    horizontal sums of the values are stored in the upper bits of the
    253 ///    destination.
    254 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
    255 ///    operands.
    256 static __inline__ __m64 __DEFAULT_FN_ATTRS
    257 _mm_hadd_pi16(__m64 __a, __m64 __b)
    258 {
    259     return __trunc64(__builtin_ia32_phaddw128(
    260         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
    261 }
    262 
    263 /// Horizontally adds the adjacent pairs of values contained in 2 packed
    264 ///    64-bit vectors of [2 x i32].
    265 ///
    266 /// \headerfile <x86intrin.h>
    267 ///
    268 /// This intrinsic corresponds to the \c PHADDD instruction.
    269 ///
    270 /// \param __a
    271 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    272 ///    horizontal sums of the values are stored in the lower bits of the
    273 ///    destination.
    274 /// \param __b
    275 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    276 ///    horizontal sums of the values are stored in the upper bits of the
    277 ///    destination.
    278 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
    279 ///    operands.
    280 static __inline__ __m64 __DEFAULT_FN_ATTRS
    281 _mm_hadd_pi32(__m64 __a, __m64 __b)
    282 {
    283     return __trunc64(__builtin_ia32_phaddd128(
    284         (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
    285 }
    286 
    287 /// Horizontally adds, with saturation, the adjacent pairs of values contained
    288 ///    in two packed 128-bit vectors of [8 x i16].
    289 ///
    290 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
    291 ///    less than 0x8000 are saturated to 0x8000.
    292 ///
    293 /// \headerfile <x86intrin.h>
    294 ///
    295 /// This intrinsic corresponds to the \c VPHADDSW instruction.
    296 ///
    297 /// \param __a
    298 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    299 ///    horizontal sums of the values are stored in the lower bits of the
    300 ///    destination.
    301 /// \param __b
    302 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    303 ///    horizontal sums of the values are stored in the upper bits of the
    304 ///    destination.
    305 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
    306 ///    sums of both operands.
    307 static __inline__ __m128i __DEFAULT_FN_ATTRS
    308 _mm_hadds_epi16(__m128i __a, __m128i __b)
    309 {
    310     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
    311 }
    312 
    313 /// Horizontally adds, with saturation, the adjacent pairs of values contained
    314 ///    in two packed 64-bit vectors of [4 x i16].
    315 ///
    316 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
    317 ///    less than 0x8000 are saturated to 0x8000.
    318 ///
    319 /// \headerfile <x86intrin.h>
    320 ///
    321 /// This intrinsic corresponds to the \c PHADDSW instruction.
    322 ///
    323 /// \param __a
    324 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    325 ///    horizontal sums of the values are stored in the lower bits of the
    326 ///    destination.
    327 /// \param __b
    328 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    329 ///    horizontal sums of the values are stored in the upper bits of the
    330 ///    destination.
    331 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
    332 ///    sums of both operands.
    333 static __inline__ __m64 __DEFAULT_FN_ATTRS
    334 _mm_hadds_pi16(__m64 __a, __m64 __b)
    335 {
    336     return __trunc64(__builtin_ia32_phaddsw128(
    337         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
    338 }
    339 
    340 /// Horizontally subtracts the adjacent pairs of values contained in 2
    341 ///    packed 128-bit vectors of [8 x i16].
    342 ///
    343 /// \headerfile <x86intrin.h>
    344 ///
    345 /// This intrinsic corresponds to the \c VPHSUBW instruction.
    346 ///
    347 /// \param __a
    348 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    349 ///    horizontal differences between the values are stored in the lower bits of
    350 ///    the destination.
    351 /// \param __b
    352 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    353 ///    horizontal differences between the values are stored in the upper bits of
    354 ///    the destination.
    355 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
    356 ///    of both operands.
    357 static __inline__ __m128i __DEFAULT_FN_ATTRS
    358 _mm_hsub_epi16(__m128i __a, __m128i __b)
    359 {
    360     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
    361 }
    362 
    363 /// Horizontally subtracts the adjacent pairs of values contained in 2
    364 ///    packed 128-bit vectors of [4 x i32].
    365 ///
    366 /// \headerfile <x86intrin.h>
    367 ///
    368 /// This intrinsic corresponds to the \c VPHSUBD instruction.
    369 ///
    370 /// \param __a
    371 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    372 ///    horizontal differences between the values are stored in the lower bits of
    373 ///    the destination.
    374 /// \param __b
    375 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
    376 ///    horizontal differences between the values are stored in the upper bits of
    377 ///    the destination.
    378 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
    379 ///    of both operands.
    380 static __inline__ __m128i __DEFAULT_FN_ATTRS
    381 _mm_hsub_epi32(__m128i __a, __m128i __b)
    382 {
    383     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
    384 }
    385 
    386 /// Horizontally subtracts the adjacent pairs of values contained in 2
    387 ///    packed 64-bit vectors of [4 x i16].
    388 ///
    389 /// \headerfile <x86intrin.h>
    390 ///
    391 /// This intrinsic corresponds to the \c PHSUBW instruction.
    392 ///
    393 /// \param __a
    394 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    395 ///    horizontal differences between the values are stored in the lower bits of
    396 ///    the destination.
    397 /// \param __b
    398 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    399 ///    horizontal differences between the values are stored in the upper bits of
    400 ///    the destination.
    401 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
    402 ///    of both operands.
    403 static __inline__ __m64 __DEFAULT_FN_ATTRS
    404 _mm_hsub_pi16(__m64 __a, __m64 __b)
    405 {
    406     return __trunc64(__builtin_ia32_phsubw128(
    407         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
    408 }
    409 
    410 /// Horizontally subtracts the adjacent pairs of values contained in 2
    411 ///    packed 64-bit vectors of [2 x i32].
    412 ///
    413 /// \headerfile <x86intrin.h>
    414 ///
    415 /// This intrinsic corresponds to the \c PHSUBD instruction.
    416 ///
    417 /// \param __a
    418 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    419 ///    horizontal differences between the values are stored in the lower bits of
    420 ///    the destination.
    421 /// \param __b
    422 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
    423 ///    horizontal differences between the values are stored in the upper bits of
    424 ///    the destination.
    425 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
    426 ///    of both operands.
    427 static __inline__ __m64 __DEFAULT_FN_ATTRS
    428 _mm_hsub_pi32(__m64 __a, __m64 __b)
    429 {
    430     return __trunc64(__builtin_ia32_phsubd128(
    431         (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
    432 }
    433 
    434 /// Horizontally subtracts, with saturation, the adjacent pairs of values
    435 ///    contained in two packed 128-bit vectors of [8 x i16].
    436 ///
    437 ///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
    438 ///    Negative differences less than 0x8000 are saturated to 0x8000.
    439 ///
    440 /// \headerfile <x86intrin.h>
    441 ///
    442 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
    443 ///
    444 /// \param __a
    445 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    446 ///    horizontal differences between the values are stored in the lower bits of
    447 ///    the destination.
    448 /// \param __b
    449 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
    450 ///    horizontal differences between the values are stored in the upper bits of
    451 ///    the destination.
    452 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
    453 ///    differences of both operands.
    454 static __inline__ __m128i __DEFAULT_FN_ATTRS
    455 _mm_hsubs_epi16(__m128i __a, __m128i __b)
    456 {
    457     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
    458 }
    459 
    460 /// Horizontally subtracts, with saturation, the adjacent pairs of values
    461 ///    contained in two packed 64-bit vectors of [4 x i16].
    462 ///
    463 ///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
    464 ///    Negative differences less than 0x8000 are saturated to 0x8000.
    465 ///
    466 /// \headerfile <x86intrin.h>
    467 ///
    468 /// This intrinsic corresponds to the \c PHSUBSW instruction.
    469 ///
    470 /// \param __a
    471 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    472 ///    horizontal differences between the values are stored in the lower bits of
    473 ///    the destination.
    474 /// \param __b
    475 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
    476 ///    horizontal differences between the values are stored in the upper bits of
    477 ///    the destination.
    478 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
    479 ///    differences of both operands.
    480 static __inline__ __m64 __DEFAULT_FN_ATTRS
    481 _mm_hsubs_pi16(__m64 __a, __m64 __b)
    482 {
    483     return __trunc64(__builtin_ia32_phsubsw128(
    484         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
    485 }
    486 
    487 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
    488 ///    values contained in the first source operand and packed 8-bit signed
    489 ///    integer values contained in the second source operand, adds pairs of
    490 ///    contiguous products with signed saturation, and writes the 16-bit sums to
    491 ///    the corresponding bits in the destination.
    492 ///
    493 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
    494 ///    both operands are multiplied, and the sum of both results is written to
    495 ///    bits [15:0] of the destination.
    496 ///
    497 /// \headerfile <x86intrin.h>
    498 ///
    499 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
    500 ///
    501 /// \param __a
    502 ///    A 128-bit integer vector containing the first source operand.
    503 /// \param __b
    504 ///    A 128-bit integer vector containing the second source operand.
    505 /// \returns A 128-bit integer vector containing the sums of products of both
    506 ///    operands: \n
    507 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
    508 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
    509 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
    510 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
    511 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
    512 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
    513 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
    514 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
    515 static __inline__ __m128i __DEFAULT_FN_ATTRS
    516 _mm_maddubs_epi16(__m128i __a, __m128i __b)
    517 {
    518     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
    519 }
    520 
    521 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
    522 ///    values contained in the first source operand and packed 8-bit signed
    523 ///    integer values contained in the second source operand, adds pairs of
    524 ///    contiguous products with signed saturation, and writes the 16-bit sums to
    525 ///    the corresponding bits in the destination.
    526 ///
    527 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
    528 ///    both operands are multiplied, and the sum of both results is written to
    529 ///    bits [15:0] of the destination.
    530 ///
    531 /// \headerfile <x86intrin.h>
    532 ///
    533 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
    534 ///
    535 /// \param __a
    536 ///    A 64-bit integer vector containing the first source operand.
    537 /// \param __b
    538 ///    A 64-bit integer vector containing the second source operand.
    539 /// \returns A 64-bit integer vector containing the sums of products of both
    540 ///    operands: \n
    541 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
    542 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
    543 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
    544 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
    545 static __inline__ __m64 __DEFAULT_FN_ATTRS
    546 _mm_maddubs_pi16(__m64 __a, __m64 __b)
    547 {
    548     return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
    549                                                  (__v16qi)__anyext128(__b)));
    550 }
    551 
    552 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
    553 ///    products to the 18 most significant bits by right-shifting, rounds the
    554 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
    555 ///
    556 /// \headerfile <x86intrin.h>
    557 ///
    558 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
    559 ///
    560 /// \param __a
    561 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
    562 /// \param __b
    563 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
    564 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
    565 ///    products of both operands.
    566 static __inline__ __m128i __DEFAULT_FN_ATTRS
    567 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
    568 {
    569     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
    570 }
    571 
    572 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
    573 ///    products to the 18 most significant bits by right-shifting, rounds the
    574 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
    575 ///
    576 /// \headerfile <x86intrin.h>
    577 ///
    578 /// This intrinsic corresponds to the \c PMULHRSW instruction.
    579 ///
    580 /// \param __a
    581 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
    582 /// \param __b
    583 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
    584 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
    585 ///    products of both operands.
    586 static __inline__ __m64 __DEFAULT_FN_ATTRS
    587 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
    588 {
    589     return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
    590                                                 (__v8hi)__anyext128(__b)));
    591 }
    592 
    593 /// Copies the 8-bit integers from a 128-bit integer vector to the
    594 ///    destination or clears 8-bit values in the destination, as specified by
    595 ///    the second source operand.
    596 ///
    597 /// \headerfile <x86intrin.h>
    598 ///
    599 /// This intrinsic corresponds to the \c VPSHUFB instruction.
    600 ///
    601 /// \param __a
    602 ///    A 128-bit integer vector containing the values to be copied.
    603 /// \param __b
    604 ///    A 128-bit integer vector containing control bytes corresponding to
    605 ///    positions in the destination:
    606 ///    Bit 7: \n
    607 ///    1: Clear the corresponding byte in the destination. \n
    608 ///    0: Copy the selected source byte to the corresponding byte in the
    609 ///    destination. \n
    610 ///    Bits [6:4] Reserved.  \n
    611 ///    Bits [3:0] select the source byte to be copied.
    612 /// \returns A 128-bit integer vector containing the copied or cleared values.
    613 static __inline__ __m128i __DEFAULT_FN_ATTRS
    614 _mm_shuffle_epi8(__m128i __a, __m128i __b)
    615 {
    616     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
    617 }
    618 
    619 /// Copies the 8-bit integers from a 64-bit integer vector to the
    620 ///    destination or clears 8-bit values in the destination, as specified by
    621 ///    the second source operand.
    622 ///
    623 /// \headerfile <x86intrin.h>
    624 ///
    625 /// This intrinsic corresponds to the \c PSHUFB instruction.
    626 ///
    627 /// \param __a
    628 ///    A 64-bit integer vector containing the values to be copied.
    629 /// \param __b
    630 ///    A 64-bit integer vector containing control bytes corresponding to
    631 ///    positions in the destination:
    632 ///    Bit 7: \n
    633 ///    1: Clear the corresponding byte in the destination. \n
    634 ///    0: Copy the selected source byte to the corresponding byte in the
    635 ///    destination. \n
    636 ///    Bits [2:0] select the source byte to be copied.
    637 /// \returns A 64-bit integer vector containing the copied or cleared values.
    638 static __inline__ __m64 __DEFAULT_FN_ATTRS
    639 _mm_shuffle_pi8(__m64 __a, __m64 __b)
    640 {
    641     return __trunc64(__builtin_ia32_pshufb128(
    642         (__v16qi)__builtin_shufflevector(
    643             (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
    644         (__v16qi)__anyext128(__b)));
    645 }
    646 
    647 /// For each 8-bit integer in the first source operand, perform one of
    648 ///    the following actions as specified by the second source operand.
    649 ///
    650 ///    If the byte in the second source is negative, calculate the two's
    651 ///    complement of the corresponding byte in the first source, and write that
    652 ///    value to the destination. If the byte in the second source is positive,
    653 ///    copy the corresponding byte from the first source to the destination. If
    654 ///    the byte in the second source is zero, clear the corresponding byte in
    655 ///    the destination.
    656 ///
    657 /// \headerfile <x86intrin.h>
    658 ///
    659 /// This intrinsic corresponds to the \c VPSIGNB instruction.
    660 ///
    661 /// \param __a
    662 ///    A 128-bit integer vector containing the values to be copied.
    663 /// \param __b
    664 ///    A 128-bit integer vector containing control bytes corresponding to
    665 ///    positions in the destination.
    666 /// \returns A 128-bit integer vector containing the resultant values.
    667 static __inline__ __m128i __DEFAULT_FN_ATTRS
    668 _mm_sign_epi8(__m128i __a, __m128i __b)
    669 {
    670     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
    671 }
    672 
    673 /// For each 16-bit integer in the first source operand, perform one of
    674 ///    the following actions as specified by the second source operand.
    675 ///
    676 ///    If the word in the second source is negative, calculate the two's
    677 ///    complement of the corresponding word in the first source, and write that
    678 ///    value to the destination. If the word in the second source is positive,
    679 ///    copy the corresponding word from the first source to the destination. If
    680 ///    the word in the second source is zero, clear the corresponding word in
    681 ///    the destination.
    682 ///
    683 /// \headerfile <x86intrin.h>
    684 ///
    685 /// This intrinsic corresponds to the \c VPSIGNW instruction.
    686 ///
    687 /// \param __a
    688 ///    A 128-bit integer vector containing the values to be copied.
    689 /// \param __b
    690 ///    A 128-bit integer vector containing control words corresponding to
    691 ///    positions in the destination.
    692 /// \returns A 128-bit integer vector containing the resultant values.
    693 static __inline__ __m128i __DEFAULT_FN_ATTRS
    694 _mm_sign_epi16(__m128i __a, __m128i __b)
    695 {
    696     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
    697 }
    698 
    699 /// For each 32-bit integer in the first source operand, perform one of
    700 ///    the following actions as specified by the second source operand.
    701 ///
    702 ///    If the doubleword in the second source is negative, calculate the two's
    703 ///    complement of the corresponding word in the first source, and write that
    704 ///    value to the destination. If the doubleword in the second source is
    705 ///    positive, copy the corresponding word from the first source to the
    706 ///    destination. If the doubleword in the second source is zero, clear the
    707 ///    corresponding word in the destination.
    708 ///
    709 /// \headerfile <x86intrin.h>
    710 ///
    711 /// This intrinsic corresponds to the \c VPSIGND instruction.
    712 ///
    713 /// \param __a
    714 ///    A 128-bit integer vector containing the values to be copied.
    715 /// \param __b
    716 ///    A 128-bit integer vector containing control doublewords corresponding to
    717 ///    positions in the destination.
    718 /// \returns A 128-bit integer vector containing the resultant values.
    719 static __inline__ __m128i __DEFAULT_FN_ATTRS
    720 _mm_sign_epi32(__m128i __a, __m128i __b)
    721 {
    722     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
    723 }
    724 
    725 /// For each 8-bit integer in the first source operand, perform one of
    726 ///    the following actions as specified by the second source operand.
    727 ///
    728 ///    If the byte in the second source is negative, calculate the two's
    729 ///    complement of the corresponding byte in the first source, and write that
    730 ///    value to the destination. If the byte in the second source is positive,
    731 ///    copy the corresponding byte from the first source to the destination. If
    732 ///    the byte in the second source is zero, clear the corresponding byte in
    733 ///    the destination.
    734 ///
    735 /// \headerfile <x86intrin.h>
    736 ///
    737 /// This intrinsic corresponds to the \c PSIGNB instruction.
    738 ///
    739 /// \param __a
    740 ///    A 64-bit integer vector containing the values to be copied.
    741 /// \param __b
    742 ///    A 64-bit integer vector containing control bytes corresponding to
    743 ///    positions in the destination.
    744 /// \returns A 64-bit integer vector containing the resultant values.
    745 static __inline__ __m64 __DEFAULT_FN_ATTRS
    746 _mm_sign_pi8(__m64 __a, __m64 __b)
    747 {
    748     return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
    749                                               (__v16qi)__anyext128(__b)));
    750 }
    751 
    752 /// For each 16-bit integer in the first source operand, perform one of
    753 ///    the following actions as specified by the second source operand.
    754 ///
    755 ///    If the word in the second source is negative, calculate the two's
    756 ///    complement of the corresponding word in the first source, and write that
    757 ///    value to the destination. If the word in the second source is positive,
    758 ///    copy the corresponding word from the first source to the destination. If
    759 ///    the word in the second source is zero, clear the corresponding word in
    760 ///    the destination.
    761 ///
    762 /// \headerfile <x86intrin.h>
    763 ///
    764 /// This intrinsic corresponds to the \c PSIGNW instruction.
    765 ///
    766 /// \param __a
    767 ///    A 64-bit integer vector containing the values to be copied.
    768 /// \param __b
    769 ///    A 64-bit integer vector containing control words corresponding to
    770 ///    positions in the destination.
    771 /// \returns A 64-bit integer vector containing the resultant values.
    772 static __inline__ __m64 __DEFAULT_FN_ATTRS
    773 _mm_sign_pi16(__m64 __a, __m64 __b)
    774 {
    775     return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
    776                                               (__v8hi)__anyext128(__b)));
    777 }
    778 
    779 /// For each 32-bit integer in the first source operand, perform one of
    780 ///    the following actions as specified by the second source operand.
    781 ///
    782 ///    If the doubleword in the second source is negative, calculate the two's
    783 ///    complement of the corresponding doubleword in the first source, and
    784 ///    write that value to the destination. If the doubleword in the second
    785 ///    source is positive, copy the corresponding doubleword from the first
    786 ///    source to the destination. If the doubleword in the second source is
    787 ///    zero, clear the corresponding doubleword in the destination.
    788 ///
    789 /// \headerfile <x86intrin.h>
    790 ///
    791 /// This intrinsic corresponds to the \c PSIGND instruction.
    792 ///
    793 /// \param __a
    794 ///    A 64-bit integer vector containing the values to be copied.
    795 /// \param __b
    796 ///    A 64-bit integer vector containing two control doublewords corresponding
    797 ///    to positions in the destination.
    798 /// \returns A 64-bit integer vector containing the resultant values.
    799 static __inline__ __m64 __DEFAULT_FN_ATTRS
    800 _mm_sign_pi32(__m64 __a, __m64 __b)
    801 {
    802     return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
    803                                               (__v4si)__anyext128(__b)));
    804 }
    805 
    806 #undef __anyext128
    807 #undef __trunc64
    808 #undef __DEFAULT_FN_ATTRS
    809 
    810 #endif /* __TMMINTRIN_H */