zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

mmintrin.h (60004B) - Raw


      1 /*===---- mmintrin.h - MMX intrinsics --------------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __MMINTRIN_H
     11 #define __MMINTRIN_H
     12 
     13 #if !defined(__i386__) && !defined(__x86_64__)
     14 #error "This header is only meant to be used on x86 and x64 architecture"
     15 #endif
     16 
     17 typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
     18 
     19 typedef long long __v1di __attribute__((__vector_size__(8)));
     20 typedef int __v2si __attribute__((__vector_size__(8)));
     21 typedef short __v4hi __attribute__((__vector_size__(8)));
     22 typedef char __v8qi __attribute__((__vector_size__(8)));
     23 
     24 /* Unsigned types */
     25 typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
     26 typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
     27 typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
     28 typedef unsigned char __v8qu __attribute__((__vector_size__(8)));
     29 
     30 /* We need an explicitly signed variant for char. Note that this shouldn't
     31  * appear in the interface though. */
     32 typedef signed char __v8qs __attribute__((__vector_size__(8)));
     33 
     34 /* SSE/SSE2 types */
     35 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
     36 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
     37 typedef int __v4si __attribute__((__vector_size__(16)));
     38 typedef short __v8hi __attribute__((__vector_size__(16)));
     39 typedef char __v16qi __attribute__((__vector_size__(16)));
     40 
     41 /* Define the default attributes for the functions in this file. */
     42 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
     43 #define __DEFAULT_FN_ATTRS_SSE2                                                \
     44   __attribute__((__always_inline__, __nodebug__,                               \
     45                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
     46 #else
     47 #define __DEFAULT_FN_ATTRS_SSE2                                                \
     48   __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
     49                  __min_vector_width__(128)))
     50 #endif
     51 
     52 #if defined(__cplusplus) && (__cplusplus >= 201103L)
     53 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
     54 #else
     55 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
     56 #endif
     57 
     58 #define __trunc64(x)                                                           \
     59   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
     60 #define __anyext128(x)                                                         \
     61   (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
     62                                     1, -1, -1)
     63 
     64 /// Clears the MMX state by setting the state of the x87 stack registers
     65 ///    to empty.
     66 ///
     67 /// \headerfile <x86intrin.h>
     68 ///
     69 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
     70 ///
     71 static __inline__ void __attribute__((__always_inline__, __nodebug__,
     72                                       __target__("mmx,no-evex512")))
     73 _mm_empty(void) {
     74   __builtin_ia32_emms();
     75 }
     76 
     77 /// Constructs a 64-bit integer vector, setting the lower 32 bits to the
     78 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
     79 ///
     80 /// \headerfile <x86intrin.h>
     81 ///
     82 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
     83 ///
     84 /// \param __i
     85 ///    A 32-bit integer value.
     86 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
     87 ///    parameter. The upper 32 bits are set to 0.
     88 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
     89 _mm_cvtsi32_si64(int __i)
     90 {
     91     return __extension__ (__m64)(__v2si){__i, 0};
     92 }
     93 
     94 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
     95 ///    signed integer.
     96 ///
     97 /// \headerfile <x86intrin.h>
     98 ///
     99 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
    100 ///
    101 /// \param __m
    102 ///    A 64-bit integer vector.
    103 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
    104 ///    parameter.
    105 static __inline__ int __DEFAULT_FN_ATTRS_SSE2
    106 _mm_cvtsi64_si32(__m64 __m)
    107 {
    108     return ((__v2si)__m)[0];
    109 }
    110 
    111 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
    112 ///
    113 /// \headerfile <x86intrin.h>
    114 ///
    115 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
    116 ///
    117 /// \param __i
    118 ///    A 64-bit signed integer.
    119 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
    120 ///    parameter.
    121 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    122 _mm_cvtsi64_m64(long long __i)
    123 {
    124     return (__m64)__i;
    125 }
    126 
    127 /// Casts a 64-bit integer vector into a 64-bit signed integer value.
    128 ///
    129 /// \headerfile <x86intrin.h>
    130 ///
    131 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
    132 ///
    133 /// \param __m
    134 ///    A 64-bit integer vector.
    135 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
    136 ///    parameter.
    137 static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
    138 _mm_cvtm64_si64(__m64 __m)
    139 {
    140     return (long long)__m;
    141 }
    142 
    143 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
    144 ///    vector parameters of [4 x i16] into 8-bit signed integer values, and
    145 ///    constructs a 64-bit integer vector of [8 x i8] as the result.
    146 ///
    147 ///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
    148 ///    less than 0x80 are saturated to 0x80.
    149 ///
    150 /// \headerfile <x86intrin.h>
    151 ///
    152 /// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
    153 ///
    154 /// \param __m1
    155 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
    156 ///    written to the lower 32 bits of the result.
    157 /// \param __m2
    158 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
    159 ///    written to the upper 32 bits of the result.
    160 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
    161 ///    values.
    162 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    163 _mm_packs_pi16(__m64 __m1, __m64 __m2)
    164 {
    165     return __trunc64(__builtin_ia32_packsswb128(
    166         (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
    167 }
    168 
    169 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
    170 ///    vector parameters of [2 x i32] into 16-bit signed integer values, and
    171 ///    constructs a 64-bit integer vector of [4 x i16] as the result.
    172 ///
    173 ///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
    174 ///    values less than 0x8000 are saturated to 0x8000.
    175 ///
    176 /// \headerfile <x86intrin.h>
    177 ///
    178 /// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
    179 ///
    180 /// \param __m1
    181 ///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
    182 ///    written to the lower 32 bits of the result.
    183 /// \param __m2
    184 ///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
    185 ///    written to the upper 32 bits of the result.
    186 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
    187 ///    values.
    188 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    189 _mm_packs_pi32(__m64 __m1, __m64 __m2)
    190 {
    191     return __trunc64(__builtin_ia32_packssdw128(
    192         (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
    193 }
    194 
    195 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
    196 ///    vector parameters of [4 x i16] into 8-bit unsigned integer values, and
    197 ///    constructs a 64-bit integer vector of [8 x i8] as the result.
    198 ///
    199 ///    Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
    200 ///    saturated to 0.
    201 ///
    202 /// \headerfile <x86intrin.h>
    203 ///
    204 /// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
    205 ///
    206 /// \param __m1
    207 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
    208 ///    written to the lower 32 bits of the result.
    209 /// \param __m2
    210 ///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
    211 ///    written to the upper 32 bits of the result.
    212 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
    213 ///    values.
    214 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    215 _mm_packs_pu16(__m64 __m1, __m64 __m2)
    216 {
    217     return __trunc64(__builtin_ia32_packuswb128(
    218         (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
    219 }
    220 
    221 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
    222 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
    223 ///
    224 /// \headerfile <x86intrin.h>
    225 ///
    226 /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
    227 ///
    228 /// \param __m1
    229 ///    A 64-bit integer vector of [8 x i8]. \n
    230 ///    Bits [39:32] are written to bits [7:0] of the result. \n
    231 ///    Bits [47:40] are written to bits [23:16] of the result. \n
    232 ///    Bits [55:48] are written to bits [39:32] of the result. \n
    233 ///    Bits [63:56] are written to bits [55:48] of the result.
    234 /// \param __m2
    235 ///    A 64-bit integer vector of [8 x i8].
    236 ///    Bits [39:32] are written to bits [15:8] of the result. \n
    237 ///    Bits [47:40] are written to bits [31:24] of the result. \n
    238 ///    Bits [55:48] are written to bits [47:40] of the result. \n
    239 ///    Bits [63:56] are written to bits [63:56] of the result.
    240 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
    241 ///    values.
    242 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    243 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
    244 {
    245     return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
    246                                           4, 12, 5, 13, 6, 14, 7, 15);
    247 }
    248 
    249 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
    250 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
    251 ///
    252 /// \headerfile <x86intrin.h>
    253 ///
    254 /// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
    255 ///
    256 /// \param __m1
    257 ///    A 64-bit integer vector of [4 x i16].
    258 ///    Bits [47:32] are written to bits [15:0] of the result. \n
    259 ///    Bits [63:48] are written to bits [47:32] of the result.
    260 /// \param __m2
    261 ///    A 64-bit integer vector of [4 x i16].
    262 ///    Bits [47:32] are written to bits [31:16] of the result. \n
    263 ///    Bits [63:48] are written to bits [63:48] of the result.
    264 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
    265 ///    values.
    266 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    267 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
    268 {
    269     return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
    270                                           2, 6, 3, 7);
    271 }
    272 
    273 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
    274 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
    275 ///
    276 /// \headerfile <x86intrin.h>
    277 ///
    278 /// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
    279 ///
    280 /// \param __m1
    281 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
    282 ///    the lower 32 bits of the result.
    283 /// \param __m2
    284 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
    285 ///    the upper 32 bits of the result.
    286 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
    287 ///    values.
    288 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    289 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
    290 {
    291     return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
    292 }
    293 
    294 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
    295 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
    296 ///
    297 /// \headerfile <x86intrin.h>
    298 ///
    299 /// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
    300 ///
    301 /// \param __m1
    302 ///    A 64-bit integer vector of [8 x i8].
    303 ///    Bits [7:0] are written to bits [7:0] of the result. \n
    304 ///    Bits [15:8] are written to bits [23:16] of the result. \n
    305 ///    Bits [23:16] are written to bits [39:32] of the result. \n
    306 ///    Bits [31:24] are written to bits [55:48] of the result.
    307 /// \param __m2
    308 ///    A 64-bit integer vector of [8 x i8].
    309 ///    Bits [7:0] are written to bits [15:8] of the result. \n
    310 ///    Bits [15:8] are written to bits [31:24] of the result. \n
    311 ///    Bits [23:16] are written to bits [47:40] of the result. \n
    312 ///    Bits [31:24] are written to bits [63:56] of the result.
    313 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
    314 ///    values.
    315 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    316 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
    317 {
    318     return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
    319                                           0, 8, 1, 9, 2, 10, 3, 11);
    320 }
    321 
    322 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
    323 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
    324 ///
    325 /// \headerfile <x86intrin.h>
    326 ///
    327 /// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
    328 ///
    329 /// \param __m1
    330 ///    A 64-bit integer vector of [4 x i16].
    331 ///    Bits [15:0] are written to bits [15:0] of the result. \n
    332 ///    Bits [31:16] are written to bits [47:32] of the result.
    333 /// \param __m2
    334 ///    A 64-bit integer vector of [4 x i16].
    335 ///    Bits [15:0] are written to bits [31:16] of the result. \n
    336 ///    Bits [31:16] are written to bits [63:48] of the result.
    337 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
    338 ///    values.
    339 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    340 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
    341 {
    342     return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
    343                                           0, 4, 1, 5);
    344 }
    345 
    346 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
    347 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
    348 ///
    349 /// \headerfile <x86intrin.h>
    350 ///
    351 /// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
    352 ///
    353 /// \param __m1
    354 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
    355 ///    the lower 32 bits of the result.
    356 /// \param __m2
    357 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
    358 ///    the upper 32 bits of the result.
    359 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
    360 ///    values.
    361 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    362 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
    363 {
    364     return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
    365 }
    366 
    367 /// Adds each 8-bit integer element of the first 64-bit integer vector
    368 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
    369 ///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
    370 ///    packed into a 64-bit integer vector of [8 x i8].
    371 ///
    372 /// \headerfile <x86intrin.h>
    373 ///
    374 /// This intrinsic corresponds to the <c> PADDB </c> instruction.
    375 ///
    376 /// \param __m1
    377 ///    A 64-bit integer vector of [8 x i8].
    378 /// \param __m2
    379 ///    A 64-bit integer vector of [8 x i8].
    380 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
    381 ///    parameters.
    382 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    383 _mm_add_pi8(__m64 __m1, __m64 __m2)
    384 {
    385     return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
    386 }
    387 
    388 /// Adds each 16-bit integer element of the first 64-bit integer vector
    389 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
    390 ///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
    391 ///    packed into a 64-bit integer vector of [4 x i16].
    392 ///
    393 /// \headerfile <x86intrin.h>
    394 ///
    395 /// This intrinsic corresponds to the <c> PADDW </c> instruction.
    396 ///
    397 /// \param __m1
    398 ///    A 64-bit integer vector of [4 x i16].
    399 /// \param __m2
    400 ///    A 64-bit integer vector of [4 x i16].
    401 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
    402 ///    parameters.
    403 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    404 _mm_add_pi16(__m64 __m1, __m64 __m2)
    405 {
    406     return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
    407 }
    408 
    409 /// Adds each 32-bit integer element of the first 64-bit integer vector
    410 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
    411 ///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
    412 ///    packed into a 64-bit integer vector of [2 x i32].
    413 ///
    414 /// \headerfile <x86intrin.h>
    415 ///
    416 /// This intrinsic corresponds to the <c> PADDD </c> instruction.
    417 ///
    418 /// \param __m1
    419 ///    A 64-bit integer vector of [2 x i32].
    420 /// \param __m2
    421 ///    A 64-bit integer vector of [2 x i32].
    422 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
    423 ///    parameters.
    424 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    425 _mm_add_pi32(__m64 __m1, __m64 __m2)
    426 {
    427     return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
    428 }
    429 
    430 /// Adds, with saturation, each 8-bit signed integer element of the first
    431 ///    64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
    432 ///    integer element of the second 64-bit integer vector of [8 x i8].
    433 ///
    434 ///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
    435 ///    less than 0x80 are saturated to 0x80. The results are packed into a
    436 ///    64-bit integer vector of [8 x i8].
    437 ///
    438 /// \headerfile <x86intrin.h>
    439 ///
    440 /// This intrinsic corresponds to the <c> PADDSB </c> instruction.
    441 ///
    442 /// \param __m1
    443 ///    A 64-bit integer vector of [8 x i8].
    444 /// \param __m2
    445 ///    A 64-bit integer vector of [8 x i8].
    446 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
    447 ///    of both parameters.
    448 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    449 _mm_adds_pi8(__m64 __m1, __m64 __m2)
    450 {
    451     return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
    452 }
    453 
    454 /// Adds, with saturation, each 16-bit signed integer element of the first
    455 ///    64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
    456 ///    integer element of the second 64-bit integer vector of [4 x i16].
    457 ///
    458 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
    459 ///    less than 0x8000 are saturated to 0x8000. The results are packed into a
    460 ///    64-bit integer vector of [4 x i16].
    461 ///
    462 /// \headerfile <x86intrin.h>
    463 ///
    464 /// This intrinsic corresponds to the <c> PADDSW </c> instruction.
    465 ///
    466 /// \param __m1
    467 ///    A 64-bit integer vector of [4 x i16].
    468 /// \param __m2
    469 ///    A 64-bit integer vector of [4 x i16].
    470 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
    471 ///    of both parameters.
    472 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    473 _mm_adds_pi16(__m64 __m1, __m64 __m2)
    474 {
    475     return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
    476 }
    477 
    478 /// Adds, with saturation, each 8-bit unsigned integer element of the first
    479 ///    64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
    480 ///    integer element of the second 64-bit integer vector of [8 x i8].
    481 ///
    482 ///    Sums greater than 0xFF are saturated to 0xFF. The results are packed
    483 ///    into a 64-bit integer vector of [8 x i8].
    484 ///
    485 /// \headerfile <x86intrin.h>
    486 ///
    487 /// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
    488 ///
    489 /// \param __m1
    490 ///    A 64-bit integer vector of [8 x i8].
    491 /// \param __m2
    492 ///    A 64-bit integer vector of [8 x i8].
    493 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    494 ///    unsigned sums of both parameters.
    495 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    496 _mm_adds_pu8(__m64 __m1, __m64 __m2)
    497 {
    498     return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
    499 }
    500 
    501 /// Adds, with saturation, each 16-bit unsigned integer element of the first
    502 ///    64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
    503 ///    integer element of the second 64-bit integer vector of [4 x i16].
    504 ///
    505 ///    Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
    506 ///    into a 64-bit integer vector of [4 x i16].
    507 ///
    508 /// \headerfile <x86intrin.h>
    509 ///
    510 /// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
    511 ///
    512 /// \param __m1
    513 ///    A 64-bit integer vector of [4 x i16].
    514 /// \param __m2
    515 ///    A 64-bit integer vector of [4 x i16].
    516 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    517 ///    unsigned sums of both parameters.
    518 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    519 _mm_adds_pu16(__m64 __m1, __m64 __m2)
    520 {
    521     return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
    522 }
    523 
    524 /// Subtracts each 8-bit integer element of the second 64-bit integer
    525 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
    526 ///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
    527 ///    are packed into a 64-bit integer vector of [8 x i8].
    528 ///
    529 /// \headerfile <x86intrin.h>
    530 ///
    531 /// This intrinsic corresponds to the <c> PSUBB </c> instruction.
    532 ///
    533 /// \param __m1
    534 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
    535 /// \param __m2
    536 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    537 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
    538 ///    both parameters.
    539 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    540 _mm_sub_pi8(__m64 __m1, __m64 __m2)
    541 {
    542     return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
    543 }
    544 
    545 /// Subtracts each 16-bit integer element of the second 64-bit integer
    546 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
    547 ///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
    548 ///    results are packed into a 64-bit integer vector of [4 x i16].
    549 ///
    550 /// \headerfile <x86intrin.h>
    551 ///
    552 /// This intrinsic corresponds to the <c> PSUBW </c> instruction.
    553 ///
    554 /// \param __m1
    555 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
    556 /// \param __m2
    557 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    558 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
    559 ///    both parameters.
    560 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    561 _mm_sub_pi16(__m64 __m1, __m64 __m2)
    562 {
    563     return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
    564 }
    565 
    566 /// Subtracts each 32-bit integer element of the second 64-bit integer
    567 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
    568 ///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
    569 ///    results are packed into a 64-bit integer vector of [2 x i32].
    570 ///
    571 /// \headerfile <x86intrin.h>
    572 ///
    573 /// This intrinsic corresponds to the <c> PSUBD </c> instruction.
    574 ///
    575 /// \param __m1
    576 ///    A 64-bit integer vector of [2 x i32] containing the minuends.
    577 /// \param __m2
    578 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
    579 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
    580 ///    both parameters.
    581 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    582 _mm_sub_pi32(__m64 __m1, __m64 __m2)
    583 {
    584     return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
    585 }
    586 
    587 /// Subtracts, with saturation, each 8-bit signed integer element of the second
    588 ///    64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
    589 ///    integer element of the first 64-bit integer vector of [8 x i8].
    590 ///
    591 ///    Positive results greater than 0x7F are saturated to 0x7F. Negative
    592 ///    results less than 0x80 are saturated to 0x80. The results are packed
    593 ///    into a 64-bit integer vector of [8 x i8].
    594 ///
    595 /// \headerfile <x86intrin.h>
    596 ///
    597 /// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
    598 ///
    599 /// \param __m1
    600 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
    601 /// \param __m2
    602 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    603 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    604 ///    differences of both parameters.
    605 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    606 _mm_subs_pi8(__m64 __m1, __m64 __m2)
    607 {
    608     return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
    609 }
    610 
    611 /// Subtracts, with saturation, each 16-bit signed integer element of the
    612 ///    second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
    613 ///    signed integer element of the first 64-bit integer vector of [4 x i16].
    614 ///
    615 ///    Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
    616 ///    results less than 0x8000 are saturated to 0x8000. The results are packed
    617 ///    into a 64-bit integer vector of [4 x i16].
    618 ///
    619 /// \headerfile <x86intrin.h>
    620 ///
    621 /// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
    622 ///
    623 /// \param __m1
    624 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
    625 /// \param __m2
    626 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    627 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    628 ///    differences of both parameters.
    629 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    630 _mm_subs_pi16(__m64 __m1, __m64 __m2)
    631 {
    632     return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
    633 }
    634 
    635 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
    636 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
    637 ///    element of the first 64-bit integer vector of [8 x i8].
    638 ///
    639 ///    If an element of the first vector is less than the corresponding element
    640 ///    of the second vector, the result is saturated to 0. The results are
    641 ///    packed into a 64-bit integer vector of [8 x i8].
    642 ///
    643 /// \headerfile <x86intrin.h>
    644 ///
    645 /// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
    646 ///
    647 /// \param __m1
    648 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
    649 /// \param __m2
    650 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    651 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    652 ///    differences of both parameters.
    653 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    654 _mm_subs_pu8(__m64 __m1, __m64 __m2)
    655 {
    656     return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
    657 }
    658 
    659 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
    660 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
    661 ///    integer element of the first 64-bit integer vector of [4 x i16].
    662 ///
    663 ///    If an element of the first vector is less than the corresponding element
    664 ///    of the second vector, the result is saturated to 0. The results are
    665 ///    packed into a 64-bit integer vector of [4 x i16].
    666 ///
    667 /// \headerfile <x86intrin.h>
    668 ///
    669 /// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
    670 ///
    671 /// \param __m1
    672 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
    673 /// \param __m2
    674 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    675 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    676 ///    differences of both parameters.
    677 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    678 _mm_subs_pu16(__m64 __m1, __m64 __m2)
    679 {
    680     return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
    681 }
    682 
    683 /// Multiplies each 16-bit signed integer element of the first 64-bit
    684 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
    685 ///    element of the second 64-bit integer vector of [4 x i16] and get four
    686 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
    687 ///    The lower 32 bits of these two sums are packed into a 64-bit integer
    688 ///    vector of [2 x i32].
    689 ///
    690 ///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
    691 ///    of both parameters are multiplied, and the sum of both results is written
    692 ///    to bits [31:0] of the result.
    693 ///
    694 /// \headerfile <x86intrin.h>
    695 ///
    696 /// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
    697 ///
    698 /// \param __m1
    699 ///    A 64-bit integer vector of [4 x i16].
    700 /// \param __m2
    701 ///    A 64-bit integer vector of [4 x i16].
    702 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
    703 ///    products of both parameters.
    704 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    705 _mm_madd_pi16(__m64 __m1, __m64 __m2)
    706 {
    707     return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
    708                                                (__v8hi)__anyext128(__m2)));
    709 }
    710 
    711 /// Multiplies each 16-bit signed integer element of the first 64-bit
    712 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
    713 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
    714 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
    715 ///
    716 /// \headerfile <x86intrin.h>
    717 ///
    718 /// This intrinsic corresponds to the <c> PMULHW </c> instruction.
    719 ///
    720 /// \param __m1
    721 ///    A 64-bit integer vector of [4 x i16].
    722 /// \param __m2
    723 ///    A 64-bit integer vector of [4 x i16].
    724 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
    725 ///    of the products of both parameters.
    726 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    727 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
    728 {
    729     return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
    730                                               (__v8hi)__anyext128(__m2)));
    731 }
    732 
    733 /// Multiplies each 16-bit signed integer element of the first 64-bit
    734 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
    735 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
    736 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
    737 ///
    738 /// \headerfile <x86intrin.h>
    739 ///
    740 /// This intrinsic corresponds to the <c> PMULLW </c> instruction.
    741 ///
    742 /// \param __m1
    743 ///    A 64-bit integer vector of [4 x i16].
    744 /// \param __m2
    745 ///    A 64-bit integer vector of [4 x i16].
    746 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
    747 ///    of the products of both parameters.
    748 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    749 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
    750 {
    751     return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
    752 }
    753 
    754 /// Left-shifts each 16-bit signed integer element of the first
    755 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
    756 ///    of bits specified by the second parameter, which is a 64-bit integer. The
    757 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
    758 ///    [4 x i16].
    759 ///
    760 /// \headerfile <x86intrin.h>
    761 ///
    762 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
    763 ///
    764 /// \param __m
    765 ///    A 64-bit integer vector of [4 x i16].
    766 /// \param __count
    767 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    768 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
    769 ///    values. If \a __count is greater or equal to 16, the result is set to all
    770 ///    0.
    771 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    772 _mm_sll_pi16(__m64 __m, __m64 __count)
    773 {
    774     return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
    775                                              (__v8hi)__anyext128(__count)));
    776 }
    777 
    778 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
    779 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
    780 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
    781 ///    of [4 x i16].
    782 ///
    783 /// \headerfile <x86intrin.h>
    784 ///
    785 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
    786 ///
    787 /// \param __m
    788 ///    A 64-bit integer vector of [4 x i16].
    789 /// \param __count
    790 ///    A 32-bit integer value.
    791 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
    792 ///    values. If \a __count is greater or equal to 16, the result is set to all
    793 ///    0.
    794 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    795 _mm_slli_pi16(__m64 __m, int __count)
    796 {
    797     return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
    798                                               __count));
    799 }
    800 
    801 /// Left-shifts each 32-bit signed integer element of the first
    802 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
    803 ///    of bits specified by the second parameter, which is a 64-bit integer. The
    804 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
    805 ///    [2 x i32].
    806 ///
    807 /// \headerfile <x86intrin.h>
    808 ///
    809 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
    810 ///
    811 /// \param __m
    812 ///    A 64-bit integer vector of [2 x i32].
    813 /// \param __count
    814 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    815 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
    816 ///    values. If \a __count is greater or equal to 32, the result is set to all
    817 ///    0.
    818 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    819 _mm_sll_pi32(__m64 __m, __m64 __count)
    820 {
    821     return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
    822                                              (__v4si)__anyext128(__count)));
    823 }
    824 
    825 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
    826 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
    827 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
    828 ///    of [2 x i32].
    829 ///
    830 /// \headerfile <x86intrin.h>
    831 ///
    832 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
    833 ///
    834 /// \param __m
    835 ///    A 64-bit integer vector of [2 x i32].
    836 /// \param __count
    837 ///    A 32-bit integer value.
    838 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
    839 ///    values. If \a __count is greater or equal to 32, the result is set to all
    840 ///    0.
    841 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    842 _mm_slli_pi32(__m64 __m, int __count)
    843 {
    844     return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
    845                                               __count));
    846 }
    847 
    848 /// Left-shifts the first 64-bit integer parameter by the number of bits
    849 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
    850 ///    result are returned.
    851 ///
    852 /// \headerfile <x86intrin.h>
    853 ///
    854 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
    855 ///
    856 /// \param __m
    857 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    858 /// \param __count
    859 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    860 /// \returns A 64-bit integer vector containing the left-shifted value. If
    861 ///     \a __count is greater or equal to 64, the result is set to 0.
    862 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    863 _mm_sll_si64(__m64 __m, __m64 __count)
    864 {
    865     return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
    866                                              (__v2di)__anyext128(__count)));
    867 }
    868 
    869 /// Left-shifts the first parameter, which is a 64-bit integer, by the
    870 ///    number of bits specified by the second parameter, which is a 32-bit
    871 ///    integer. The lower 64 bits of result are returned.
    872 ///
    873 /// \headerfile <x86intrin.h>
    874 ///
    875 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
    876 ///
    877 /// \param __m
    878 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    879 /// \param __count
    880 ///    A 32-bit integer value.
    881 /// \returns A 64-bit integer vector containing the left-shifted value. If
    882 ///     \a __count is greater or equal to 64, the result is set to 0.
    883 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    884 _mm_slli_si64(__m64 __m, int __count)
    885 {
    886     return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
    887                                               __count));
    888 }
    889 
    890 /// Right-shifts each 16-bit integer element of the first parameter,
    891 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
    892 ///    specified by the second parameter, which is a 64-bit integer.
    893 ///
    894 ///    High-order bits are filled with the sign bit of the initial value of each
    895 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
    896 ///    vector of [4 x i16].
    897 ///
    898 /// \headerfile <x86intrin.h>
    899 ///
    900 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
    901 ///
    902 /// \param __m
    903 ///    A 64-bit integer vector of [4 x i16].
    904 /// \param __count
    905 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    906 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
    907 ///    values.
    908 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    909 _mm_sra_pi16(__m64 __m, __m64 __count)
    910 {
    911     return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
    912                                              (__v8hi)__anyext128(__count)));
    913 }
    914 
    915 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
    916 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
    917 ///
    918 ///    High-order bits are filled with the sign bit of the initial value of each
    919 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
    920 ///    vector of [4 x i16].
    921 ///
    922 /// \headerfile <x86intrin.h>
    923 ///
    924 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
    925 ///
    926 /// \param __m
    927 ///    A 64-bit integer vector of [4 x i16].
    928 /// \param __count
    929 ///    A 32-bit integer value.
    930 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
    931 ///    values.
    932 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    933 _mm_srai_pi16(__m64 __m, int __count)
    934 {
    935     return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
    936                                               __count));
    937 }
    938 
    939 /// Right-shifts each 32-bit integer element of the first parameter,
    940 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
    941 ///    specified by the second parameter, which is a 64-bit integer.
    942 ///
    943 ///    High-order bits are filled with the sign bit of the initial value of each
    944 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
    945 ///    vector of [2 x i32].
    946 ///
    947 /// \headerfile <x86intrin.h>
    948 ///
    949 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
    950 ///
    951 /// \param __m
    952 ///    A 64-bit integer vector of [2 x i32].
    953 /// \param __count
    954 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
    955 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
    956 ///    values.
    957 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    958 _mm_sra_pi32(__m64 __m, __m64 __count)
    959 {
    960     return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
    961                                              (__v4si)__anyext128(__count)));
    962 }
    963 
    964 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
    965 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
    966 ///
    967 ///    High-order bits are filled with the sign bit of the initial value of each
    968 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
    969 ///    vector of [2 x i32].
    970 ///
    971 /// \headerfile <x86intrin.h>
    972 ///
    973 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
    974 ///
    975 /// \param __m
    976 ///    A 64-bit integer vector of [2 x i32].
    977 /// \param __count
    978 ///    A 32-bit integer value.
    979 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
    980 ///    values.
    981 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
    982 _mm_srai_pi32(__m64 __m, int __count)
    983 {
    984     return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
    985                                               __count));
    986 }
    987 
    988 /// Right-shifts each 16-bit integer element of the first parameter,
    989 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
    990 ///    specified by the second parameter, which is a 64-bit integer.
    991 ///
    992 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
    993 ///    integer vector of [4 x i16].
    994 ///
    995 /// \headerfile <x86intrin.h>
    996 ///
    997 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
    998 ///
    999 /// \param __m
   1000 ///    A 64-bit integer vector of [4 x i16].
   1001 /// \param __count
   1002 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1003 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
   1004 ///    values.
   1005 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1006 _mm_srl_pi16(__m64 __m, __m64 __count)
   1007 {
   1008     return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
   1009                                              (__v8hi)__anyext128(__count)));
   1010 }
   1011 
   1012 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
   1013 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
   1014 ///
   1015 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
   1016 ///    integer vector of [4 x i16].
   1017 ///
   1018 /// \headerfile <x86intrin.h>
   1019 ///
   1020 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
   1021 ///
   1022 /// \param __m
   1023 ///    A 64-bit integer vector of [4 x i16].
   1024 /// \param __count
   1025 ///    A 32-bit integer value.
   1026 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
   1027 ///    values.
   1028 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1029 _mm_srli_pi16(__m64 __m, int __count)
   1030 {
   1031     return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
   1032                                               __count));
   1033 }
   1034 
   1035 /// Right-shifts each 32-bit integer element of the first parameter,
   1036 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
   1037 ///    specified by the second parameter, which is a 64-bit integer.
   1038 ///
   1039 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
   1040 ///    integer vector of [2 x i32].
   1041 ///
   1042 /// \headerfile <x86intrin.h>
   1043 ///
   1044 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
   1045 ///
   1046 /// \param __m
   1047 ///    A 64-bit integer vector of [2 x i32].
   1048 /// \param __count
   1049 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1050 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
   1051 ///    values.
   1052 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1053 _mm_srl_pi32(__m64 __m, __m64 __count)
   1054 {
   1055     return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
   1056                                              (__v4si)__anyext128(__count)));
   1057 }
   1058 
   1059 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
   1060 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
   1061 ///
   1062 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
   1063 ///    integer vector of [2 x i32].
   1064 ///
   1065 /// \headerfile <x86intrin.h>
   1066 ///
   1067 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
   1068 ///
   1069 /// \param __m
   1070 ///    A 64-bit integer vector of [2 x i32].
   1071 /// \param __count
   1072 ///    A 32-bit integer value.
   1073 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
   1074 ///    values.
   1075 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1076 _mm_srli_pi32(__m64 __m, int __count)
   1077 {
   1078     return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
   1079                                               __count));
   1080 }
   1081 
   1082 /// Right-shifts the first 64-bit integer parameter by the number of bits
   1083 ///    specified by the second 64-bit integer parameter.
   1084 ///
   1085 ///    High-order bits are cleared.
   1086 ///
   1087 /// \headerfile <x86intrin.h>
   1088 ///
   1089 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
   1090 ///
   1091 /// \param __m
   1092 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1093 /// \param __count
   1094 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1095 /// \returns A 64-bit integer vector containing the right-shifted value.
   1096 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1097 _mm_srl_si64(__m64 __m, __m64 __count)
   1098 {
   1099     return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
   1100                                              (__v2di)__anyext128(__count)));
   1101 }
   1102 
   1103 /// Right-shifts the first parameter, which is a 64-bit integer, by the
   1104 ///    number of bits specified by the second parameter, which is a 32-bit
   1105 ///    integer.
   1106 ///
   1107 ///    High-order bits are cleared.
   1108 ///
   1109 /// \headerfile <x86intrin.h>
   1110 ///
   1111 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
   1112 ///
   1113 /// \param __m
   1114 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
   1115 /// \param __count
   1116 ///    A 32-bit integer value.
   1117 /// \returns A 64-bit integer vector containing the right-shifted value.
   1118 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1119 _mm_srli_si64(__m64 __m, int __count)
   1120 {
   1121     return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
   1122                                               __count));
   1123 }
   1124 
   1125 /// Performs a bitwise AND of two 64-bit integer vectors.
   1126 ///
   1127 /// \headerfile <x86intrin.h>
   1128 ///
   1129 /// This intrinsic corresponds to the <c> PAND </c> instruction.
   1130 ///
   1131 /// \param __m1
   1132 ///    A 64-bit integer vector.
   1133 /// \param __m2
   1134 ///    A 64-bit integer vector.
   1135 /// \returns A 64-bit integer vector containing the bitwise AND of both
   1136 ///    parameters.
   1137 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1138 _mm_and_si64(__m64 __m1, __m64 __m2)
   1139 {
   1140     return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
   1141 }
   1142 
   1143 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
   1144 ///    performs a bitwise AND of the intermediate result and the second 64-bit
   1145 ///    integer vector.
   1146 ///
   1147 /// \headerfile <x86intrin.h>
   1148 ///
   1149 /// This intrinsic corresponds to the <c> PANDN </c> instruction.
   1150 ///
   1151 /// \param __m1
   1152 ///    A 64-bit integer vector. The one's complement of this parameter is used
   1153 ///    in the bitwise AND.
   1154 /// \param __m2
   1155 ///    A 64-bit integer vector.
   1156 /// \returns A 64-bit integer vector containing the bitwise AND of the second
   1157 ///    parameter and the one's complement of the first parameter.
   1158 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1159 _mm_andnot_si64(__m64 __m1, __m64 __m2)
   1160 {
   1161     return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
   1162 }
   1163 
   1164 /// Performs a bitwise OR of two 64-bit integer vectors.
   1165 ///
   1166 /// \headerfile <x86intrin.h>
   1167 ///
   1168 /// This intrinsic corresponds to the <c> POR </c> instruction.
   1169 ///
   1170 /// \param __m1
   1171 ///    A 64-bit integer vector.
   1172 /// \param __m2
   1173 ///    A 64-bit integer vector.
   1174 /// \returns A 64-bit integer vector containing the bitwise OR of both
   1175 ///    parameters.
   1176 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1177 _mm_or_si64(__m64 __m1, __m64 __m2)
   1178 {
   1179     return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
   1180 }
   1181 
   1182 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
   1183 ///
   1184 /// \headerfile <x86intrin.h>
   1185 ///
   1186 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
   1187 ///
   1188 /// \param __m1
   1189 ///    A 64-bit integer vector.
   1190 /// \param __m2
   1191 ///    A 64-bit integer vector.
   1192 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
   1193 ///    parameters.
   1194 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1195 _mm_xor_si64(__m64 __m1, __m64 __m2)
   1196 {
   1197     return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
   1198 }
   1199 
   1200 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
   1201 ///    [8 x i8] to determine if the element of the first vector is equal to the
   1202 ///    corresponding element of the second vector.
   1203 ///
   1204 ///    Each comparison returns 0 for false, 0xFF for true.
   1205 ///
   1206 /// \headerfile <x86intrin.h>
   1207 ///
   1208 /// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
   1209 ///
   1210 /// \param __m1
   1211 ///    A 64-bit integer vector of [8 x i8].
   1212 /// \param __m2
   1213 ///    A 64-bit integer vector of [8 x i8].
   1214 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
   1215 ///    results.
   1216 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1217 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
   1218 {
   1219     return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
   1220 }
   1221 
   1222 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
   1223 ///    [4 x i16] to determine if the element of the first vector is equal to the
   1224 ///    corresponding element of the second vector.
   1225 ///
   1226 ///    Each comparison returns 0 for false, 0xFFFF for true.
   1227 ///
   1228 /// \headerfile <x86intrin.h>
   1229 ///
   1230 /// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
   1231 ///
   1232 /// \param __m1
   1233 ///    A 64-bit integer vector of [4 x i16].
   1234 /// \param __m2
   1235 ///    A 64-bit integer vector of [4 x i16].
   1236 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
   1237 ///    results.
   1238 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1239 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
   1240 {
   1241     return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
   1242 }
   1243 
   1244 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
   1245 ///    [2 x i32] to determine if the element of the first vector is equal to the
   1246 ///    corresponding element of the second vector.
   1247 ///
   1248 ///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
   1249 ///
   1250 /// \headerfile <x86intrin.h>
   1251 ///
   1252 /// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
   1253 ///
   1254 /// \param __m1
   1255 ///    A 64-bit integer vector of [2 x i32].
   1256 /// \param __m2
   1257 ///    A 64-bit integer vector of [2 x i32].
   1258 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
   1259 ///    results.
   1260 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1261 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
   1262 {
   1263     return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
   1264 }
   1265 
   1266 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
   1267 ///    [8 x i8] to determine if the element of the first vector is greater than
   1268 ///    the corresponding element of the second vector.
   1269 ///
   1270 ///    Each comparison returns 0 for false, 0xFF for true.
   1271 ///
   1272 /// \headerfile <x86intrin.h>
   1273 ///
   1274 /// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
   1275 ///
   1276 /// \param __m1
   1277 ///    A 64-bit integer vector of [8 x i8].
   1278 /// \param __m2
   1279 ///    A 64-bit integer vector of [8 x i8].
   1280 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
   1281 ///    results.
   1282 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1283 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
   1284 {
   1285   /* This function always performs a signed comparison, but __v8qi is a char
   1286      which may be signed or unsigned, so use __v8qs. */
   1287     return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
   1288 }
   1289 
   1290 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
   1291 ///    [4 x i16] to determine if the element of the first vector is greater than
   1292 ///    the corresponding element of the second vector.
   1293 ///
   1294 ///    Each comparison returns 0 for false, 0xFFFF for true.
   1295 ///
   1296 /// \headerfile <x86intrin.h>
   1297 ///
   1298 /// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
   1299 ///
   1300 /// \param __m1
   1301 ///    A 64-bit integer vector of [4 x i16].
   1302 /// \param __m2
   1303 ///    A 64-bit integer vector of [4 x i16].
   1304 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
   1305 ///    results.
   1306 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1307 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
   1308 {
   1309     return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
   1310 }
   1311 
   1312 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
   1313 ///    [2 x i32] to determine if the element of the first vector is greater than
   1314 ///    the corresponding element of the second vector.
   1315 ///
   1316 ///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
   1317 ///
   1318 /// \headerfile <x86intrin.h>
   1319 ///
   1320 /// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
   1321 ///
   1322 /// \param __m1
   1323 ///    A 64-bit integer vector of [2 x i32].
   1324 /// \param __m2
   1325 ///    A 64-bit integer vector of [2 x i32].
   1326 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
   1327 ///    results.
   1328 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1329 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
   1330 {
   1331     return (__m64)((__v2si)__m1 > (__v2si)__m2);
   1332 }
   1333 
   1334 /// Constructs a 64-bit integer vector initialized to zero.
   1335 ///
   1336 /// \headerfile <x86intrin.h>
   1337 ///
   1338 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
   1339 ///
   1340 /// \returns An initialized 64-bit integer vector with all elements set to zero.
   1341 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1342 _mm_setzero_si64(void) {
   1343   return __extension__(__m64){0LL};
   1344 }
   1345 
   1346 /// Constructs a 64-bit integer vector initialized with the specified
   1347 ///    32-bit integer values.
   1348 ///
   1349 /// \headerfile <x86intrin.h>
   1350 ///
   1351 /// This intrinsic is a utility function and does not correspond to a specific
   1352 ///    instruction.
   1353 ///
   1354 /// \param __i1
   1355 ///    A 32-bit integer value used to initialize the upper 32 bits of the
   1356 ///    result.
   1357 /// \param __i0
   1358 ///    A 32-bit integer value used to initialize the lower 32 bits of the
   1359 ///    result.
   1360 /// \returns An initialized 64-bit integer vector.
   1361 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1362 _mm_set_pi32(int __i1, int __i0) {
   1363   return __extension__(__m64)(__v2si){__i0, __i1};
   1364 }
   1365 
   1366 /// Constructs a 64-bit integer vector initialized with the specified
   1367 ///    16-bit integer values.
   1368 ///
   1369 /// \headerfile <x86intrin.h>
   1370 ///
   1371 /// This intrinsic is a utility function and does not correspond to a specific
   1372 ///    instruction.
   1373 ///
   1374 /// \param __s3
   1375 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
   1376 /// \param __s2
   1377 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
   1378 /// \param __s1
   1379 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
   1380 /// \param __s0
   1381 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
   1382 /// \returns An initialized 64-bit integer vector.
   1383 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1384 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
   1385   return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
   1386 }
   1387 
   1388 /// Constructs a 64-bit integer vector of [8 x i8] from the eight specified
   1389 ///    8-bit integer values, listed from the highest element to the lowest.
   1390 ///
   1391 /// \headerfile <x86intrin.h>
   1392 ///
   1393 /// This intrinsic is a utility function and does not correspond to a specific
   1394 ///    instruction.
   1395 ///
   1396 /// \param __b7
   1397 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
   1398 /// \param __b6
   1399 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
   1400 /// \param __b5
   1401 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
   1402 /// \param __b4
   1403 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
   1404 /// \param __b3
   1405 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
   1406 /// \param __b2
   1407 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
   1408 /// \param __b1
   1409 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
   1410 /// \param __b0
   1411 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
   1412 /// \returns An initialized 64-bit integer vector of [8 x i8].
   1413 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1414 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
   1415             char __b1, char __b0) {
   1416   return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
   1417                                       __b4, __b5, __b6, __b7};
   1418 }
   1419 
   1420 /// Builds a 64-bit integer vector of [2 x i32] in which both 32-bit
   1421 ///    elements hold the same specified 32-bit integer value.
   1422 ///
   1423 /// \headerfile <x86intrin.h>
   1424 ///
   1425 /// This intrinsic is a utility function and does not correspond to a specific
   1426 ///    instruction.
   1427 ///
   1428 /// \param __i
   1429 ///    A 32-bit integer value that is copied into every vector element of
   1430 ///    the result.
   1431 /// \returns An initialized 64-bit integer vector of [2 x i32] whose elements
   1432 ///    are all equal to the given value.
   1433 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1434 _mm_set1_pi32(int __i) {
   1435   return __extension__(__m64)(__v2si){__i, __i};
   1436 }
   1437 
   1438 /// Builds a 64-bit integer vector of [4 x i16] in which all four 16-bit
   1439 ///    elements hold the same specified 16-bit integer value.
   1440 ///
   1441 /// \headerfile <x86intrin.h>
   1442 ///
   1443 /// This intrinsic is a utility function and does not correspond to a specific
   1444 ///    instruction.
   1445 ///
   1446 /// \param __w
   1447 ///    A 16-bit integer value that is copied into every vector element of
   1448 ///    the result.
   1449 /// \returns An initialized 64-bit integer vector of [4 x i16] whose elements
   1450 ///    are all equal to the given value.
   1451 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1452 _mm_set1_pi16(short __w) {
   1453   return __extension__(__m64)(__v4hi){__w, __w, __w, __w};
   1454 }
   1455 
   1456 /// Builds a 64-bit integer vector of [8 x i8] in which all eight 8-bit
   1457 ///    elements hold the same specified 8-bit integer value.
   1458 ///
   1459 /// \headerfile <x86intrin.h>
   1460 ///
   1461 /// This intrinsic is a utility function and does not correspond to a specific
   1462 ///    instruction.
   1463 ///
   1464 /// \param __b
   1465 ///    An 8-bit integer value that is copied into every vector element of
   1466 ///    the result.
   1467 /// \returns An initialized 64-bit integer vector of [8 x i8].
   1468 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1469 _mm_set1_pi8(char __b) {
   1470   return __extension__(__m64)(__v8qi){__b, __b, __b, __b, __b, __b, __b, __b};
   1471 }
   1472 
   1473 /// Builds a 64-bit integer vector of [2 x i32] from two 32-bit integer
   1474 ///    values given in element order, lowest element first.
   1475 ///
   1476 /// \headerfile <x86intrin.h>
   1477 ///
   1478 /// This intrinsic is a utility function and does not correspond to a specific
   1479 ///    instruction.
   1480 ///
   1481 /// \param __i0
   1482 ///    A 32-bit integer value stored in the lower 32 bits (bits [31:0]) of
   1483 ///    the result.
   1484 /// \param __i1
   1485 ///    A 32-bit integer value stored in the upper 32 bits (bits [63:32]) of
   1486 ///    the result.
   1487 /// \returns An initialized 64-bit integer vector of [2 x i32].
   1488 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1489 _mm_setr_pi32(int __i0, int __i1) {
   1490   return __extension__(__m64)(__v2si){__i0, __i1};
   1491 }
   1492 
   1493 /// Builds a 64-bit integer vector of [4 x i16] from four 16-bit integer
   1494 ///    values given in element order, lowest element first.
   1495 ///
   1496 /// \headerfile <x86intrin.h>
   1497 ///
   1498 /// This intrinsic is a utility function and does not correspond to a specific
   1499 ///    instruction.
   1500 ///
   1501 /// \param __w0
   1502 ///    A 16-bit integer value stored in bits [15:0] of the result.
   1503 /// \param __w1
   1504 ///    A 16-bit integer value stored in bits [31:16] of the result.
   1505 /// \param __w2
   1506 ///    A 16-bit integer value stored in bits [47:32] of the result.
   1507 /// \param __w3
   1508 ///    A 16-bit integer value stored in bits [63:48] of the result.
   1509 /// \returns An initialized 64-bit integer vector of [4 x i16].
   1510 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1511 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
   1512   return __extension__(__m64)(__v4hi){__w0, __w1, __w2, __w3};
   1513 }
   1514 
   1515 /// Builds a 64-bit integer vector of [8 x i8] from eight 8-bit integer
   1516 ///    values given in element order, lowest element first.
   1517 ///
   1518 /// \headerfile <x86intrin.h>
   1519 ///
   1520 /// This intrinsic is a utility function; it maps to no specific instruction.
   1521 ///
   1522 /// \param __b0
   1523 ///    An 8-bit integer value stored in bits [7:0] of the result.
   1524 /// \param __b1
   1525 ///    An 8-bit integer value stored in bits [15:8] of the result.
   1526 /// \param __b2
   1527 ///    An 8-bit integer value stored in bits [23:16] of the result.
   1528 /// \param __b3
   1529 ///    An 8-bit integer value stored in bits [31:24] of the result.
   1530 /// \param __b4
   1531 ///    An 8-bit integer value stored in bits [39:32] of the result.
   1532 /// \param __b5
   1533 ///    An 8-bit integer value stored in bits [47:40] of the result.
   1534 /// \param __b6
   1535 ///    An 8-bit integer value stored in bits [55:48] of the result.
   1536 /// \param __b7
   1537 ///    An 8-bit integer value stored in bits [63:56] of the result.
   1538 /// \returns An initialized 64-bit integer vector of [8 x i8].
   1539 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   1540 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
   1541              char __b6, char __b7) {
   1542   return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
   1543                                       __b4, __b5, __b6, __b7};
   1544 }
   1545 
   1546 #undef __anyext128
   1547 #undef __trunc64
   1548 #undef __DEFAULT_FN_ATTRS_SSE2 /* NOTE(review): _CONSTEXPR form not undefined */
   1549 
   1550 /* Compatibility aliases: each _m_* name expands to an _mm_* intrinsic name. */
   1551 #define _m_empty _mm_empty
   1552 #define _m_from_int _mm_cvtsi32_si64
   1553 #define _m_from_int64 _mm_cvtsi64_m64
   1554 #define _m_to_int _mm_cvtsi64_si32
   1555 #define _m_to_int64 _mm_cvtm64_si64
   1556 #define _m_packsswb _mm_packs_pi16
   1557 #define _m_packssdw _mm_packs_pi32
   1558 #define _m_packuswb _mm_packs_pu16
   1559 #define _m_punpckhbw _mm_unpackhi_pi8
   1560 #define _m_punpckhwd _mm_unpackhi_pi16
   1561 #define _m_punpckhdq _mm_unpackhi_pi32
   1562 #define _m_punpcklbw _mm_unpacklo_pi8
   1563 #define _m_punpcklwd _mm_unpacklo_pi16
   1564 #define _m_punpckldq _mm_unpacklo_pi32
   1565 #define _m_paddb _mm_add_pi8
   1566 #define _m_paddw _mm_add_pi16
   1567 #define _m_paddd _mm_add_pi32
   1568 #define _m_paddsb _mm_adds_pi8
   1569 #define _m_paddsw _mm_adds_pi16
   1570 #define _m_paddusb _mm_adds_pu8
   1571 #define _m_paddusw _mm_adds_pu16
   1572 #define _m_psubb _mm_sub_pi8
   1573 #define _m_psubw _mm_sub_pi16
   1574 #define _m_psubd _mm_sub_pi32
   1575 #define _m_psubsb _mm_subs_pi8
   1576 #define _m_psubsw _mm_subs_pi16
   1577 #define _m_psubusb _mm_subs_pu8
   1578 #define _m_psubusw _mm_subs_pu16
   1579 #define _m_pmaddwd _mm_madd_pi16
   1580 #define _m_pmulhw _mm_mulhi_pi16
   1581 #define _m_pmullw _mm_mullo_pi16
   1582 #define _m_psllw _mm_sll_pi16
   1583 #define _m_psllwi _mm_slli_pi16
   1584 #define _m_pslld _mm_sll_pi32
   1585 #define _m_pslldi _mm_slli_pi32
   1586 #define _m_psllq _mm_sll_si64
   1587 #define _m_psllqi _mm_slli_si64
   1588 #define _m_psraw _mm_sra_pi16
   1589 #define _m_psrawi _mm_srai_pi16
   1590 #define _m_psrad _mm_sra_pi32
   1591 #define _m_psradi _mm_srai_pi32
   1592 #define _m_psrlw _mm_srl_pi16
   1593 #define _m_psrlwi _mm_srli_pi16
   1594 #define _m_psrld _mm_srl_pi32
   1595 #define _m_psrldi _mm_srli_pi32
   1596 #define _m_psrlq _mm_srl_si64
   1597 #define _m_psrlqi _mm_srli_si64
   1598 #define _m_pand _mm_and_si64
   1599 #define _m_pandn _mm_andnot_si64
   1600 #define _m_por _mm_or_si64
   1601 #define _m_pxor _mm_xor_si64
   1602 #define _m_pcmpeqb _mm_cmpeq_pi8
   1603 #define _m_pcmpeqw _mm_cmpeq_pi16
   1604 #define _m_pcmpeqd _mm_cmpeq_pi32
   1605 #define _m_pcmpgtb _mm_cmpgt_pi8
   1606 #define _m_pcmpgtw _mm_cmpgt_pi16
   1607 #define _m_pcmpgtd _mm_cmpgt_pi32
   1608 
   1609 #endif /* __MMINTRIN_H */
   1610