zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

xmmintrin.h (120252B) - Raw


      1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __XMMINTRIN_H
     11 #define __XMMINTRIN_H
     12 
     13 #if !defined(__i386__) && !defined(__x86_64__)
     14 #error "This header is only meant to be used on x86 and x64 architecture"
     15 #endif
     16 
     17 #include <mmintrin.h>
     18 
     19 typedef int __v4si __attribute__((__vector_size__(16))); /* [4 x int] */
     20 typedef float __v4sf __attribute__((__vector_size__(16))); /* [4 x float] */
     21 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
     22 /* Same 16-byte float vector as __m128, but declared with 1-byte alignment. */
     23 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
     24 
     25 /* Unsigned types: [4 x unsigned int], used below for the bitwise operations. */
     26 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
     27 
     28 /* <mm_malloc.h> should only be included in a hosted environment as it depends
     29  * on a standard library to provide allocation routines. */
     30 #if __STDC_HOSTED__
     31 #include <mm_malloc.h>
     32 #endif
     33 
     34 /* Define the default attributes for the functions in this file (SSE/SSE2). */
     35 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
     36 #define __DEFAULT_FN_ATTRS                                                     \
     37   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
     38                  __min_vector_width__(128)))
     39 #define __DEFAULT_FN_ATTRS_SSE2                                                \
     40   __attribute__((__always_inline__, __nodebug__,                               \
     41                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
     42 #else /* same attributes, but the target string omits "no-evex512" */
     43 #define __DEFAULT_FN_ATTRS                                                     \
     44   __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
     45                  __min_vector_width__(128)))
     46 #define __DEFAULT_FN_ATTRS_SSE2                                                \
     47   __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
     48                  __min_vector_width__(128)))
     49 #endif /* defined(__EVEX512__) && !defined(__AVX10_1_512__) */
     50 
     51 #if defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 or later */
     52 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
     53 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
     54 #else /* C, or C++ before C++11: the same attributes without constexpr */
     55 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
     56 #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
     57 #endif /* defined(__cplusplus) && (__cplusplus >= 201103L) */
     58 
     59 #define __trunc64(x) /* low 64 bits of x, returned as __m64 */                 \
     60   ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
     61 #define __zext128(x) /* x in the low 64 bits, upper 64 bits zero */            \
     62   ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
     63                                     1, 2, 3))
     64 #define __anyext128(x) /* x in the low 64 bits, upper 64 bits unspecified */   \
     65   ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
     66                                     1, -1, -1))
     67 #define __zeroupper64(x) /* keep low 64 bits of x, zero the upper 64 */        \
     68   ((__m128i)__builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
     69                                     1, 4, 5))
     70 
     71 /// Adds the 32-bit float values in the low-order bits of the operands.
     72 ///
     73 /// \headerfile <x86intrin.h>
     74 ///
     75 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
     76 ///
     77 /// \param __a
     78 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     79 ///    The lower 32 bits of this operand are used in the calculation.
     80 /// \param __b
     81 ///    A 128-bit vector of [4 x float] containing one of the source operands.
     82 ///    The lower 32 bits of this operand are used in the calculation.
     83 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
     84 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
     85 ///    the upper 96 bits of the first source operand.
     86 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
     87 _mm_add_ss(__m128 __a, __m128 __b) {
     88   __a[0] += __b[0]; /* only lane 0 is updated, in the by-value copy of __a */
     89   return __a;       /* lanes 1..3 are returned exactly as they were passed */
     90 }
     91 
     92 /// Adds two 128-bit vectors of [4 x float], and returns the results of
     93 ///    the addition.
     94 ///
     95 /// \headerfile <x86intrin.h>
     96 ///
     97 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
     98 ///
     99 /// \param __a
    100 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    101 /// \param __b
    102 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    103 /// \returns A 128-bit vector of [4 x float] containing the sums of both
    104 ///    operands.
    105 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    106 _mm_add_ps(__m128 __a, __m128 __b) {
    107   return (__m128)((__v4sf)__a + (__v4sf)__b); /* lane-wise __a + __b */
    108 }
    109 
    110 /// Subtracts the 32-bit float value in the low-order bits of the second
    111 ///    operand from the corresponding value in the first operand.
    112 ///
    113 /// \headerfile <x86intrin.h>
    114 ///
    115 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
    116 ///
    117 /// \param __a
    118 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
    119 ///    of this operand are used in the calculation.
    120 /// \param __b
    121 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
    122 ///    bits of this operand are used in the calculation.
    123 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    124 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
    125 ///    copied from the upper 96 bits of the first source operand.
    126 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    127 _mm_sub_ss(__m128 __a, __m128 __b) {
    128   __a[0] -= __b[0]; /* lane 0 becomes __a[0] - __b[0]; __a is a by-value copy */
    129   return __a;       /* lanes 1..3 are returned exactly as they were passed */
    130 }
    131 
    132 /// Subtracts each of the values of the second operand from the first
    133 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
    134 ///    the results of the subtraction.
    135 ///
    136 /// \headerfile <x86intrin.h>
    137 ///
    138 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
    139 ///
    140 /// \param __a
    141 ///    A 128-bit vector of [4 x float] containing the minuend.
    142 /// \param __b
    143 ///    A 128-bit vector of [4 x float] containing the subtrahend.
    144 /// \returns A 128-bit vector of [4 x float] containing the differences between
    145 ///    both operands.
    146 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    147 _mm_sub_ps(__m128 __a, __m128 __b) {
    148   return (__m128)((__v4sf)__a - (__v4sf)__b); /* lane-wise __a - __b */
    149 }
    150 
    151 /// Multiplies two 32-bit float values in the low-order bits of the
    152 ///    operands.
    153 ///
    154 /// \headerfile <x86intrin.h>
    155 ///
    156 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
    157 ///
    158 /// \param __a
    159 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    160 ///    The lower 32 bits of this operand are used in the calculation.
    161 /// \param __b
    162 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    163 ///    The lower 32 bits of this operand are used in the calculation.
    164 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
    165 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
    166 ///    bits of the first source operand.
    167 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    168 _mm_mul_ss(__m128 __a, __m128 __b) {
    169   __a[0] *= __b[0]; /* only lane 0 is updated, in the by-value copy of __a */
    170   return __a;       /* lanes 1..3 are returned exactly as they were passed */
    171 }
    172 
    173 /// Multiplies two 128-bit vectors of [4 x float] and returns the
    174 ///    results of the multiplication.
    175 ///
    176 /// \headerfile <x86intrin.h>
    177 ///
    178 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
    179 ///
    180 /// \param __a
    181 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    182 /// \param __b
    183 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    184 /// \returns A 128-bit vector of [4 x float] containing the products of both
    185 ///    operands.
    186 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    187 _mm_mul_ps(__m128 __a, __m128 __b) {
    188   return (__m128)((__v4sf)__a * (__v4sf)__b); /* lane-wise __a * __b */
    189 }
    190 
    191 /// Divides the value in the low-order 32 bits of the first operand by
    192 ///    the corresponding value in the second operand.
    193 ///
    194 /// \headerfile <x86intrin.h>
    195 ///
    196 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
    197 ///
    198 /// \param __a
    199 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
    200 ///    bits of this operand are used in the calculation.
    201 /// \param __b
    202 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
    203 ///    of this operand are used in the calculation.
    204 /// \returns A 128-bit vector of [4 x float] containing the quotient of the
    205 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
    206 ///    upper 96 bits of the first source operand.
    207 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    208 _mm_div_ss(__m128 __a, __m128 __b) {
    209   __a[0] /= __b[0]; /* lane 0 becomes __a[0] / __b[0]; __a is a by-value copy */
    210   return __a;       /* lanes 1..3 are returned exactly as they were passed */
    211 }
    212 
    213 /// Divides two 128-bit vectors of [4 x float].
    214 ///
    215 /// \headerfile <x86intrin.h>
    216 ///
    217 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
    218 ///
    219 /// \param __a
    220 ///    A 128-bit vector of [4 x float] containing the dividend.
    221 /// \param __b
    222 ///    A 128-bit vector of [4 x float] containing the divisor.
    223 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
    224 ///    operands.
    225 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    226 _mm_div_ps(__m128 __a, __m128 __b) {
    227   return (__m128)((__v4sf)__a / (__v4sf)__b); /* lane-wise __a / __b */
    228 }
    229 
    230 /// Calculates the square root of the value stored in the low-order bits
    231 ///    of a 128-bit vector of [4 x float].
    232 ///
    233 /// \headerfile <x86intrin.h>
    234 ///
    235 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
    236 ///
    237 /// \param __a
    238 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    239 ///    used in the calculation.
    240 /// \returns A 128-bit vector of [4 x float] containing the square root of the
    241 ///    value in the low-order bits of the operand.
    242 static __inline__ __m128 __DEFAULT_FN_ATTRS
    243 _mm_sqrt_ss(__m128 __a)
    244 { /* Thin wrapper: view __a as __v4sf, call the builtin, cast back to __m128. */
    245   return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
    246 }
    247 
    248 /// Calculates the square roots of the values stored in a 128-bit vector
    249 ///    of [4 x float].
    250 ///
    251 /// \headerfile <x86intrin.h>
    252 ///
    253 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
    254 ///
    255 /// \param __a
    256 ///    A 128-bit vector of [4 x float].
    257 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
    258 ///    values in the operand.
    259 static __inline__ __m128 __DEFAULT_FN_ATTRS
    260 _mm_sqrt_ps(__m128 __a)
    261 { /* Thin wrapper; the builtin's result is returned without an explicit cast. */
    262   return __builtin_ia32_sqrtps((__v4sf)__a);
    263 }
    264 
    265 /// Calculates the approximate reciprocal of the value stored in the
    266 ///    low-order bits of a 128-bit vector of [4 x float].
    267 ///
    268 /// \headerfile <x86intrin.h>
    269 ///
    270 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
    271 ///
    272 /// \param __a
    273 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    274 ///    used in the calculation.
    275 /// \returns A 128-bit vector of [4 x float] containing the approximate
    276 ///    reciprocal of the value in the low-order bits of the operand.
    277 static __inline__ __m128 __DEFAULT_FN_ATTRS
    278 _mm_rcp_ss(__m128 __a)
    279 { /* Thin wrapper: view __a as __v4sf, call the builtin, cast back to __m128. */
    280   return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
    281 }
    282 
    283 /// Calculates the approximate reciprocals of the values stored in a
    284 ///    128-bit vector of [4 x float].
    285 ///
    286 /// \headerfile <x86intrin.h>
    287 ///
    288 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
    289 ///
    290 /// \param __a
    291 ///    A 128-bit vector of [4 x float].
    292 /// \returns A 128-bit vector of [4 x float] containing the approximate
    293 ///    reciprocals of the values in the operand.
    294 static __inline__ __m128 __DEFAULT_FN_ATTRS
    295 _mm_rcp_ps(__m128 __a)
    296 { /* Thin wrapper: view __a as __v4sf, call the builtin, cast back to __m128. */
    297   return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
    298 }
    299 
    300 /// Calculates the approximate reciprocal of the square root of the value
    301 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
    302 ///
    303 /// \headerfile <x86intrin.h>
    304 ///
    305 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
    306 ///
    307 /// \param __a
    308 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    309 ///    used in the calculation.
    310 /// \returns A 128-bit vector of [4 x float] containing the approximate
    311 ///    reciprocal of the square root of the value in the low-order bits of the
    312 ///    operand.
    313 static __inline__ __m128 __DEFAULT_FN_ATTRS
    314 _mm_rsqrt_ss(__m128 __a)
    315 { /* Thin wrapper; the builtin's result is returned without an explicit cast. */
    316   return __builtin_ia32_rsqrtss((__v4sf)__a);
    317 }
    318 
    319 /// Calculates the approximate reciprocals of the square roots of the
    320 ///    values stored in a 128-bit vector of [4 x float].
    321 ///
    322 /// \headerfile <x86intrin.h>
    323 ///
    324 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
    325 ///
    326 /// \param __a
    327 ///    A 128-bit vector of [4 x float].
    328 /// \returns A 128-bit vector of [4 x float] containing the approximate
    329 ///    reciprocals of the square roots of the values in the operand.
    330 static __inline__ __m128 __DEFAULT_FN_ATTRS
    331 _mm_rsqrt_ps(__m128 __a)
    332 { /* Thin wrapper; the builtin's result is returned without an explicit cast. */
    333   return __builtin_ia32_rsqrtps((__v4sf)__a);
    334 }
    335 
    336 /// Compares two 32-bit float values in the low-order bits of both
    337 ///    operands and returns the lesser value in the low-order bits of the
    338 ///    vector of [4 x float].
    339 ///
    340 ///    If either value in a comparison is NaN, returns the value from \a __b.
    341 ///
    342 /// \headerfile <x86intrin.h>
    343 ///
    344 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
    345 ///
    346 /// \param __a
    347 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    348 ///    32 bits of this operand are used in the comparison.
    349 /// \param __b
    350 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    351 ///    32 bits of this operand are used in the comparison.
    352 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    353 ///    minimum value between both operands. The upper 96 bits are copied from
    354 ///    the upper 96 bits of the first source operand.
    355 static __inline__ __m128 __DEFAULT_FN_ATTRS
    356 _mm_min_ss(__m128 __a, __m128 __b)
    357 { /* Operands reach the builtin in (__a, __b) order; do not swap them. */
    358   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
    359 }
    360 
    361 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
    362 ///    of each pair of values.
    363 ///
    364 ///    If either value in a comparison is NaN, returns the value from \a __b.
    365 ///
    366 /// \headerfile <x86intrin.h>
    367 ///
    368 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
    369 ///
    370 /// \param __a
    371 ///    A 128-bit vector of [4 x float] containing one of the operands.
    372 /// \param __b
    373 ///    A 128-bit vector of [4 x float] containing one of the operands.
    374 /// \returns A 128-bit vector of [4 x float] containing the minimum values
    375 ///    between both operands.
    376 static __inline__ __m128 __DEFAULT_FN_ATTRS
    377 _mm_min_ps(__m128 __a, __m128 __b)
    378 { /* Operands reach the builtin in (__a, __b) order; do not swap them. */
    379   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
    380 }
    381 
    382 /// Compares two 32-bit float values in the low-order bits of both
    383 ///    operands and returns the greater value in the low-order bits of a 128-bit
    384 ///    vector of [4 x float].
    385 ///
    386 ///    If either value in a comparison is NaN, returns the value from \a __b.
    387 ///
    388 /// \headerfile <x86intrin.h>
    389 ///
    390 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
    391 ///
    392 /// \param __a
    393 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    394 ///    32 bits of this operand are used in the comparison.
    395 /// \param __b
    396 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    397 ///    32 bits of this operand are used in the comparison.
    398 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    399 ///    maximum value between both operands. The upper 96 bits are copied from
    400 ///    the upper 96 bits of the first source operand.
    401 static __inline__ __m128 __DEFAULT_FN_ATTRS
    402 _mm_max_ss(__m128 __a, __m128 __b)
    403 { /* Operands reach the builtin in (__a, __b) order; do not swap them. */
    404   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
    405 }
    406 
    407 /// Compares two 128-bit vectors of [4 x float] and returns the greater
    408 ///    of each pair of values.
    409 ///
    410 ///    If either value in a comparison is NaN, returns the value from \a __b.
    411 ///
    412 /// \headerfile <x86intrin.h>
    413 ///
    414 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
    415 ///
    416 /// \param __a
    417 ///    A 128-bit vector of [4 x float] containing one of the operands.
    418 /// \param __b
    419 ///    A 128-bit vector of [4 x float] containing one of the operands.
    420 /// \returns A 128-bit vector of [4 x float] containing the maximum values
    421 ///    between both operands.
    422 static __inline__ __m128 __DEFAULT_FN_ATTRS
    423 _mm_max_ps(__m128 __a, __m128 __b)
    424 { /* Operands reach the builtin in (__a, __b) order; do not swap them. */
    425   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
    426 }
    427 
    428 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
    429 ///
    430 /// \headerfile <x86intrin.h>
    431 ///
    432 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
    433 ///
    434 /// \param __a
    435 ///    A 128-bit vector containing one of the source operands.
    436 /// \param __b
    437 ///    A 128-bit vector containing one of the source operands.
    438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    439 ///    values between both operands.
    440 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    441 _mm_and_ps(__m128 __a, __m128 __b) {
    442   return (__m128)((__v4su)__a & (__v4su)__b); /* AND on the unsigned-int view */
    443 }
    444 
    445 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
    446 ///    the one's complement of the values contained in the first source
    447 ///    operand.
    448 ///
    449 /// \headerfile <x86intrin.h>
    450 ///
    451 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
    452 ///
    453 /// \param __a
    454 ///    A 128-bit vector of [4 x float] containing the first source operand. The
    455 ///    one's complement of this value is used in the bitwise AND.
    456 /// \param __b
    457 ///    A 128-bit vector of [4 x float] containing the second source operand.
    458 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    459 ///    one's complement of the first operand and the values in the second
    460 ///    operand.
    461 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    462 _mm_andnot_ps(__m128 __a, __m128 __b) {
    463   return (__m128)(~(__v4su)__a & (__v4su)__b); /* ~ applies to __a only */
    464 }
    465 
    466 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
    467 ///
    468 /// \headerfile <x86intrin.h>
    469 ///
    470 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
    471 ///
    472 /// \param __a
    473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    474 /// \param __b
    475 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    476 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
    477 ///    values between both operands.
    478 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    479 _mm_or_ps(__m128 __a, __m128 __b) {
    480   return (__m128)((__v4su)__a | (__v4su)__b); /* OR on the unsigned-int view */
    481 }
    482 
    483 /// Performs a bitwise exclusive OR of two 128-bit vectors of
    484 ///    [4 x float].
    485 ///
    486 /// \headerfile <x86intrin.h>
    487 ///
    488 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
    489 ///
    490 /// \param __a
    491 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    492 /// \param __b
    493 ///    A 128-bit vector of [4 x float] containing one of the source operands.
    494 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
    495 ///    of the values between both operands.
    496 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
    497 _mm_xor_ps(__m128 __a, __m128 __b) {
    498   return (__m128)((__v4su)__a ^ (__v4su)__b); /* XOR on the unsigned-int view */
    499 }
    500 
    501 /// Compares two 32-bit float values in the low-order bits of both
    502 ///    operands for equality.
    503 ///
    504 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    505 ///    low-order bits of a vector of [4 x float].
    506 ///    If either value in a comparison is NaN, returns false.
    507 ///
    508 /// \headerfile <x86intrin.h>
    509 ///
    510 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
    511 ///
    512 /// \param __a
    513 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    514 ///    32 bits of this operand are used in the comparison.
    515 /// \param __b
    516 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    517 ///    32 bits of this operand are used in the comparison.
    518 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    519 ///    in the low-order bits.
    520 static __inline__ __m128 __DEFAULT_FN_ATTRS
    521 _mm_cmpeq_ss(__m128 __a, __m128 __b)
    522 { /* Operands are forwarded in (__a, __b) order; result is cast to __m128. */
    523   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
    524 }
    525 
    526 /// Compares each of the corresponding 32-bit float values of the
    527 ///    128-bit vectors of [4 x float] for equality.
    528 ///
    529 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    530 ///    If either value in a comparison is NaN, returns false.
    531 ///
    532 /// \headerfile <x86intrin.h>
    533 ///
    534 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
    535 ///
    536 /// \param __a
    537 ///    A 128-bit vector of [4 x float].
    538 /// \param __b
    539 ///    A 128-bit vector of [4 x float].
    540 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    541 static __inline__ __m128 __DEFAULT_FN_ATTRS
    542 _mm_cmpeq_ps(__m128 __a, __m128 __b)
    543 { /* Operands are forwarded in (__a, __b) order; result is cast to __m128. */
    544   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
    545 }
    546 
    547 /// Compares two 32-bit float values in the low-order bits of both
    548 ///    operands to determine if the value in the first operand is less than the
    549 ///    corresponding value in the second operand.
    550 ///
    551 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    552 ///    low-order bits of a vector of [4 x float].
    553 ///    If either value in a comparison is NaN, returns false.
    554 ///
    555 /// \headerfile <x86intrin.h>
    556 ///
    557 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
    558 ///
    559 /// \param __a
    560 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    561 ///    32 bits of this operand are used in the comparison.
    562 /// \param __b
    563 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    564 ///    32 bits of this operand are used in the comparison.
    565 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    566 ///    in the low-order bits.
    567 static __inline__ __m128 __DEFAULT_FN_ATTRS
    568 _mm_cmplt_ss(__m128 __a, __m128 __b)
    569 { /* Operands are forwarded in (__a, __b) order; result is cast to __m128. */
    570   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
    571 }
    572 
    573 /// Compares each of the corresponding 32-bit float values of the
    574 ///    128-bit vectors of [4 x float] to determine if the values in the first
    575 ///    operand are less than those in the second operand.
    576 ///
    577 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    578 ///    If either value in a comparison is NaN, returns false.
    579 ///
    580 /// \headerfile <x86intrin.h>
    581 ///
    582 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
    583 ///
    584 /// \param __a
    585 ///    A 128-bit vector of [4 x float].
    586 /// \param __b
    587 ///    A 128-bit vector of [4 x float].
    588 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    589 static __inline__ __m128 __DEFAULT_FN_ATTRS
    590 _mm_cmplt_ps(__m128 __a, __m128 __b)
    591 { /* Operands are forwarded in (__a, __b) order; result is cast to __m128. */
    592   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
    593 }
    594 
    595 /// Compares two 32-bit float values in the low-order bits of both
    596 ///    operands to determine if the value in the first operand is less than or
    597 ///    equal to the corresponding value in the second operand.
    598 ///
    599 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    600 ///    low-order bits of a vector of [4 x float].
    601 ///    If either value in a comparison is NaN, returns false.
    602 ///
    603 /// \headerfile <x86intrin.h>
    604 ///
    605 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
    606 ///
    607 /// \param __a
    608 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    609 ///    32 bits of this operand are used in the comparison.
    610 /// \param __b
    611 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    612 ///    32 bits of this operand are used in the comparison.
    613 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    614 ///    in the low-order bits.
    615 static __inline__ __m128 __DEFAULT_FN_ATTRS
    616 _mm_cmple_ss(__m128 __a, __m128 __b)
    617 { /* Operands are forwarded in (__a, __b) order; result is cast to __m128. */
    618   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
    619 }
    620 
    621 /// Compares each of the corresponding 32-bit float values of the
    622 ///    128-bit vectors of [4 x float] to determine if the values in the first
    623 ///    operand are less than or equal to those in the second operand.
    624 ///
    625 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    626 ///    If either value in a comparison is NaN, returns false.
    627 ///
    628 /// \headerfile <x86intrin.h>
    629 ///
    630 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
    631 ///
    632 /// \param __a
    633 ///    A 128-bit vector of [4 x float].
    634 /// \param __b
    635 ///    A 128-bit vector of [4 x float].
    636 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    637 static __inline__ __m128 __DEFAULT_FN_ATTRS
    638 _mm_cmple_ps(__m128 __a, __m128 __b)
    639 { /* Operands are forwarded in (__a, __b) order; result is cast to __m128. */
    640   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
    641 }
    642 
    643 /// Compares two 32-bit float values in the low-order bits of both
    644 ///    operands to determine if the value in the first operand is greater than
    645 ///    the corresponding value in the second operand.
    646 ///
    647 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    648 ///    low-order bits of a vector of [4 x float].
    649 ///    If either value in a comparison is NaN, returns false.
    650 ///
    651 /// \headerfile <x86intrin.h>
    652 ///
    653 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
    654 ///
    655 /// \param __a
    656 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    657 ///    32 bits of this operand are used in the comparison.
    658 /// \param __b
    659 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    660 ///    32 bits of this operand are used in the comparison.
    661 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    662 ///    in the low-order bits.
    663 static __inline__ __m128 __DEFAULT_FN_ATTRS
    664 _mm_cmpgt_ss(__m128 __a, __m128 __b)
    665 { /* a > b is computed as b < a: the less-than builtin gets swapped operands. */
    666   return (__m128)__builtin_shufflevector((__v4sf)__a,
    667                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
    668                                          4, 1, 2, 3); /* lane 0: compare result; lanes 1..3: __a */
    669 }
    670 
    671 /// Compares each of the corresponding 32-bit float values of the
    672 ///    128-bit vectors of [4 x float] to determine if the values in the first
    673 ///    operand are greater than those in the second operand.
    674 ///
    675 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    676 ///    If either value in a comparison is NaN, returns false.
    677 ///
    678 /// \headerfile <x86intrin.h>
    679 ///
    680 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
    681 ///
    682 /// \param __a
    683 ///    A 128-bit vector of [4 x float].
    684 /// \param __b
    685 ///    A 128-bit vector of [4 x float].
    686 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    687 static __inline__ __m128 __DEFAULT_FN_ATTRS
    688 _mm_cmpgt_ps(__m128 __a, __m128 __b)
    689 { /* a > b is computed as b < a: note the swapped operands below. */
    690   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
    691 }
    692 
    693 /// Compares two 32-bit float values in the low-order bits of both
    694 ///    operands to determine if the value in the first operand is greater than
    695 ///    or equal to the corresponding value in the second operand.
    696 ///
    697 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    698 ///    low-order bits of a vector of [4 x float].
    699 ///    If either value in a comparison is NaN, returns false.
    700 ///
    701 /// \headerfile <x86intrin.h>
    702 ///
    703 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
    704 ///
    705 /// \param __a
    706 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    707 ///    32 bits of this operand are used in the comparison.
    708 /// \param __b
    709 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    710 ///    32 bits of this operand are used in the comparison.
    711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    712 ///    in the low-order bits.
    713 static __inline__ __m128 __DEFAULT_FN_ATTRS
    714 _mm_cmpge_ss(__m128 __a, __m128 __b)
    715 {
    716   return (__m128)__builtin_shufflevector((__v4sf)__a,
    717                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
    718                                          4, 1, 2, 3);
    719 }
    720 
    721 /// Compares each of the corresponding 32-bit float values of the
    722 ///    128-bit vectors of [4 x float] to determine if the values in the first
    723 ///    operand are greater than or equal to those in the second operand.
    724 ///
    725 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    726 ///    If either value in a comparison is NaN, returns false.
    727 ///
    728 /// \headerfile <x86intrin.h>
    729 ///
    730 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
    731 ///
    732 /// \param __a
    733 ///    A 128-bit vector of [4 x float].
    734 /// \param __b
    735 ///    A 128-bit vector of [4 x float].
    736 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    737 static __inline__ __m128 __DEFAULT_FN_ATTRS
    738 _mm_cmpge_ps(__m128 __a, __m128 __b)
    739 {
    740   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
    741 }
    742 
    743 /// Compares two 32-bit float values in the low-order bits of both operands
    744 ///    for inequality.
    745 ///
    746 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    747 ///    low-order bits of a vector of [4 x float].
    748 ///    If either value in a comparison is NaN, returns true.
    749 ///
    750 /// \headerfile <x86intrin.h>
    751 ///
    752 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
    753 ///   instructions.
    754 ///
    755 /// \param __a
    756 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    757 ///    32 bits of this operand are used in the comparison.
    758 /// \param __b
    759 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    760 ///    32 bits of this operand are used in the comparison.
    761 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    762 ///    in the low-order bits.
    763 static __inline__ __m128 __DEFAULT_FN_ATTRS
    764 _mm_cmpneq_ss(__m128 __a, __m128 __b)
    765 {
    766   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
    767 }
    768 
    769 /// Compares each of the corresponding 32-bit float values of the
    770 ///    128-bit vectors of [4 x float] for inequality.
    771 ///
    772 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    773 ///    If either value in a comparison is NaN, returns true.
    774 ///
    775 /// \headerfile <x86intrin.h>
    776 ///
    777 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
    778 ///   instructions.
    779 ///
    780 /// \param __a
    781 ///    A 128-bit vector of [4 x float].
    782 /// \param __b
    783 ///    A 128-bit vector of [4 x float].
    784 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    785 static __inline__ __m128 __DEFAULT_FN_ATTRS
    786 _mm_cmpneq_ps(__m128 __a, __m128 __b)
    787 {
    788   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
    789 }
    790 
    791 /// Compares two 32-bit float values in the low-order bits of both
    792 ///    operands to determine if the value in the first operand is not less than
    793 ///    the corresponding value in the second operand.
    794 ///
    795 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    796 ///    low-order bits of a vector of [4 x float].
    797 ///    If either value in a comparison is NaN, returns true.
    798 ///
    799 /// \headerfile <x86intrin.h>
    800 ///
    801 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
    802 ///   instructions.
    803 ///
    804 /// \param __a
    805 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    806 ///    32 bits of this operand are used in the comparison.
    807 /// \param __b
    808 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    809 ///    32 bits of this operand are used in the comparison.
    810 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    811 ///    in the low-order bits.
    812 static __inline__ __m128 __DEFAULT_FN_ATTRS
    813 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
    814 {
    815   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
    816 }
    817 
    818 /// Compares each of the corresponding 32-bit float values of the
    819 ///    128-bit vectors of [4 x float] to determine if the values in the first
    820 ///    operand are not less than those in the second operand.
    821 ///
    822 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    823 ///    If either value in a comparison is NaN, returns true.
    824 ///
    825 /// \headerfile <x86intrin.h>
    826 ///
    827 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
    828 ///   instructions.
    829 ///
    830 /// \param __a
    831 ///    A 128-bit vector of [4 x float].
    832 /// \param __b
    833 ///    A 128-bit vector of [4 x float].
    834 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    835 static __inline__ __m128 __DEFAULT_FN_ATTRS
    836 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
    837 {
    838   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
    839 }
    840 
    841 /// Compares two 32-bit float values in the low-order bits of both
    842 ///    operands to determine if the value in the first operand is not less than
    843 ///    or equal to the corresponding value in the second operand.
    844 ///
    845 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    846 ///    low-order bits of a vector of [4 x float].
    847 ///    If either value in a comparison is NaN, returns true.
    848 ///
    849 /// \headerfile <x86intrin.h>
    850 ///
    851 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
    852 ///   instructions.
    853 ///
    854 /// \param __a
    855 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    856 ///    32 bits of this operand are used in the comparison.
    857 /// \param __b
    858 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    859 ///    32 bits of this operand are used in the comparison.
    860 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    861 ///    in the low-order bits.
    862 static __inline__ __m128 __DEFAULT_FN_ATTRS
    863 _mm_cmpnle_ss(__m128 __a, __m128 __b)
    864 {
    865   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
    866 }
    867 
    868 /// Compares each of the corresponding 32-bit float values of the
    869 ///    128-bit vectors of [4 x float] to determine if the values in the first
    870 ///    operand are not less than or equal to those in the second operand.
    871 ///
    872 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    873 ///    If either value in a comparison is NaN, returns true.
    874 ///
    875 /// \headerfile <x86intrin.h>
    876 ///
    877 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
    878 ///   instructions.
    879 ///
    880 /// \param __a
    881 ///    A 128-bit vector of [4 x float].
    882 /// \param __b
    883 ///    A 128-bit vector of [4 x float].
    884 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    885 static __inline__ __m128 __DEFAULT_FN_ATTRS
    886 _mm_cmpnle_ps(__m128 __a, __m128 __b)
    887 {
    888   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
    889 }
    890 
    891 /// Compares two 32-bit float values in the low-order bits of both
    892 ///    operands to determine if the value in the first operand is not greater
    893 ///    than the corresponding value in the second operand.
    894 ///
    895 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    896 ///    low-order bits of a vector of [4 x float].
    897 ///    If either value in a comparison is NaN, returns true.
    898 ///
    899 /// \headerfile <x86intrin.h>
    900 ///
    901 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
    902 ///   instructions.
    903 ///
    904 /// \param __a
    905 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    906 ///    32 bits of this operand are used in the comparison.
    907 /// \param __b
    908 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    909 ///    32 bits of this operand are used in the comparison.
    910 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    911 ///    in the low-order bits.
    912 static __inline__ __m128 __DEFAULT_FN_ATTRS
    913 _mm_cmpngt_ss(__m128 __a, __m128 __b)
    914 {
    915   return (__m128)__builtin_shufflevector((__v4sf)__a,
    916                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
    917                                          4, 1, 2, 3);
    918 }
    919 
    920 /// Compares each of the corresponding 32-bit float values of the
    921 ///    128-bit vectors of [4 x float] to determine if the values in the first
    922 ///    operand are not greater than those in the second operand.
    923 ///
    924 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    925 ///    If either value in a comparison is NaN, returns true.
    926 ///
    927 /// \headerfile <x86intrin.h>
    928 ///
    929 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
    930 ///   instructions.
    931 ///
    932 /// \param __a
    933 ///    A 128-bit vector of [4 x float].
    934 /// \param __b
    935 ///    A 128-bit vector of [4 x float].
    936 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    937 static __inline__ __m128 __DEFAULT_FN_ATTRS
    938 _mm_cmpngt_ps(__m128 __a, __m128 __b)
    939 {
    940   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
    941 }
    942 
    943 /// Compares two 32-bit float values in the low-order bits of both
    944 ///    operands to determine if the value in the first operand is not greater
    945 ///    than or equal to the corresponding value in the second operand.
    946 ///
    947 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
    948 ///    low-order bits of a vector of [4 x float].
    949 ///    If either value in a comparison is NaN, returns true.
    950 ///
    951 /// \headerfile <x86intrin.h>
    952 ///
    953 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
    954 ///   instructions.
    955 ///
    956 /// \param __a
    957 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    958 ///    32 bits of this operand are used in the comparison.
    959 /// \param __b
    960 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    961 ///    32 bits of this operand are used in the comparison.
    962 /// \returns A 128-bit vector of [4 x float] containing the comparison results
    963 ///    in the low-order bits.
    964 static __inline__ __m128 __DEFAULT_FN_ATTRS
    965 _mm_cmpnge_ss(__m128 __a, __m128 __b)
    966 {
    967   return (__m128)__builtin_shufflevector((__v4sf)__a,
    968                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
    969                                          4, 1, 2, 3);
    970 }
    971 
    972 /// Compares each of the corresponding 32-bit float values of the
    973 ///    128-bit vectors of [4 x float] to determine if the values in the first
    974 ///    operand are not greater than or equal to those in the second operand.
    975 ///
    976 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
    977 ///    If either value in a comparison is NaN, returns true.
    978 ///
    979 /// \headerfile <x86intrin.h>
    980 ///
    981 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
    982 ///   instructions.
    983 ///
    984 /// \param __a
    985 ///    A 128-bit vector of [4 x float].
    986 /// \param __b
    987 ///    A 128-bit vector of [4 x float].
    988 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
    989 static __inline__ __m128 __DEFAULT_FN_ATTRS
    990 _mm_cmpnge_ps(__m128 __a, __m128 __b)
    991 {
    992   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
    993 }
    994 
    995 /// Compares two 32-bit float values in the low-order bits of both
    996 ///    operands to determine if the value in the first operand is ordered with
    997 ///    respect to the corresponding value in the second operand.
    998 ///
    999 ///    A pair of floating-point values are ordered with respect to each
   1000 ///    other if neither value is a NaN. The comparison returns 0x0 for false,
   1001 ///    0xFFFFFFFF for true.
   1002 ///
   1003 /// \headerfile <x86intrin.h>
   1004 ///
   1005 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
   1006 ///   instructions.
   1007 ///
   1008 /// \param __a
   1009 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
   1010 ///    32 bits of this operand are used in the comparison.
   1011 /// \param __b
   1012 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
   1013 ///    32 bits of this operand are used in the comparison.
   1014 /// \returns A 128-bit vector of [4 x float] containing the comparison results
   1015 ///    in the low-order bits.
   1016 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1017 _mm_cmpord_ss(__m128 __a, __m128 __b)
   1018 {
   1019   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
   1020 }
   1021 
   1022 /// Compares each of the corresponding 32-bit float values of the
   1023 ///    128-bit vectors of [4 x float] to determine if the values in the first
   1024 ///    operand are ordered with respect to those in the second operand.
   1025 ///
   1026 ///    A pair of floating-point values are ordered with respect to each
   1027 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
   1028 ///    0xFFFFFFFF for true.
   1029 ///
   1030 /// \headerfile <x86intrin.h>
   1031 ///
   1032 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
   1033 ///   instructions.
   1034 ///
   1035 /// \param __a
   1036 ///    A 128-bit vector of [4 x float].
   1037 /// \param __b
   1038 ///    A 128-bit vector of [4 x float].
   1039 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1040 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1041 _mm_cmpord_ps(__m128 __a, __m128 __b)
   1042 {
   1043   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
   1044 }
   1045 
   1046 /// Compares two 32-bit float values in the low-order bits of both
   1047 ///    operands to determine if the value in the first operand is unordered
   1048 ///    with respect to the corresponding value in the second operand.
   1049 ///
   1050 ///    A pair of floating-point values are unordered with respect to each
   1051 ///    other if one or both values are NaN. The comparison returns 0x0 for
   1052 ///    false, 0xFFFFFFFF for true.
   1053 ///
   1054 /// \headerfile <x86intrin.h>
   1055 ///
   1056 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
   1057 ///   instructions.
   1058 ///
   1059 /// \param __a
   1060 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
   1061 ///    32 bits of this operand are used in the comparison.
   1062 /// \param __b
   1063 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
   1064 ///    32 bits of this operand are used in the comparison.
   1065 /// \returns A 128-bit vector of [4 x float] containing the comparison results
   1066 ///    in the low-order bits.
   1067 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1068 _mm_cmpunord_ss(__m128 __a, __m128 __b)
   1069 {
   1070   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
   1071 }
   1072 
   1073 /// Compares each of the corresponding 32-bit float values of the
   1074 ///    128-bit vectors of [4 x float] to determine if the values in the first
   1075 ///    operand are unordered with respect to those in the second operand.
   1076 ///
   1077 ///    A pair of floating-point values are unordered with respect to each
   1078 ///    other if one or both values are NaN. Each comparison returns 0x0 for
   1079 ///    false, 0xFFFFFFFF for true.
   1080 ///
   1081 /// \headerfile <x86intrin.h>
   1082 ///
   1083 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
   1084 ///   instructions.
   1085 ///
   1086 /// \param __a
   1087 ///    A 128-bit vector of [4 x float].
   1088 /// \param __b
   1089 ///    A 128-bit vector of [4 x float].
   1090 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1091 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1092 _mm_cmpunord_ps(__m128 __a, __m128 __b)
   1093 {
   1094   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
   1095 }
   1096 
   1097 /// Compares two 32-bit float values in the low-order bits of both
   1098 ///    operands for equality.
   1099 ///
   1100 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1101 ///    comparison is NaN, returns 0.
   1102 ///
   1103 /// \headerfile <x86intrin.h>
   1104 ///
   1105 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
   1106 ///   instructions.
   1107 ///
   1108 /// \param __a
   1109 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1110 ///    used in the comparison.
   1111 /// \param __b
   1112 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1113 ///    used in the comparison.
   1114 /// \returns An integer containing the comparison results.
   1115 static __inline__ int __DEFAULT_FN_ATTRS
   1116 _mm_comieq_ss(__m128 __a, __m128 __b)
   1117 {
   1118   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
   1119 }
   1120 
   1121 /// Compares two 32-bit float values in the low-order bits of both
   1122 ///    operands to determine if the first operand is less than the second
   1123 ///    operand.
   1124 ///
   1125 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1126 ///    comparison is NaN, returns 0.
   1127 ///
   1128 /// \headerfile <x86intrin.h>
   1129 ///
   1130 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
   1131 ///   instructions.
   1132 ///
   1133 /// \param __a
   1134 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1135 ///    used in the comparison.
   1136 /// \param __b
   1137 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1138 ///    used in the comparison.
   1139 /// \returns An integer containing the comparison results.
   1140 static __inline__ int __DEFAULT_FN_ATTRS
   1141 _mm_comilt_ss(__m128 __a, __m128 __b)
   1142 {
   1143   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
   1144 }
   1145 
   1146 /// Compares two 32-bit float values in the low-order bits of both
   1147 ///    operands to determine if the first operand is less than or equal to the
   1148 ///    second operand.
   1149 ///
   1150 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1151 ///    comparison is NaN, returns 0.
   1152 ///
   1153 /// \headerfile <x86intrin.h>
   1154 ///
   1155 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1156 ///
   1157 /// \param __a
   1158 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1159 ///    used in the comparison.
   1160 /// \param __b
   1161 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1162 ///    used in the comparison.
   1163 /// \returns An integer containing the comparison results.
   1164 static __inline__ int __DEFAULT_FN_ATTRS
   1165 _mm_comile_ss(__m128 __a, __m128 __b)
   1166 {
   1167   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
   1168 }
   1169 
   1170 /// Compares two 32-bit float values in the low-order bits of both
   1171 ///    operands to determine if the first operand is greater than the second
   1172 ///    operand.
   1173 ///
   1174 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1175 ///    comparison is NaN, returns 0.
   1176 ///
   1177 /// \headerfile <x86intrin.h>
   1178 ///
   1179 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1180 ///
   1181 /// \param __a
   1182 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1183 ///    used in the comparison.
   1184 /// \param __b
   1185 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1186 ///    used in the comparison.
   1187 /// \returns An integer containing the comparison results.
   1188 static __inline__ int __DEFAULT_FN_ATTRS
   1189 _mm_comigt_ss(__m128 __a, __m128 __b)
   1190 {
   1191   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
   1192 }
   1193 
   1194 /// Compares two 32-bit float values in the low-order bits of both
   1195 ///    operands to determine if the first operand is greater than or equal to
   1196 ///    the second operand.
   1197 ///
   1198 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1199 ///    comparison is NaN, returns 0.
   1200 ///
   1201 /// \headerfile <x86intrin.h>
   1202 ///
   1203 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1204 ///
   1205 /// \param __a
   1206 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1207 ///    used in the comparison.
   1208 /// \param __b
   1209 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1210 ///    used in the comparison.
   1211 /// \returns An integer containing the comparison results.
   1212 static __inline__ int __DEFAULT_FN_ATTRS
   1213 _mm_comige_ss(__m128 __a, __m128 __b)
   1214 {
   1215   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
   1216 }
   1217 
   1218 /// Compares two 32-bit float values in the low-order bits of both
   1219 ///    operands to determine if the first operand is not equal to the second
   1220 ///    operand.
   1221 ///
   1222 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1223 ///    comparison is NaN, returns 1.
   1224 ///
   1225 /// \headerfile <x86intrin.h>
   1226 ///
   1227 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
   1228 ///
   1229 /// \param __a
   1230 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1231 ///    used in the comparison.
   1232 /// \param __b
   1233 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1234 ///    used in the comparison.
   1235 /// \returns An integer containing the comparison results.
   1236 static __inline__ int __DEFAULT_FN_ATTRS
   1237 _mm_comineq_ss(__m128 __a, __m128 __b)
   1238 {
   1239   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
   1240 }
   1241 
   1242 /// Performs an unordered comparison of two 32-bit float values using
   1243 ///    the low-order bits of both operands to determine equality.
   1244 ///
   1245 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1246 ///    comparison is NaN, returns 0.
   1247 ///
   1248 /// \headerfile <x86intrin.h>
   1249 ///
   1250 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1251 ///
   1252 /// \param __a
   1253 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1254 ///    used in the comparison.
   1255 /// \param __b
   1256 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1257 ///    used in the comparison.
   1258 /// \returns An integer containing the comparison results.
   1259 static __inline__ int __DEFAULT_FN_ATTRS
   1260 _mm_ucomieq_ss(__m128 __a, __m128 __b)
   1261 {
   1262   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
   1263 }
   1264 
   1265 /// Performs an unordered comparison of two 32-bit float values using
   1266 ///    the low-order bits of both operands to determine if the first operand is
   1267 ///    less than the second operand.
   1268 ///
   1269 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1270 ///    comparison is NaN, returns 0.
   1271 ///
   1272 /// \headerfile <x86intrin.h>
   1273 ///
   1274 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1275 ///
   1276 /// \param __a
   1277 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1278 ///    used in the comparison.
   1279 /// \param __b
   1280 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1281 ///    used in the comparison.
   1282 /// \returns An integer containing the comparison results.
   1283 static __inline__ int __DEFAULT_FN_ATTRS
   1284 _mm_ucomilt_ss(__m128 __a, __m128 __b)
   1285 {
   1286   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
   1287 }
   1288 
   1289 /// Performs an unordered comparison of two 32-bit float values using
   1290 ///    the low-order bits of both operands to determine if the first operand is
   1291 ///    less than or equal to the second operand.
   1292 ///
   1293 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1294 ///    comparison is NaN, returns 0.
   1295 ///
   1296 /// \headerfile <x86intrin.h>
   1297 ///
   1298 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1299 ///
   1300 /// \param __a
   1301 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1302 ///    used in the comparison.
   1303 /// \param __b
   1304 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1305 ///    used in the comparison.
   1306 /// \returns An integer containing the comparison results.
   1307 static __inline__ int __DEFAULT_FN_ATTRS
   1308 _mm_ucomile_ss(__m128 __a, __m128 __b)
   1309 {
   1310   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
   1311 }
   1312 
   1313 /// Performs an unordered comparison of two 32-bit float values using
   1314 ///    the low-order bits of both operands to determine if the first operand is
   1315 ///    greater than the second operand.
   1316 ///
   1317 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1318 ///    comparison is NaN, returns 0.
   1319 ///
   1320 /// \headerfile <x86intrin.h>
   1321 ///
   1322 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1323 ///
   1324 /// \param __a
   1325 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1326 ///    used in the comparison.
   1327 /// \param __b
   1328 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1329 ///    used in the comparison.
   1330 /// \returns An integer containing the comparison results.
   1331 static __inline__ int __DEFAULT_FN_ATTRS
   1332 _mm_ucomigt_ss(__m128 __a, __m128 __b)
   1333 {
   1334   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
   1335 }
   1336 
   1337 /// Performs an unordered comparison of two 32-bit float values using
   1338 ///    the low-order bits of both operands to determine if the first operand is
   1339 ///    greater than or equal to the second operand.
   1340 ///
   1341 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1342 ///    comparison is NaN, returns 0.
   1343 ///
   1344 /// \headerfile <x86intrin.h>
   1345 ///
   1346 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1347 ///
   1348 /// \param __a
   1349 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1350 ///    used in the comparison.
   1351 /// \param __b
   1352 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1353 ///    used in the comparison.
   1354 /// \returns An integer containing the comparison results.
   1355 static __inline__ int __DEFAULT_FN_ATTRS
   1356 _mm_ucomige_ss(__m128 __a, __m128 __b)
   1357 {
   1358   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
   1359 }
   1360 
   1361 /// Performs an unordered comparison of two 32-bit float values using
   1362 ///    the low-order bits of both operands to determine inequality.
   1363 ///
   1364 ///    The comparison returns 0 for false, 1 for true. If either value in a
   1365 ///    comparison is NaN, returns 1.
   1366 ///
   1367 /// \headerfile <x86intrin.h>
   1368 ///
   1369 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
   1370 ///
   1371 /// \param __a
   1372 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1373 ///    used in the comparison.
   1374 /// \param __b
   1375 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1376 ///    used in the comparison.
   1377 /// \returns An integer containing the comparison results.
   1378 static __inline__ int __DEFAULT_FN_ATTRS
   1379 _mm_ucomineq_ss(__m128 __a, __m128 __b)
   1380 {
   1381   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
   1382 }
   1383 
   1384 /// Converts a float value contained in the lower 32 bits of a vector of
   1385 ///    [4 x float] into a signed 32-bit integer.
   1386 ///
   1387 ///    If the converted value does not fit in a 32-bit integer, raises a
   1388 ///    floating-point invalid exception. If the exception is masked, returns
   1389 ///    the most negative integer.
   1390 ///
   1391 /// \headerfile <x86intrin.h>
   1392 ///
   1393 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
   1394 ///   instructions.
   1395 ///
   1396 /// \param __a
   1397 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1398 ///    used in the conversion.
   1399 /// \returns A signed 32-bit integer containing the converted value.
   1400 static __inline__ int __DEFAULT_FN_ATTRS
   1401 _mm_cvtss_si32(__m128 __a)
   1402 {
   1403   return __builtin_ia32_cvtss2si((__v4sf)__a);
   1404 }
   1405 
   1406 /// Converts a float value contained in the lower 32 bits of a vector of
   1407 ///    [4 x float] into a signed 32-bit integer.
   1408 ///
   1409 ///    If the converted value does not fit in a 32-bit integer, raises a
   1410 ///    floating-point invalid exception. If the exception is masked, returns
   1411 ///    the most negative integer.
   1412 ///
   1413 /// \headerfile <x86intrin.h>
   1414 ///
   1415 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
   1416 ///   instructions. It is an alias of _mm_cvtss_si32().
   1417 ///
   1418 /// \param __a
   1419 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1420 ///    used in the conversion.
   1421 /// \returns A signed 32-bit integer containing the converted value.
   1422 static __inline__ int __DEFAULT_FN_ATTRS
   1423 _mm_cvt_ss2si(__m128 __a)
   1424 {
   1425   return _mm_cvtss_si32(__a);
   1426 }
   1427 
   1428 #ifdef __x86_64__
   1429 
   1430 /// Converts a float value contained in the lower 32 bits of a vector of
   1431 ///    [4 x float] into a signed 64-bit integer.
   1432 ///
   1433 ///    If the converted value does not fit in a 64-bit integer, raises a
   1434 ///    floating-point invalid exception. If the exception is masked, returns
   1435 ///    the most negative integer.
   1436 ///
   1437 /// \headerfile <x86intrin.h>
   1438 ///
   1439 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
   1440 ///   instructions.
   1441 ///
   1442 /// \param __a
   1443 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1444 ///    used in the conversion.
   1445 /// \returns A signed 64-bit integer containing the converted value.
   1446 static __inline__ long long __DEFAULT_FN_ATTRS
   1447 _mm_cvtss_si64(__m128 __a)
   1448 {
   1449   return __builtin_ia32_cvtss2si64((__v4sf)__a);
   1450 }
   1451 
   1452 #endif
   1453 
   1454 /// Converts two low-order float values in a 128-bit vector of
   1455 ///    [4 x float] into a 64-bit vector of [2 x i32].
   1456 ///
   1457 ///    If a converted value does not fit in a 32-bit integer, raises a
   1458 ///    floating-point invalid exception. If the exception is masked, returns
   1459 ///    the most negative integer.
   1460 ///
   1461 /// \headerfile <x86intrin.h>
   1462 ///
   1463 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
   1464 ///
   1465 /// \param __a
   1466 ///    A 128-bit vector of [4 x float]. The lower 64 bits are converted.
   1467 /// \returns A 64-bit integer vector containing the converted values.
   1468 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1469 _mm_cvtps_pi32(__m128 __a)
   1470 {
   1471   return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
   1472 }
   1473 
   1474 /// Converts two low-order float values in a 128-bit vector of [4 x float]
   1475 ///    into a 64-bit vector of [2 x i32]. Alias of _mm_cvtps_pi32().
   1476 ///
   1477 ///    If a converted value does not fit in a 32-bit integer, raises a
   1478 ///    floating-point invalid exception. If the exception is masked, returns
   1479 ///    the most negative integer.
   1480 ///
   1481 /// \headerfile <x86intrin.h>
   1482 ///
   1483 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
   1484 ///
   1485 /// \param __a
   1486 ///    A 128-bit vector of [4 x float]. The lower 64 bits are converted.
   1487 /// \returns A 64-bit integer vector containing the converted values.
   1488 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1489 _mm_cvt_ps2pi(__m128 __a)
   1490 {
   1491   return _mm_cvtps_pi32(__a);
   1492 }
   1493 
   1494 /// Converts the lower (first) element of a vector of [4 x float] into a signed
   1495 ///    truncated (rounded toward zero) 32-bit integer.
   1496 ///
   1497 ///    If the converted value does not fit in a 32-bit integer, raises a
   1498 ///    floating-point invalid exception. If the exception is masked, returns
   1499 ///    the most negative integer.
   1500 ///
   1501 /// \headerfile <x86intrin.h>
   1502 ///
   1503 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
   1504 ///   instructions.
   1505 ///
   1506 /// \param __a
   1507 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1508 ///    used in the conversion.
   1509 /// \returns A signed 32-bit integer containing the truncated value.
   1510 static __inline__ int __DEFAULT_FN_ATTRS
   1511 _mm_cvttss_si32(__m128 __a)
   1512 {
   1513   return __builtin_ia32_cvttss2si((__v4sf)__a);
   1514 }
   1515 
   1516 /// Converts the lower (first) element of a vector of [4 x float] into a signed
   1517 ///    truncated (rounded toward zero) 32-bit integer.
   1518 ///
   1519 ///    If the converted value does not fit in a 32-bit integer, raises a
   1520 ///    floating-point invalid exception. If the exception is masked, returns
   1521 ///    the most negative integer.
   1522 ///
   1523 /// \headerfile <x86intrin.h>
   1524 ///
   1525 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
   1526 ///   instructions. It is an alias of _mm_cvttss_si32().
   1527 ///
   1528 /// \param __a
   1529 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1530 ///    used in the conversion.
   1531 /// \returns A signed 32-bit integer containing the truncated value.
   1532 static __inline__ int __DEFAULT_FN_ATTRS
   1533 _mm_cvtt_ss2si(__m128 __a)
   1534 {
   1535   return _mm_cvttss_si32(__a);
   1536 }
   1537 
   1538 #ifdef __x86_64__
   1539 /// Converts the lower (first) element of a vector of [4 x float] into a signed
   1540 ///    truncated (rounded toward zero) 64-bit integer.
   1541 ///
   1542 ///    If the converted value does not fit in a 64-bit integer, raises a
   1543 ///    floating-point invalid exception. If the exception is masked, returns
   1544 ///    the most negative integer.
   1545 ///
   1546 /// \headerfile <x86intrin.h>
   1547 ///
   1548 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
   1549 ///   instructions.
   1550 ///
   1551 /// \param __a
   1552 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1553 ///    used in the conversion.
   1554 /// \returns A signed 64-bit integer containing the truncated value.
   1555 static __inline__ long long __DEFAULT_FN_ATTRS
   1556 _mm_cvttss_si64(__m128 __a)
   1557 {
   1558   return __builtin_ia32_cvttss2si64((__v4sf)__a);
   1559 }
   1560 #endif
   1561 
   1562 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
   1563 ///    into two signed truncated (rounded toward zero) 32-bit integers,
   1564 ///    returned in a 64-bit vector of [2 x i32].
   1565 ///
   1566 ///    If a converted value does not fit in a 32-bit integer, raises a
   1567 ///    floating-point invalid exception. If the exception is masked, returns
   1568 ///    the most negative integer.
   1569 ///
   1570 /// \headerfile <x86intrin.h>
   1571 ///
   1572 /// This intrinsic corresponds to the <c> CVTTPS2PI </c>
   1573 ///   instruction.
   1574 ///
   1575 /// \param __a
   1576 ///    A 128-bit vector of [4 x float].
   1577 /// \returns A 64-bit integer vector containing the converted values.
   1578 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1579 _mm_cvttps_pi32(__m128 __a)
   1580 {
   1581   return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
   1582 }
   1583 
   1584 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
   1585 ///    into two signed truncated (rounded toward zero) 32-bit integers,
   1586 ///    returned in a 64-bit vector of [2 x i32]. Alias of _mm_cvttps_pi32().
   1587 ///
   1588 ///    If a converted value does not fit in a 32-bit integer, raises a
   1589 ///    floating-point invalid exception. If the exception is masked, returns
   1590 ///    the most negative integer.
   1591 ///
   1592 /// \headerfile <x86intrin.h>
   1593 ///
   1594 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
   1595 ///
   1596 /// \param __a
   1597 ///    A 128-bit vector of [4 x float].
   1598 /// \returns A 64-bit integer vector containing the converted values.
   1599 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   1600 _mm_cvtt_ps2pi(__m128 __a)
   1601 {
   1602   return _mm_cvttps_pi32(__a);
   1603 }
   1604 
   1605 /// Converts a 32-bit signed integer value into a floating-point value
   1606 ///    and writes it to the lower 32 bits of the destination. The remaining
   1607 ///    higher order elements of the destination vector are copied from the
   1608 ///    corresponding elements in the first operand.
   1609 ///
   1610 /// \headerfile <x86intrin.h>
   1611 ///
   1612 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
   1613 ///
   1614 /// \param __a
   1615 ///    A 128-bit vector of [4 x float].
   1616 /// \param __b
   1617 ///    A 32-bit signed integer operand containing the value to be converted.
   1618 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1619 ///    converted value of the second operand. The upper 96 bits are copied from
   1620 ///    the upper 96 bits of the first operand.
   1621 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
   1622                                                                      int __b) {
   1623   __a[0] = __b;
   1624   return __a;
   1625 }
   1626 
   1627 /// Converts a 32-bit signed integer value into a floating-point value
   1628 ///    and writes it to the lower 32 bits of the destination. The remaining
   1629 ///    higher order elements of the destination are copied from the
   1630 ///    corresponding elements in the first operand. Alias of _mm_cvtsi32_ss().
   1631 ///
   1632 /// \headerfile <x86intrin.h>
   1633 ///
   1634 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
   1635 ///
   1636 /// \param __a
   1637 ///    A 128-bit vector of [4 x float].
   1638 /// \param __b
   1639 ///    A 32-bit signed integer operand containing the value to be converted.
   1640 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1641 ///    converted value of the second operand. The upper 96 bits are copied from
   1642 ///    the upper 96 bits of the first operand.
   1643 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
   1644                                                                     int __b) {
   1645   return _mm_cvtsi32_ss(__a, __b);
   1646 }
   1647 
   1648 #ifdef __x86_64__
   1649 
   1650 /// Converts a 64-bit signed integer value into a floating-point value
   1651 ///    and writes it to the lower 32 bits of the destination. The remaining
   1652 ///    higher order elements of the destination are copied from the
   1653 ///    corresponding elements in the first operand.
   1654 ///
   1655 /// \headerfile <x86intrin.h>
   1656 ///
   1657 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
   1658 ///
   1659 /// \param __a
   1660 ///    A 128-bit vector of [4 x float].
   1661 /// \param __b
   1662 ///    A 64-bit signed integer operand containing the value to be converted.
   1663 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
   1664 ///    converted value of the second operand. The upper 96 bits are copied from
   1665 ///    the upper 96 bits of the first operand.
   1666 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   1667 _mm_cvtsi64_ss(__m128 __a, long long __b) {
   1668   __a[0] = __b;
   1669   return __a;
   1670 }
   1671 
   1672 #endif
   1673 
   1674 /// Converts two elements of a 64-bit vector of [2 x i32] into two
   1675 ///    floating-point values and writes them to the lower 64 bits of the
   1676 ///    destination. The remaining higher order elements of the destination are
   1677 ///    copied from the corresponding elements in the first operand.
   1678 ///
   1679 /// \headerfile <x86intrin.h>
   1680 ///
   1681 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
   1682 ///
   1683 /// \param __a
   1684 ///    A 128-bit vector of [4 x float].
   1685 /// \param __b
   1686 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
   1687 ///    and written to the corresponding low-order elements in the destination.
   1688 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1689 ///    converted values of the second operand. The upper 64 bits are copied from
   1690 ///    the upper 64 bits of the first operand.
   1691 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   1692 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
   1693 {
   1694   return (__m128)__builtin_shufflevector(
   1695       (__v4sf)__a,
   1696       __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
   1697       4, 5, 2, 3);
   1698 }
   1699 
   1700 /// Converts two elements of a 64-bit vector of [2 x i32] into two floating-
   1701 ///    point values and writes them to the lower 64 bits of the destination.
   1702 ///    The remaining higher order elements are copied from the first operand.
   1703 ///    This is an alias of _mm_cvtpi32_ps().
   1704 ///
   1705 /// \headerfile <x86intrin.h>
   1706 ///
   1707 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
   1708 ///
   1709 /// \param __a
   1710 ///    A 128-bit vector of [4 x float].
   1711 /// \param __b
   1712 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
   1713 ///    and written to the corresponding low-order elements in the destination.
   1714 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   1715 ///    converted values from the second operand. The upper 64 bits are copied
   1716 ///    from the upper 64 bits of the first operand.
   1717 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   1718 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
   1719 {
   1720   return _mm_cvtpi32_ps(__a, __b);
   1721 }
   1722 
   1723 /// Returns the float value contained in the lower 32 bits (element 0) of a
   1724 ///    vector of [4 x float].
   1725 ///
   1726 /// \headerfile <x86intrin.h>
   1727 ///
   1728 /// This intrinsic has no corresponding instruction.
   1729 ///
   1730 /// \param __a
   1731 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
   1732 ///    used in the extraction.
   1733 /// \returns A 32-bit float containing the extracted value.
   1734 static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
   1735 _mm_cvtss_f32(__m128 __a) {
   1736   return __a[0];
   1737 }
   1738 
   1739 /// Loads two packed float values from the address \a __p into the
   1740 ///    high-order bits of a 128-bit vector of [4 x float]. The low-order bits
   1741 ///    are copied from the low-order bits of the first operand.
   1742 ///
   1743 /// \headerfile <x86intrin.h>
   1744 ///
   1745 /// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
   1746 ///
   1747 /// \param __a
   1748 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
   1749 ///    of the destination.
   1750 /// \param __p
   1751 ///    A pointer to two packed float values. Bits [63:0] are written to bits
   1752 ///    [127:64] of the destination.
   1753 /// \returns A 128-bit vector of [4 x float] containing the moved values.
   1754 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1755 _mm_loadh_pi(__m128 __a, const __m64 *__p)
   1756 {
   1757   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
   1758   struct __mm_loadh_pi_struct {
   1759     __mm_loadh_pi_v2f32 __u;
   1760   } __attribute__((__packed__, __may_alias__));
   1761   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
   1762   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   1763   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
   1764 }
   1765 
   1766 /// Loads two packed float values from the address \a __p into the
   1767 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
   1768 ///    are copied from the high-order bits of the first operand.
   1769 ///
   1770 /// \headerfile <x86intrin.h>
   1771 ///
   1772 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
   1773 ///
   1774 /// \param __a
   1775 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
   1776 ///    [127:64] of the destination.
   1777 /// \param __p
   1778 ///    A pointer to two packed float values. Bits [63:0] are written to bits
   1779 ///    [63:0] of the destination.
   1780 /// \returns A 128-bit vector of [4 x float] containing the moved values.
   1781 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1782 _mm_loadl_pi(__m128 __a, const __m64 *__p)
   1783 {
   1784   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
   1785   struct __mm_loadl_pi_struct {
   1786     __mm_loadl_pi_v2f32 __u;
   1787   } __attribute__((__packed__, __may_alias__));
   1788   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
   1789   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   1790   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
   1791 }
   1792 
   1793 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
   1794 ///    32 bits of the vector are initialized with the single-precision
   1795 ///    floating-point value loaded from a specified memory location. The upper
   1796 ///    96 bits are set to zero.
   1797 ///
   1798 /// \headerfile <x86intrin.h>
   1799 ///
   1800 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   1801 ///
   1802 /// \param __p
   1803 ///    A pointer to a 32-bit memory location containing a single-precision
   1804 ///    floating-point value. The address does not have to be aligned.
   1805 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
   1806 ///    lower 32 bits contain the value loaded from the memory location. The
   1807 ///    upper 96 bits are set to zero.
   1808 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1809 _mm_load_ss(const float *__p)
   1810 {
   1811   struct __mm_load_ss_struct {
   1812     float __u;
   1813   } __attribute__((__packed__, __may_alias__));
   1814   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
   1815   return __extension__ (__m128){ __u, 0, 0, 0 };
   1816 }
   1817 
   1818 /// Loads a 32-bit float value and duplicates it to all four vector
   1819 ///    elements of a 128-bit vector of [4 x float].
   1820 ///
   1821 /// \headerfile <x86intrin.h>
   1822 ///
   1823 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
   1824 ///    instruction. The macro _mm_load_ps1 expands to this function.
   1825 ///
   1826 /// \param __p
   1827 ///    A pointer to the float value to load and duplicate; need not be aligned.
   1828 /// \returns A 128-bit vector of [4 x float] containing the loaded and
   1829 ///    duplicated values.
   1830 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1831 _mm_load1_ps(const float *__p)
   1832 {
   1833   struct __mm_load1_ps_struct {
   1834     float __u;
   1835   } __attribute__((__packed__, __may_alias__));
   1836   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
   1837   return __extension__ (__m128){ __u, __u, __u, __u };
   1838 }
   1839 
   1840 #define        _mm_load_ps1(p) _mm_load1_ps(p)
   1841 
   1842 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
   1843 ///    memory location.
   1844 ///
   1845 /// \headerfile <x86intrin.h>
   1846 ///
   1847 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
   1848 ///
   1849 /// \param __p
   1850 ///    A pointer to a 128-bit memory location. The address of the memory
   1851 ///    location has to be 16-byte aligned.
   1852 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   1853 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1854 _mm_load_ps(const float *__p)
   1855 {
   1856   return *(const __m128*)__p;
   1857 }
   1858 
   1859 /// Loads a 128-bit floating-point vector of [4 x float] from an
   1860 ///    unaligned memory location.
   1861 ///
   1862 /// \headerfile <x86intrin.h>
   1863 ///
   1864 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
   1865 ///
   1866 /// \param __p
   1867 ///    A pointer to a 128-bit memory location holding four float values. The
   1868 ///    address of the memory location does not have to be aligned.
   1869 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   1870 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1871 _mm_loadu_ps(const float *__p)
   1872 {
   1873   struct __loadu_ps {
   1874     __m128_u __v;
   1875   } __attribute__((__packed__, __may_alias__));
   1876   return ((const struct __loadu_ps*)__p)->__v;
   1877 }
   1878 
   1879 /// Loads four packed float values, in reverse order, from an aligned
   1880 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
   1881 ///
   1882 /// \headerfile <x86intrin.h>
   1883 ///
   1884 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
   1885 ///    instruction.
   1886 ///
   1887 /// \param __p
   1888 ///    A pointer to a 128-bit memory location. The address of the memory
   1889 ///    location has to be 16-byte aligned.
   1890 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
   1891 ///    in reverse order.
   1892 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1893 _mm_loadr_ps(const float *__p)
   1894 {
   1895   __m128 __a = _mm_load_ps(__p);
   1896   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   1897 }
   1898 
   1899 /// Creates a 128-bit vector of [4 x float] with undefined values.
   1900 ///
   1901 /// \headerfile <x86intrin.h>
   1902 ///
   1903 /// This intrinsic has no corresponding instruction.
   1904 ///
   1905 /// \returns A 128-bit vector of [4 x float] containing undefined values.
   1906 static __inline__ __m128 __DEFAULT_FN_ATTRS
   1907 _mm_undefined_ps(void)
   1908 {
   1909   return (__m128)__builtin_ia32_undef128();
   1910 }
   1911 
   1912 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
   1913 ///    32 bits of the vector are initialized with the specified single-precision
   1914 ///    floating-point value. The upper 96 bits are set to zero.
   1915 ///
   1916 /// \headerfile <x86intrin.h>
   1917 ///
   1918 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   1919 ///
   1920 /// \param __w
   1921 ///    A single-precision floating-point value used to initialize the lower 32
   1922 ///    bits of the result.
   1923 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
   1924 ///    lower 32 bits contain the value of \a __w. The
   1925 ///    upper 96 bits are set to zero.
   1926 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   1927 _mm_set_ss(float __w) {
   1928   return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
   1929 }
   1930 
   1931 /// Constructs a 128-bit floating-point vector of [4 x float], with each
   1932 ///    of the four single-precision floating-point vector elements set to the
   1933 ///    specified single-precision floating-point value.
   1934 ///
   1935 /// \headerfile <x86intrin.h>
   1936 ///
   1937 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
   1938 ///
   1939 /// \param __w
   1940 ///    A single-precision floating-point value used to initialize each vector
   1941 ///    element of the result.
   1942 /// \returns A 128-bit vector of [4 x float] with every element set to \a __w.
   1943 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   1944 _mm_set1_ps(float __w) {
   1945   return __extension__ (__m128){ __w, __w, __w, __w };
   1946 }
   1947 
   1948 /* Microsoft specific. */
   1949 /// Constructs a 128-bit floating-point vector of [4 x float], with each
   1950 ///    of the four single-precision floating-point vector elements set to the
   1951 ///    specified single-precision floating-point value. Alias of _mm_set1_ps().
   1952 ///
   1953 /// \headerfile <x86intrin.h>
   1954 ///
   1955 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
   1956 ///
   1957 /// \param __w
   1958 ///    A single-precision floating-point value used to initialize each vector
   1959 ///    element of the result.
   1960 /// \returns A 128-bit vector of [4 x float] with every element set to \a __w.
   1961 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   1962 _mm_set_ps1(float __w) {
   1963     return _mm_set1_ps(__w);
   1964 }
   1965 
   1966 /// Constructs a 128-bit floating-point vector of [4 x float]
   1967 ///    initialized with the specified single-precision floating-point values.
   1968 ///
   1969 /// \headerfile <x86intrin.h>
   1970 ///
   1971 /// This intrinsic is a utility function and does not correspond to a specific
   1972 ///    instruction.
   1973 ///
   1974 /// \param __z
   1975 ///    A single-precision floating-point value used to initialize bits [127:96]
   1976 ///    of the result.
   1977 /// \param __y
   1978 ///    A single-precision floating-point value used to initialize bits [95:64]
   1979 ///    of the result.
   1980 /// \param __x
   1981 ///    A single-precision floating-point value used to initialize bits [63:32]
   1982 ///    of the result.
   1983 /// \param __w
   1984 ///    A single-precision floating-point value used to initialize bits [31:0]
   1985 ///    of the result.
   1986 /// \returns A 128-bit vector of [4 x float] with elements {__w, __x, __y, __z}.
   1987 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   1988 _mm_set_ps(float __z, float __y, float __x, float __w) {
   1989   return __extension__ (__m128){ __w, __x, __y, __z };
   1990 }
   1991 
   1992 /// Constructs a 128-bit floating-point vector of [4 x float],
   1993 ///    initialized in reverse order with the specified 32-bit single-precision
   1994 ///    floating-point values.
   1995 ///
   1996 /// \headerfile <x86intrin.h>
   1997 ///
   1998 /// This intrinsic is a utility function and does not correspond to a specific
   1999 ///    instruction.
   2000 ///
   2001 /// \param __z
   2002 ///    A single-precision floating-point value used to initialize bits [31:0]
   2003 ///    of the result.
   2004 /// \param __y
   2005 ///    A single-precision floating-point value used to initialize bits [63:32]
   2006 ///    of the result.
   2007 /// \param __x
   2008 ///    A single-precision floating-point value used to initialize bits [95:64]
   2009 ///    of the result.
   2010 /// \param __w
   2011 ///    A single-precision floating-point value used to initialize bits [127:96]
   2012 ///    of the result.
   2013 /// \returns A 128-bit vector of [4 x float] with elements {__z, __y, __x, __w}.
   2014 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   2015 _mm_setr_ps(float __z, float __y, float __x, float __w) {
   2016   return __extension__ (__m128){ __z, __y, __x, __w };
   2017 }
   2018 
   2019 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
   2020 ///    to zero (0.0f in every element).
   2021 ///
   2022 /// \headerfile <x86intrin.h>
   2023 ///
   2024 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
   2025 ///
   2026 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
   2027 ///    all elements set to zero.
   2028 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   2029 _mm_setzero_ps(void) {
   2030   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
   2031 }
   2032 
   2033 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
   2034 ///    memory location.
   2035 ///
   2036 /// \headerfile <x86intrin.h>
   2037 ///
   2038 /// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
   2039 ///
   2040 /// \param __p
   2041 ///    A pointer to a 64-bit memory location.
   2042 /// \param __a
   2043 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   2044 static __inline__ void __DEFAULT_FN_ATTRS
   2045 _mm_storeh_pi(__m64 *__p, __m128 __a)
   2046 {
   2047   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
   2048   struct __mm_storeh_pi_struct {
   2049     __mm_storeh_pi_v2f32 __u;
   2050   } __attribute__((__packed__, __may_alias__));
   2051   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
   2052 }
   2053 
   2054 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
   2055 ///    memory location.
   2056 ///
   2057 /// \headerfile <x86intrin.h>
   2058 ///
   2059 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
   2060 ///
   2061 /// \param __p
   2062 ///    A pointer to a 64-bit memory location that receives two float values.
   2063 /// \param __a
   2064 ///    A 128-bit vector of [4 x float]; elements 0 and 1 are stored.
   2065 static __inline__ void __DEFAULT_FN_ATTRS
   2066 _mm_storel_pi(__m64 *__p, __m128 __a)
   2067 {
   2068   typedef float __mm_storel_pi_v2f32 __attribute__((__vector_size__(8)));
   2069   struct __mm_storel_pi_struct {
   2070     __mm_storel_pi_v2f32 __u;
   2071   } __attribute__((__packed__, __may_alias__));
   2072   ((struct __mm_storel_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
   2073 }
   2074 
   2075 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
   2076 ///    memory location.
   2077 ///
   2078 /// \headerfile <x86intrin.h>
   2079 ///
   2080 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
   2081 ///
   2082 /// \param __p
   2083 ///    A pointer to a 32-bit memory location; it does not have to be aligned.
   2084 /// \param __a
   2085 ///    A 128-bit vector of [4 x float] containing the value to be stored.
   2086 static __inline__ void __DEFAULT_FN_ATTRS
   2087 _mm_store_ss(float *__p, __m128 __a)
   2088 {
   2089   struct __mm_store_ss_struct {
   2090     float __u;
   2091   } __attribute__((__packed__, __may_alias__));
   2092   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
   2093 }
   2094 
   2095 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
   2096 ///    location.
   2097 ///
   2098 /// \headerfile <x86intrin.h>
   2099 ///
   2100 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
   2101 ///
   2102 /// \param __p
   2103 ///    A pointer to a 128-bit memory location that receives the four float
   2104 ///    values. The address does not have to be aligned.
   2105 /// \param __a
   2106 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   2107 static __inline__ void __DEFAULT_FN_ATTRS
   2108 _mm_storeu_ps(float *__p, __m128 __a)
   2109 {
   2110   struct __storeu_ps {
   2111     __m128_u __v;
   2112   } __attribute__((__packed__, __may_alias__));
   2113   ((struct __storeu_ps*)__p)->__v = __a;
   2114 }
   2115 
   2116 /// Stores a 128-bit vector of [4 x float] to an aligned memory
   2117 ///    location.
   2118 ///
   2119 /// \headerfile <x86intrin.h>
   2120 ///
   2121 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
   2122 ///
   2123 /// \param __p
   2124 ///    A pointer to a 128-bit memory location. The address of the memory
   2125 ///    location has to be 16-byte aligned.
   2126 /// \param __a
   2127 ///    A 128-bit vector of [4 x float] containing the four values to be stored.
   2128 static __inline__ void __DEFAULT_FN_ATTRS
   2129 _mm_store_ps(float *__p, __m128 __a)
   2130 {
   2131   *(__m128*)__p = __a;
   2132 }
   2133 
   2134 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
   2135 ///    four contiguous elements in an aligned memory location.
   2136 ///
   2137 /// \headerfile <x86intrin.h>
   2138 ///
   2139 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
   2140 ///    instruction.
   2141 ///
   2142 /// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 16-byte aligned.
   2144 /// \param __a
   2145 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
   2146 ///    of the four contiguous elements pointed by \a __p.
   2147 static __inline__ void __DEFAULT_FN_ATTRS
   2148 _mm_store1_ps(float *__p, __m128 __a)
   2149 {
   2150   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
   2151   _mm_store_ps(__p, __a);
   2152 }
   2153 
   2154 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
   2155 ///    four contiguous elements in an aligned memory location.
   2156 ///
   2157 /// \headerfile <x86intrin.h>
   2158 ///
   2159 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
   2160 ///    instruction.
   2161 ///
   2162 /// \param __p
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 16-byte aligned.
   2164 /// \param __a
   2165 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
   2166 ///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  /* Alternate spelling of _mm_store1_ps; forwards both arguments unchanged. */
  _mm_store1_ps(__p, __a);
}
   2172 
   2173 /// Stores float values from a 128-bit vector of [4 x float] to an
   2174 ///    aligned memory location in reverse order.
   2175 ///
   2176 /// \headerfile <x86intrin.h>
   2177 ///
   2178 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
   2179 ///    instruction.
   2180 ///
   2181 /// \param __p
   2182 ///    A pointer to a 128-bit memory location. The address of the memory
   2183 ///    location has to be 128-bit aligned.
   2184 /// \param __a
   2185 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   2186 static __inline__ void __DEFAULT_FN_ATTRS
   2187 _mm_storer_ps(float *__p, __m128 __a)
   2188 {
   2189   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
   2190   _mm_store_ps(__p, __a);
   2191 }
   2192 
/* Hint selectors for _mm_prefetch(). The _mm_prefetch macro in this header
 * decodes them as follows: bits [1:0] become the locality argument of
 * __builtin_prefetch and bit 2 becomes its read/write argument. The two ET
 * values therefore request the same locality as T0 / T1 with the write flag
 * set. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
   2199 
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to one of the <c> PREFETCHNTA </c>,
///    <c> PREFETCHT0 </c>, <c> PREFETCHT1 </c> or <c> PREFETCHT2 </c>
///    instructions, as selected by \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    _MM_HINT_ET0, _MM_HINT_ET1: Same locality as _MM_HINT_T0 and
///    _MM_HINT_T1 respectively, with the read/write argument of
///    __builtin_prefetch set to 1. \n
///    The macro passes bit 2 of \a sel as the read/write argument and bits
///    [1:0] as the locality argument of __builtin_prefetch.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
   2231 
   2232 /// Stores a 64-bit integer in the specified aligned memory location. To
   2233 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
   2234 ///    used again soon).
   2235 ///
   2236 /// \headerfile <x86intrin.h>
   2237 ///
   2238 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
   2239 ///
   2240 /// \param __p
   2241 ///    A pointer to an aligned memory location used to store the register value.
   2242 /// \param __a
   2243 ///    A 64-bit integer containing the value to be stored.
   2244 static __inline__ void __DEFAULT_FN_ATTRS
   2245 _mm_stream_pi(void *__p, __m64 __a)
   2246 {
   2247   __builtin_nontemporal_store(__a, (__m64 *)__p);
   2248 }
   2249 
   2250 /// Moves packed float values from a 128-bit vector of [4 x float] to a
   2251 ///    128-bit aligned memory location. To minimize caching, the data is flagged
   2252 ///    as non-temporal (unlikely to be used again soon).
   2253 ///
   2254 /// \headerfile <x86intrin.h>
   2255 ///
   2256 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
   2257 ///
   2258 /// \param __p
   2259 ///    A pointer to a 128-bit aligned memory location that will receive the
   2260 ///    single-precision floating-point values.
   2261 /// \param __a
   2262 ///    A 128-bit vector of [4 x float] containing the values to be moved.
   2263 static __inline__ void __DEFAULT_FN_ATTRS
   2264 _mm_stream_ps(void *__p, __m128 __a)
   2265 {
   2266   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
   2267 }
   2268 
   2269 #if defined(__cplusplus)
   2270 extern "C" {
   2271 #endif
   2272 
   2273 /// Forces strong memory ordering (serialization) between store
   2274 ///    instructions preceding this instruction and store instructions following
   2275 ///    this instruction, ensuring the system completes all previous stores
   2276 ///    before executing subsequent stores.
   2277 ///
   2278 /// \headerfile <x86intrin.h>
   2279 ///
   2280 /// This intrinsic corresponds to the <c> SFENCE </c> instruction.
   2281 ///
   2282 void _mm_sfence(void);
   2283 
   2284 #if defined(__cplusplus)
   2285 } // extern "C"
   2286 #endif
   2287 
   2288 /// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
   2289 ///    returns it, as specified by the immediate integer operand.
   2290 ///
   2291 /// \headerfile <x86intrin.h>
   2292 ///
   2293 /// \code
   2294 /// int _mm_extract_pi16(__m64 a, int n);
   2295 /// \endcode
   2296 ///
   2297 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
   2298 ///
   2299 /// \param a
   2300 ///    A 64-bit vector of [4 x i16].
   2301 /// \param n
   2302 ///    An immediate integer operand that determines which bits are extracted: \n
   2303 ///    0: Bits [15:0] are copied to the destination. \n
   2304 ///    1: Bits [31:16] are copied to the destination. \n
   2305 ///    2: Bits [47:32] are copied to the destination. \n
   2306 ///    3: Bits [63:48] are copied to the destination.
   2307 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
   2308 #define _mm_extract_pi16(a, n) \
   2309   ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
   2310 
   2311 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
   2312 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
   2313 ///    specified by the immediate operand \a n.
   2314 ///
   2315 /// \headerfile <x86intrin.h>
   2316 ///
   2317 /// \code
   2318 /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
   2319 /// \endcode
   2320 ///
   2321 /// This intrinsic corresponds to the <c> PINSRW </c> instruction.
   2322 ///
   2323 /// \param a
   2324 ///    A 64-bit vector of [4 x i16].
   2325 /// \param d
   2326 ///    An integer. The lower 16-bit value from this operand is written to the
   2327 ///    destination at the offset specified by operand \a n.
   2328 /// \param n
   2329 ///    An immediate integer operant that determines which the bits to be used
   2330 ///    in the destination. \n
   2331 ///    0: Bits [15:0] are copied to the destination. \n
   2332 ///    1: Bits [31:16] are copied to the destination. \n
   2333 ///    2: Bits [47:32] are copied to the destination. \n
   2334 ///    3: Bits [63:48] are copied to the destination.  \n
   2335 ///    The remaining bits in the destination are copied from the corresponding
   2336 ///    bits in operand \a a.
   2337 /// \returns A 64-bit integer vector containing the copied packed data from the
   2338 ///    operands.
   2339 #define _mm_insert_pi16(a, d, n) \
   2340   ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
   2341 
   2342 /// Compares each of the corresponding packed 16-bit integer values of
   2343 ///    the 64-bit integer vectors, and writes the greater value to the
   2344 ///    corresponding bits in the destination.
   2345 ///
   2346 /// \headerfile <x86intrin.h>
   2347 ///
   2348 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
   2349 ///
   2350 /// \param __a
   2351 ///    A 64-bit integer vector containing one of the source operands.
   2352 /// \param __b
   2353 ///    A 64-bit integer vector containing one of the source operands.
   2354 /// \returns A 64-bit integer vector containing the comparison results.
   2355 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2356 _mm_max_pi16(__m64 __a, __m64 __b)
   2357 {
   2358   return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
   2359 }
   2360 
   2361 /// Compares each of the corresponding packed 8-bit unsigned integer
   2362 ///    values of the 64-bit integer vectors, and writes the greater value to the
   2363 ///    corresponding bits in the destination.
   2364 ///
   2365 /// \headerfile <x86intrin.h>
   2366 ///
   2367 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
   2368 ///
   2369 /// \param __a
   2370 ///    A 64-bit integer vector containing one of the source operands.
   2371 /// \param __b
   2372 ///    A 64-bit integer vector containing one of the source operands.
   2373 /// \returns A 64-bit integer vector containing the comparison results.
   2374 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2375 _mm_max_pu8(__m64 __a, __m64 __b)
   2376 {
   2377   return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
   2378 }
   2379 
   2380 /// Compares each of the corresponding packed 16-bit integer values of
   2381 ///    the 64-bit integer vectors, and writes the lesser value to the
   2382 ///    corresponding bits in the destination.
   2383 ///
   2384 /// \headerfile <x86intrin.h>
   2385 ///
   2386 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
   2387 ///
   2388 /// \param __a
   2389 ///    A 64-bit integer vector containing one of the source operands.
   2390 /// \param __b
   2391 ///    A 64-bit integer vector containing one of the source operands.
   2392 /// \returns A 64-bit integer vector containing the comparison results.
   2393 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2394 _mm_min_pi16(__m64 __a, __m64 __b)
   2395 {
   2396   return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
   2397 }
   2398 
   2399 /// Compares each of the corresponding packed 8-bit unsigned integer
   2400 ///    values of the 64-bit integer vectors, and writes the lesser value to the
   2401 ///    corresponding bits in the destination.
   2402 ///
   2403 /// \headerfile <x86intrin.h>
   2404 ///
   2405 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
   2406 ///
   2407 /// \param __a
   2408 ///    A 64-bit integer vector containing one of the source operands.
   2409 /// \param __b
   2410 ///    A 64-bit integer vector containing one of the source operands.
   2411 /// \returns A 64-bit integer vector containing the comparison results.
   2412 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2413 _mm_min_pu8(__m64 __a, __m64 __b)
   2414 {
   2415   return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
   2416 }
   2417 
   2418 /// Takes the most significant bit from each 8-bit element in a 64-bit
   2419 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
   2420 ///    32-bit integer and writes it to the destination.
   2421 ///
   2422 /// \headerfile <x86intrin.h>
   2423 ///
   2424 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
   2425 ///
   2426 /// \param __a
   2427 ///    A 64-bit integer vector containing the values with bits to be extracted.
   2428 /// \returns The most significant bit from each 8-bit element in \a __a,
   2429 ///    written to bits [7:0].
   2430 static __inline__ int __DEFAULT_FN_ATTRS_SSE2
   2431 _mm_movemask_pi8(__m64 __a)
   2432 {
   2433   return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
   2434 }
   2435 
   2436 /// Multiplies packed 16-bit unsigned integer values and writes the
   2437 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
   2438 ///    the destination.
   2439 ///
   2440 /// \headerfile <x86intrin.h>
   2441 ///
   2442 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
   2443 ///
   2444 /// \param __a
   2445 ///    A 64-bit integer vector containing one of the source operands.
   2446 /// \param __b
   2447 ///    A 64-bit integer vector containing one of the source operands.
   2448 /// \returns A 64-bit integer vector containing the products of both operands.
   2449 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2450 _mm_mulhi_pu16(__m64 __a, __m64 __b)
   2451 {
   2452   return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
   2453                                              (__v8hi)__anyext128(__b)));
   2454 }
   2455 
   2456 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
   2457 ///    destination, as specified by the immediate value operand.
   2458 ///
   2459 /// \headerfile <x86intrin.h>
   2460 ///
   2461 /// \code
   2462 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
   2463 /// \endcode
   2464 ///
   2465 /// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
   2466 ///
   2467 /// \param a
   2468 ///    A 64-bit integer vector containing the values to be shuffled.
   2469 /// \param n
   2470 ///    An immediate value containing an 8-bit value specifying which elements to
   2471 ///    copy from \a a. The destinations within the 64-bit destination are
   2472 ///    assigned values as follows: \n
   2473 ///    Bits [1:0] are used to assign values to bits [15:0] in the
   2474 ///    destination. \n
   2475 ///    Bits [3:2] are used to assign values to bits [31:16] in the
   2476 ///    destination. \n
   2477 ///    Bits [5:4] are used to assign values to bits [47:32] in the
   2478 ///    destination. \n
   2479 ///    Bits [7:6] are used to assign values to bits [63:48] in the
   2480 ///    destination. \n
   2481 ///    Bit value assignments: \n
   2482 ///    00: assigned from bits [15:0] of \a a. \n
   2483 ///    01: assigned from bits [31:16] of \a a. \n
   2484 ///    10: assigned from bits [47:32] of \a a. \n
   2485 ///    11: assigned from bits [63:48] of \a a. \n
   2486 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
   2487 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
   2488 ///    <c>[b6, b4, b2, b0]</c>.
   2489 /// \returns A 64-bit integer vector containing the shuffled values.
/* Each 2-bit field of n is masked to the range 0..3, so every selected lane
   comes from the first shuffle operand (a); the second operand is only an
   empty placeholder vector and is never read from. */
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
   2494 
   2495 /// Conditionally copies the values from each 8-bit element in the first
   2496 ///    64-bit integer vector operand to the specified memory location, as
   2497 ///    specified by the most significant bit in the corresponding element in the
   2498 ///    second 64-bit integer vector operand.
   2499 ///
   2500 ///    To minimize caching, the data is flagged as non-temporal
   2501 ///    (unlikely to be used again soon).
   2502 ///
   2503 /// \headerfile <x86intrin.h>
   2504 ///
   2505 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
   2506 ///
   2507 /// \param __d
   2508 ///    A 64-bit integer vector containing the values with elements to be copied.
   2509 /// \param __n
   2510 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
   2511 ///    element determines whether the corresponding element in operand \a __d
   2512 ///    is copied. If the most significant bit of a given element is 1, the
   2513 ///    corresponding element in operand \a __d is copied.
   2514 /// \param __p
   2515 ///    A pointer to a 64-bit memory location that will receive the conditionally
   2516 ///    copied integer values. The address of the memory location does not have
   2517 ///    to be aligned.
   2518 static __inline__ void __DEFAULT_FN_ATTRS_SSE2
   2519 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
   2520 {
   2521   // This is complex, because we need to support the case where __p is pointing
   2522   // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
   2523   // write might cause a trap where a 64-bit maskmovq would not. (Memory
   2524   // locations not selected by the mask bits might still cause traps.)
   2525   __m128i __d128  = __anyext128(__d);
   2526   __m128i __n128  = __zext128(__n);
   2527   if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
   2528       ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
   2529     // If there's a risk of spurious trap due to a 128-bit write, back up the
   2530     // pointer by 8 bytes and shift values in registers to match.
   2531     __p -= 8;
   2532     __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
   2533     __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
   2534   }
   2535 
   2536   __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
   2537 }
   2538 
   2539 /// Computes the rounded averages of the packed unsigned 8-bit integer
   2540 ///    values and writes the averages to the corresponding bits in the
   2541 ///    destination.
   2542 ///
   2543 /// \headerfile <x86intrin.h>
   2544 ///
   2545 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
   2546 ///
   2547 /// \param __a
   2548 ///    A 64-bit integer vector containing one of the source operands.
   2549 /// \param __b
   2550 ///    A 64-bit integer vector containing one of the source operands.
   2551 /// \returns A 64-bit integer vector containing the averages of both operands.
   2552 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2553 _mm_avg_pu8(__m64 __a, __m64 __b)
   2554 {
   2555   return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
   2556                                            (__v16qi)__anyext128(__b)));
   2557 }
   2558 
   2559 /// Computes the rounded averages of the packed unsigned 16-bit integer
   2560 ///    values and writes the averages to the corresponding bits in the
   2561 ///    destination.
   2562 ///
   2563 /// \headerfile <x86intrin.h>
   2564 ///
   2565 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
   2566 ///
   2567 /// \param __a
   2568 ///    A 64-bit integer vector containing one of the source operands.
   2569 /// \param __b
   2570 ///    A 64-bit integer vector containing one of the source operands.
   2571 /// \returns A 64-bit integer vector containing the averages of both operands.
   2572 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2573 _mm_avg_pu16(__m64 __a, __m64 __b)
   2574 {
   2575   return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
   2576                                            (__v8hi)__anyext128(__b)));
   2577 }
   2578 
/// Subtracts the corresponding 8-bit unsigned integer values of the two
///    64-bit vector operands and computes the absolute value of each
///    difference. The sum of the 8 absolute differences is then written to
///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
   2583 ///
   2584 /// \headerfile <x86intrin.h>
   2585 ///
   2586 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
   2587 ///
   2588 /// \param __a
   2589 ///    A 64-bit integer vector containing one of the source operands.
   2590 /// \param __b
   2591 ///    A 64-bit integer vector containing one of the source operands.
   2592 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
   2593 ///    sets of absolute differences between both operands. The upper bits are
   2594 ///    cleared.
   2595 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2596 _mm_sad_pu8(__m64 __a, __m64 __b)
   2597 {
   2598   return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
   2599                                             (__v16qi)__zext128(__b)));
   2600 }
   2601 
   2602 #if defined(__cplusplus)
   2603 extern "C" {
   2604 #endif
   2605 
   2606 /// Returns the contents of the MXCSR register as a 32-bit unsigned
   2607 ///    integer value.
   2608 ///
   2609 ///    There are several groups of macros associated with this
   2610 ///    intrinsic, including:
   2611 ///    <ul>
   2612 ///    <li>
   2613 ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
   2614 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
   2615 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
   2616 ///      _MM_GET_EXCEPTION_STATE().
   2617 ///    </li>
   2618 ///    <li>
   2619 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
   2620 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
   2621 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
   2622 ///    </li>
   2623 ///    <li>
   2624 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
   2625 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
   2626 ///      _MM_GET_ROUNDING_MODE().
   2627 ///    </li>
   2628 ///    <li>
   2629 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
   2630 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
   2631 ///    </li>
   2632 ///    <li>
   2633 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
   2634 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
   2635 ///      _MM_GET_DENORMALS_ZERO_MODE().
   2636 ///    </li>
   2637 ///    </ul>
   2638 ///
   2639 ///    For example, the following expression checks if an overflow exception has
   2640 ///    occurred:
   2641 ///    \code
   2642 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
   2643 ///    \endcode
   2644 ///
   2645 ///    The following expression gets the current rounding mode:
   2646 ///    \code
   2647 ///      _MM_GET_ROUNDING_MODE()
   2648 ///    \endcode
   2649 ///
   2650 /// \headerfile <x86intrin.h>
   2651 ///
   2652 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
   2653 ///
   2654 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
   2655 ///    register.
   2656 unsigned int _mm_getcsr(void);
   2657 
   2658 /// Sets the MXCSR register with the 32-bit unsigned integer value.
   2659 ///
   2660 ///    There are several groups of macros associated with this intrinsic,
   2661 ///    including:
   2662 ///    <ul>
   2663 ///    <li>
   2664 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
   2665 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
   2666 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
   2667 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
   2668 ///    </li>
   2669 ///    <li>
   2670 ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
   2671 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
   2672 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
   2673 ///      of these macros.
   2674 ///    </li>
   2675 ///    <li>
   2676 ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
   2677 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
   2678 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
   2679 ///    </li>
   2680 ///    <li>
   2681 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
   2682 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
   2683 ///      one of these macros.
   2684 ///    </li>
   2685 ///    <li>
   2686 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
   2687 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
   2688 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
   2689 ///    </li>
   2690 ///    </ul>
   2691 ///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
   2695 ///
   2696 ///    The following example sets the DAZ and FTZ flags:
   2697 ///    \code
   2698 ///    void setFlags() {
   2699 ///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
   2700 ///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
   2701 ///    }
   2702 ///    \endcode
   2703 ///
   2704 /// \headerfile <x86intrin.h>
   2705 ///
   2706 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
   2707 ///
   2708 /// \param __i
   2709 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
   2710 void _mm_setcsr(unsigned int __i);
   2711 
   2712 #if defined(__cplusplus)
   2713 } // extern "C"
   2714 #endif
   2715 
   2716 /// Selects 4 float values from the 128-bit operands of [4 x float], as
   2717 ///    specified by the immediate value operand.
   2718 ///
   2719 /// \headerfile <x86intrin.h>
   2720 ///
   2721 /// \code
   2722 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
   2723 /// \endcode
   2724 ///
   2725 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
   2726 ///
   2727 /// \param a
   2728 ///    A 128-bit vector of [4 x float].
   2729 /// \param b
   2730 ///    A 128-bit vector of [4 x float].
   2731 /// \param mask
   2732 ///    An immediate value containing an 8-bit value specifying which elements to
   2733 ///    copy from \a a and \a b. \n
   2734 ///    Bits [3:0] specify the values copied from operand \a a. \n
   2735 ///    Bits [7:4] specify the values copied from operand \a b. \n
   2736 ///    The destinations within the 128-bit destination are assigned values as
   2737 ///    follows: \n
   2738 ///    Bits [1:0] are used to assign values to bits [31:0] in the
   2739 ///    destination. \n
   2740 ///    Bits [3:2] are used to assign values to bits [63:32] in the
   2741 ///    destination. \n
   2742 ///    Bits [5:4] are used to assign values to bits [95:64] in the
   2743 ///    destination. \n
   2744 ///    Bits [7:6] are used to assign values to bits [127:96] in the
   2745 ///    destination. \n
   2746 ///    Bit value assignments: \n
   2747 ///    00: Bits [31:0] copied from the specified operand. \n
   2748 ///    01: Bits [63:32] copied from the specified operand. \n
   2749 ///    10: Bits [95:64] copied from the specified operand. \n
   2750 ///    11: Bits [127:96] copied from the specified operand. \n
   2751 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
   2752 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
   2753 ///    <c>[b6, b4, b2, b0]</c>.
   2754 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* Both vector arguments are first converted to __m128 and then to __v4sf,
   and mask is converted to int, before being handed to the builtin.
   NOTE(review): presumably a macro rather than a function because the
   builtin needs mask as a compile-time constant -- confirm. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
   2758 
   2759 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
   2760 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
   2761 ///
   2762 /// \headerfile <x86intrin.h>
   2763 ///
   2764 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
   2765 ///
   2766 /// \param __a
   2767 ///    A 128-bit vector of [4 x float]. \n
   2768 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
   2769 ///    Bits [127:96] are written to bits [95:64] of the destination.
   2770 /// \param __b
///    A 128-bit vector of [4 x float]. \n
   2772 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
   2773 ///    Bits [127:96] are written to bits [127:96] of the destination.
   2774 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_ps(__m128 __a, __m128 __b) {
  /* Shuffle indices 0-3 select from __a and 4-7 from __b, so the result is
     { __a[2], __b[2], __a[3], __b[3] }. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
   2779 
   2780 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
   2781 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
   2782 ///
   2783 /// \headerfile <x86intrin.h>
   2784 ///
   2785 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
   2786 ///
   2787 /// \param __a
   2788 ///    A 128-bit vector of [4 x float]. \n
   2789 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
   2790 ///    Bits [63:32] are written to bits [95:64] of the destination.
   2791 /// \param __b
   2792 ///    A 128-bit vector of [4 x float]. \n
   2793 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
   2794 ///    Bits [63:32] are written to bits [127:96] of the destination.
   2795 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_ps(__m128 __a, __m128 __b) {
  /* Shuffle indices 0-3 select from __a and 4-7 from __b, so the result is
     { __a[0], __b[0], __a[1], __b[1] }. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
   2800 
   2801 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2802 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
   2803 ///    96 bits are set to the upper 96 bits of the first parameter.
   2804 ///
   2805 /// \headerfile <x86intrin.h>
   2806 ///
   2807 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
   2808 ///    instruction.
   2809 ///
   2810 /// \param __a
   2811 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
   2812 ///    written to the upper 96 bits of the result.
   2813 /// \param __b
   2814 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
   2815 ///    written to the lower 32 bits of the result.
   2816 /// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_ss(__m128 __a, __m128 __b) {
  /* __a is a by-value copy: overwrite its element 0 with element 0 of __b
     and return it, leaving elements 1-3 of __a unchanged. */
  __a[0] = __b[0];
  return __a;
}
   2822 
   2823 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2824 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
   2825 ///    64 bits are set to the upper 64 bits of the first parameter.
   2826 ///
   2827 /// \headerfile <x86intrin.h>
   2828 ///
   2829 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
   2830 ///
   2831 /// \param __a
   2832 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
   2833 ///    written to the upper 64 bits of the result.
   2834 /// \param __b
   2835 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
   2836 ///    written to the lower 64 bits of the result.
   2837 /// \returns A 128-bit vector of [4 x float]: {__b[2], __b[3], __a[2], __a[3]}.
   2838 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   2839 _mm_movehl_ps(__m128 __a, __m128 __b) {
   2840   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
   2841 }
   2842 
   2843 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
   2844 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
   2845 ///    64 bits are set to the lower 64 bits of the second parameter.
   2846 ///
   2847 /// \headerfile <x86intrin.h>
   2848 ///
   2849 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
   2850 ///
   2851 /// \param __a
   2852 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
   2853 ///    written to the lower 64 bits of the result.
   2854 /// \param __b
   2855 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
   2856 ///    written to the upper 64 bits of the result.
   2857 /// \returns A 128-bit vector of [4 x float]: {__a[0], __a[1], __b[0], __b[1]}.
   2858 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
   2859 _mm_movelh_ps(__m128 __a, __m128 __b) {
   2860   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
   2861 }
   2862 
   2863 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of
   2864 ///    [4 x float].
   2865 ///
   2866 /// \headerfile <x86intrin.h>
   2867 ///
   2868 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2869 ///
   2870 /// \param __a
   2871 ///    A 64-bit vector of [4 x i16]. Each element is converted to the element
   2872 ///    at the same index of the destination.
   2873 /// \returns A 128-bit vector of [4 x float] containing the converted values
   2874 ///    from the operand.
   2875 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   2876 _mm_cvtpi16_ps(__m64 __a)
   2877 {
   2878   return __builtin_convertvector((__v4hi)__a, __v4sf);
   2879 }
   2880 
   2881 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
   2882 ///    128-bit vector of [4 x float].
   2883 ///
   2884 /// \headerfile <x86intrin.h>
   2885 ///
   2886 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2887 ///
   2888 /// \param __a
   2889 ///    A 64-bit vector of 16-bit unsigned integer values. Each element is
   2890 ///    converted to the element at the same index of the destination.
   2891 /// \returns A 128-bit vector of [4 x float] containing the converted values
   2892 ///    from the operand.
   2893 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   2894 _mm_cvtpu16_ps(__m64 __a)
   2895 {
   2896   return __builtin_convertvector((__v4hu)__a, __v4sf);
   2897 }
   2898 
   2899 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
   2900 ///    into a 128-bit vector of [4 x float].
   2901 ///
   2902 /// \headerfile <x86intrin.h>
   2903 ///
   2904 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2905 ///
   2906 /// \param __a
   2907 ///    A 64-bit vector of [8 x i8]. Elements 0-3 are converted to the elements
   2908 ///    at the same indices of the destination; elements 4-7 are not used.
   2909 /// \returns A 128-bit vector of [4 x float] containing the converted values
   2910 ///    from the operand.
   2911 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   2912 _mm_cvtpi8_ps(__m64 __a)
   2913 {
   2914   return __builtin_convertvector(
   2915       __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
   2916                               0, 1, 2, 3), __v4sf);
   2917 }
   2918 
   2919 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
   2920 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
   2921 ///
   2922 /// \headerfile <x86intrin.h>
   2923 ///
   2924 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2925 ///
   2926 /// \param __a
   2927 ///    A 64-bit vector of unsigned 8-bit integer values. Elements 0-3 are
   2928 ///    converted to the elements at the same indices of the destination;
   2929 ///    elements 4-7 are not used.
   2930 /// \returns A 128-bit vector of [4 x float] containing the converted values
   2931 ///    from the operand.
   2932 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   2933 _mm_cvtpu8_ps(__m64 __a)
   2934 {
   2935   return __builtin_convertvector(
   2936       __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
   2937                               0, 1, 2, 3), __v4sf);
   2938 }
   2939 
   2940 /// Converts the two 32-bit signed integer values from each 64-bit vector
   2941 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
   2942 ///
   2943 /// \headerfile <x86intrin.h>
   2944 ///
   2945 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
   2946 ///
   2947 /// \param __a
   2948 ///    A 64-bit vector of [2 x i32]. Its two elements are converted to elements
   2949 ///    0 and 1 of the destination.
   2950 /// \param __b
   2951 ///    A 64-bit vector of [2 x i32]. Its two elements are converted to elements
   2952 ///    2 and 3 of the destination.
   2953 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
   2954 ///    converted values from the first operand. The upper 64 bits contain the
   2955 ///    converted values from the second operand.
   2956 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
   2957 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
   2958 {
   2959   return __builtin_convertvector(
   2960       __builtin_shufflevector((__v2si)__a, (__v2si)__b,
   2961                               0, 1, 2, 3), __v4sf);
   2962 }
   2963 
   2964 /// Converts each single-precision floating-point element of a 128-bit
   2965 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
   2966 ///    packs the results into a 64-bit integer vector of [4 x i16].
   2967 ///
   2968 ///    If the floating-point element is NaN or infinity, or if the
   2969 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
   2970 ///    it is converted to 0x8000. Otherwise, if the floating-point element is
   2971 ///    greater than 0x7FFF, it is converted to 0x7FFF.
   2972 ///
   2973 /// \headerfile <x86intrin.h>
   2974 ///
   2975 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
   2976 ///
   2977 /// \param __a
   2978 ///    A 128-bit floating-point vector of [4 x float].
   2979 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
   2980 ///    values.
   2981 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   2982 _mm_cvtps_pi16(__m128 __a)
   2983 {
   2984   return __trunc64(__builtin_ia32_packssdw128(
   2985       (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
   2986 }
   2987 
   2988 /// Converts each single-precision floating-point element of a 128-bit
   2989 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
   2990 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
   2991 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
   2992 ///
   2993 ///    If the floating-point element is NaN or infinity, or if the
   2994 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
   2995 ///    is converted to 0x80. Otherwise, if the floating-point element is
   2996 ///    greater than 0x7F, it is converted to 0x7F.
   2997 ///
   2998 /// \headerfile <x86intrin.h>
   2999 ///
   3000 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
   3001 ///
   3002 /// \param __a
   3003 ///    A 128-bit floating-point vector of [4 x float].
   3004 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
   3005 ///    converted values and the upper 32 bits are set to zero.
   3006 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
   3007 _mm_cvtps_pi8(__m128 __a)
   3008 {
   3009   __m64 __b, __c;
   3010 
   3011   __b = _mm_cvtps_pi16(__a);
   3012   __c = _mm_setzero_si64();
   3013 
   3014   return _mm_packs_pi16(__b, __c);
   3015 }
   3016 
   3017 /// Extracts the sign bits from each single-precision floating-point
   3018 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
   3019 ///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
   3020 ///    to zero.
   3021 ///
   3022 /// \headerfile <x86intrin.h>
   3023 ///
   3024 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
   3025 ///
   3026 /// \param __a
   3027 ///    A 128-bit floating-point vector of [4 x float].
   3028 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
   3029 ///    single-precision floating-point element of the parameter. Bits [31:4] are
   3030 ///    set to zero.
   3031 static __inline__ int __DEFAULT_FN_ATTRS
   3032 _mm_movemask_ps(__m128 __a)
   3033 {
   3034   return __builtin_ia32_movmskps((__v4sf)__a);
   3035 }
   3036 
   3037 /* Comparison predicates: values for the immediate of _mm_cmp_ps/_mm_cmp_ss. */
   3038 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
   3039 #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
   3040 #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
   3041 #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
   3042 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
   3043 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
   3044 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
   3045 #define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
   3046 
   3047 /// Compares each of the corresponding values of two 128-bit vectors of
   3048 ///    [4 x float], using the operation specified by the immediate integer
   3049 ///    operand.
   3050 ///
   3051 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
   3052 ///    If either value in a comparison is NaN, comparisons that are ordered
   3053 ///    return false, and comparisons that are unordered return true.
   3054 ///
   3055 /// \headerfile <x86intrin.h>
   3056 ///
   3057 /// \code
   3058 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
   3059 /// \endcode
   3060 ///
   3061 /// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
   3062 ///
   3063 /// \param a
   3064 ///    A 128-bit vector of [4 x float].
   3065 /// \param b
   3066 ///    A 128-bit vector of [4 x float].
   3067 /// \param c
   3068 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   3069 ///    operation to use: \n
   3070 ///    0x00 (_CMP_EQ_OQ): Equal (ordered, non-signaling) \n
   3071 ///    0x01 (_CMP_LT_OS): Less-than (ordered, signaling) \n
   3072 ///    0x02 (_CMP_LE_OS): Less-than-or-equal (ordered, signaling) \n
   3073 ///    0x03 (_CMP_UNORD_Q): Unordered (non-signaling) \n
   3074 ///    0x04 (_CMP_NEQ_UQ): Not-equal (unordered, non-signaling) \n
   3075 ///    0x05 (_CMP_NLT_US): Not-less-than (unordered, signaling) \n
   3076 ///    0x06 (_CMP_NLE_US): Not-less-than-or-equal (unordered, signaling) \n
   3077 ///    0x07 (_CMP_ORD_Q): Ordered (non-signaling)
   3078 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   3079 #define _mm_cmp_ps(a, b, c)                                                    \
   3080   ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
   3081 
   3082 /// Compares each of the corresponding scalar values of two 128-bit
   3083 ///    vectors of [4 x float], using the operation specified by the immediate
   3084 ///    integer operand.
   3085 ///
   3086 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
   3087 ///    If either value in a comparison is NaN, comparisons that are ordered
   3088 ///    return false, and comparisons that are unordered return true.
   3089 ///
   3090 /// \headerfile <x86intrin.h>
   3091 ///
   3092 /// \code
   3093 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
   3094 /// \endcode
   3095 ///
   3096 /// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
   3097 ///
   3098 /// \param a
   3099 ///    A 128-bit vector of [4 x float].
   3100 /// \param b
   3101 ///    A 128-bit vector of [4 x float].
   3102 /// \param c
   3103 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   3104 ///    operation to use: \n
   3105 ///    0x00 (_CMP_EQ_OQ): Equal (ordered, non-signaling) \n
   3106 ///    0x01 (_CMP_LT_OS): Less-than (ordered, signaling) \n
   3107 ///    0x02 (_CMP_LE_OS): Less-than-or-equal (ordered, signaling) \n
   3108 ///    0x03 (_CMP_UNORD_Q): Unordered (non-signaling) \n
   3109 ///    0x04 (_CMP_NEQ_UQ): Not-equal (unordered, non-signaling) \n
   3110 ///    0x05 (_CMP_NLT_US): Not-less-than (unordered, signaling) \n
   3111 ///    0x06 (_CMP_NLE_US): Not-less-than-or-equal (unordered, signaling) \n
   3112 ///    0x07 (_CMP_ORD_Q): Ordered (non-signaling)
   3113 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   3114 #define _mm_cmp_ss(a, b, c)                                                    \
   3115   ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
   3116 
   3117 #define _MM_ALIGN16 __attribute__((aligned(16)))
   3118 /* Packs 2-bit fields: z -> [7:6], y -> [5:4], x -> [3:2], w -> [1:0]. */
   3119 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
   3120 
   3121 #define _MM_EXCEPT_INVALID    (0x0001U)
   3122 #define _MM_EXCEPT_DENORM     (0x0002U)
   3123 #define _MM_EXCEPT_DIV_ZERO   (0x0004U)
   3124 #define _MM_EXCEPT_OVERFLOW   (0x0008U)
   3125 #define _MM_EXCEPT_UNDERFLOW  (0x0010U)
   3126 #define _MM_EXCEPT_INEXACT    (0x0020U)
   3127 #define _MM_EXCEPT_MASK       (0x003fU) /* OR of all _MM_EXCEPT_* bits */
   3128 
   3129 #define _MM_MASK_INVALID      (0x0080U) /* _MM_MASK_x == _MM_EXCEPT_x << 7 */
   3130 #define _MM_MASK_DENORM       (0x0100U)
   3131 #define _MM_MASK_DIV_ZERO     (0x0200U)
   3132 #define _MM_MASK_OVERFLOW     (0x0400U)
   3133 #define _MM_MASK_UNDERFLOW    (0x0800U)
   3134 #define _MM_MASK_INEXACT      (0x1000U)
   3135 #define _MM_MASK_MASK         (0x1f80U) /* OR of all _MM_MASK_* bits */
   3136 
   3137 #define _MM_ROUND_NEAREST     (0x0000U)
   3138 #define _MM_ROUND_DOWN        (0x2000U)
   3139 #define _MM_ROUND_UP          (0x4000U)
   3140 #define _MM_ROUND_TOWARD_ZERO (0x6000U)
   3141 #define _MM_ROUND_MASK        (0x6000U) /* covers every _MM_ROUND_* value */
   3142 
   3143 #define _MM_FLUSH_ZERO_MASK   (0x8000U)
   3144 #define _MM_FLUSH_ZERO_ON     (0x8000U)
   3145 #define _MM_FLUSH_ZERO_OFF    (0x0000U)
   3146 
   3147 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
   3148 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
   3149 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
   3150 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
   3151 /* Setters OR (x) in unmasked: pass only bits belonging to the target field. */
   3152 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
   3153 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
   3154 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
   3155 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
   3156 
   3157 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
   3158 do { /* In-place transpose of the 4x4 float matrix with rows row0..row3. */ \
   3159   __m128 __tmp3, __tmp2, __tmp1, __tmp0; /* reserved names: no capture */ \
   3160   __tmp0 = _mm_unpacklo_ps((row0), (row1)); /* {r0[0], r1[0], r0[1], r1[1]} */ \
   3161   __tmp2 = _mm_unpacklo_ps((row2), (row3)); /* {r2[0], r3[0], r2[1], r3[1]} */ \
   3162   __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
   3163   __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
   3164   (row0) = _mm_movelh_ps(__tmp0, __tmp2); /* column 0 of the input */ \
   3165   (row1) = _mm_movehl_ps(__tmp2, __tmp0); /* column 1 of the input */ \
   3166   (row2) = _mm_movelh_ps(__tmp1, __tmp3); /* rows must be __m128 lvalues; */ \
   3167   (row3) = _mm_movehl_ps(__tmp3, __tmp1); /* each is evaluated twice */ \
   3168 } while (0)
   3169 
   3170 /* Compatibility aliases: each _m_* name expands to the _mm_* name beside it. */
   3171 #define _m_pextrw _mm_extract_pi16
   3172 #define _m_pinsrw _mm_insert_pi16
   3173 #define _m_pmaxsw _mm_max_pi16
   3174 #define _m_pmaxub _mm_max_pu8
   3175 #define _m_pminsw _mm_min_pi16
   3176 #define _m_pminub _mm_min_pu8
   3177 #define _m_pmovmskb _mm_movemask_pi8
   3178 #define _m_pmulhuw _mm_mulhi_pu16
   3179 #define _m_pshufw _mm_shuffle_pi16
   3180 #define _m_maskmovq _mm_maskmove_si64
   3181 #define _m_pavgb _mm_avg_pu8
   3182 #define _m_pavgw _mm_avg_pu16
   3183 #define _m_psadbw _mm_sad_pu8
   3184 #define _m_ _mm_ /* NOTE(review): aliases the bare token _m_ only - confirm use */
   3185 
   3186 #undef __trunc64
   3187 #undef __zext128
   3188 #undef __anyext128
   3189 #undef __zeroupper64
   3190 #undef __DEFAULT_FN_ATTRS
   3191 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
   3192 #undef __DEFAULT_FN_ATTRS_SSE2
   3193 #undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
   3194 
   3195 /* Ugly hack for backwards-compatibility (compatible with gcc) */
   3196 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
   3197 #include <emmintrin.h>
   3198 #endif
   3199 
   3200 #endif /* __XMMINTRIN_H */