zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

smmintrin.h (101720B) - Raw


      1 /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __SMMINTRIN_H
     11 #define __SMMINTRIN_H
     12 
     13 #if !defined(__i386__) && !defined(__x86_64__)
     14 #error "This header is only meant to be used on x86 and x64 architecture"
     15 #endif
     16 
     17 #include <tmmintrin.h>
     18 
     19 /* Define the default attributes for the functions in this file. */
     20 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
     21 #define __DEFAULT_FN_ATTRS                                                     \
     22   __attribute__((__always_inline__, __nodebug__,                               \
     23                  __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
     24 #else
     25 #define __DEFAULT_FN_ATTRS                                                     \
     26   __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
     27                  __min_vector_width__(128)))
     28 #endif
     29 
     30 /* SSE4 Rounding macros. */
     31 #define _MM_FROUND_TO_NEAREST_INT 0x00
     32 #define _MM_FROUND_TO_NEG_INF 0x01
     33 #define _MM_FROUND_TO_POS_INF 0x02
     34 #define _MM_FROUND_TO_ZERO 0x03
     35 #define _MM_FROUND_CUR_DIRECTION 0x04
     36 
     37 #define _MM_FROUND_RAISE_EXC 0x00
     38 #define _MM_FROUND_NO_EXC 0x08
     39 
     40 #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
     41 #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
     42 #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
     43 #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
     44 #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
     45 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
     46 
     47 /// Rounds up each element of the 128-bit vector of [4 x float] to an
     48 ///    integer and returns the rounded values in a 128-bit vector of
     49 ///    [4 x float].
     50 ///
     51 /// \headerfile <x86intrin.h>
     52 ///
     53 /// \code
     54 /// __m128 _mm_ceil_ps(__m128 X);
     55 /// \endcode
     56 ///
     57 /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
     58 ///
     59 /// \param X
     60 ///    A 128-bit vector of [4 x float] values to be rounded up.
     61 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
     62 #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
     63 
     64 /// Rounds up each element of the 128-bit vector of [2 x double] to an
     65 ///    integer and returns the rounded values in a 128-bit vector of
     66 ///    [2 x double].
     67 ///
     68 /// \headerfile <x86intrin.h>
     69 ///
     70 /// \code
     71 /// __m128d _mm_ceil_pd(__m128d X);
     72 /// \endcode
     73 ///
     74 /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
     75 ///
     76 /// \param X
     77 ///    A 128-bit vector of [2 x double] values to be rounded up.
     78 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
     79 #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
     80 
     81 /// Copies three upper elements of the first 128-bit vector operand to
     82 ///    the corresponding three upper elements of the 128-bit result vector of
     83 ///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
     84 ///    operand to an integer and copies it to the lowest element of the 128-bit
     85 ///    result vector of [4 x float].
     86 ///
     87 /// \headerfile <x86intrin.h>
     88 ///
     89 /// \code
     90 /// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
     91 /// \endcode
     92 ///
     93 /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
     94 ///
     95 /// \param X
     96 ///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
     97 ///    copied to the corresponding bits of the result.
     98 /// \param Y
     99 ///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
    100 ///    rounded up to the nearest integer and copied to the corresponding bits
    101 ///    of the result.
    102 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
    103 ///    values.
    104 #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
    105 
    106 /// Copies the upper element of the first 128-bit vector operand to the
    107 ///    corresponding upper element of the 128-bit result vector of [2 x double].
    108 ///    Rounds up the lower element of the second 128-bit vector operand to an
    109 ///    integer and copies it to the lower element of the 128-bit result vector
    110 ///    of [2 x double].
    111 ///
    112 /// \headerfile <x86intrin.h>
    113 ///
    114 /// \code
    115 /// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
    116 /// \endcode
    117 ///
    118 /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
    119 ///
    120 /// \param X
    121 ///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
    122 ///    copied to the corresponding bits of the result.
    123 /// \param Y
    124 ///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
    125 ///    rounded up to the nearest integer and copied to the corresponding bits
    126 ///    of the result.
    127 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
    128 ///    values.
    129 #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
    130 
    131 /// Rounds down each element of the 128-bit vector of [4 x float] to an
    132 ///    integer and returns the rounded values in a 128-bit vector of
    133 ///    [4 x float].
    134 ///
    135 /// \headerfile <x86intrin.h>
    136 ///
    137 /// \code
    138 /// __m128 _mm_floor_ps(__m128 X);
    139 /// \endcode
    140 ///
    141 /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
    142 ///
    143 /// \param X
    144 ///    A 128-bit vector of [4 x float] values to be rounded down.
    145 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
    146 #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
    147 
    148 /// Rounds down each element of the 128-bit vector of [2 x double] to an
    149 ///    integer and returns the rounded values in a 128-bit vector of
    150 ///    [2 x double].
    151 ///
    152 /// \headerfile <x86intrin.h>
    153 ///
    154 /// \code
    155 /// __m128d _mm_floor_pd(__m128d X);
    156 /// \endcode
    157 ///
    158 /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
    159 ///
    160 /// \param X
    161 ///    A 128-bit vector of [2 x double] values to be rounded down.
    162 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
    163 #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
    164 
    165 /// Copies three upper elements of the first 128-bit vector operand to
    166 ///    the corresponding three upper elements of the 128-bit result vector of
    167 ///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
    168 ///    operand to an integer and copies it to the lowest element of the 128-bit
    169 ///    result vector of [4 x float].
    170 ///
    171 /// \headerfile <x86intrin.h>
    172 ///
    173 /// \code
    174 /// __m128 _mm_floor_ss(__m128 X, __m128 Y);
    175 /// \endcode
    176 ///
    177 /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
    178 ///
    179 /// \param X
    180 ///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
    181 ///    copied to the corresponding bits of the result.
    182 /// \param Y
    183 ///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
    184 ///    rounded down to the nearest integer and copied to the corresponding bits
    185 ///    of the result.
    186 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
    187 ///    values.
    188 #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
    189 
    190 /// Copies the upper element of the first 128-bit vector operand to the
    191 ///    corresponding upper element of the 128-bit result vector of [2 x double].
    192 ///    Rounds down the lower element of the second 128-bit vector operand to an
    193 ///    integer and copies it to the lower element of the 128-bit result vector
    194 ///    of [2 x double].
    195 ///
    196 /// \headerfile <x86intrin.h>
    197 ///
    198 /// \code
    199 /// __m128d _mm_floor_sd(__m128d X, __m128d Y);
    200 /// \endcode
    201 ///
    202 /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
    203 ///
    204 /// \param X
    205 ///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
    206 ///    copied to the corresponding bits of the result.
    207 /// \param Y
    208 ///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
    209 ///    rounded down to the nearest integer and copied to the corresponding bits
    210 ///    of the result.
    211 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
    212 ///    values.
    213 #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
    214 
    215 /// Rounds each element of the 128-bit vector of [4 x float] to an
    216 ///    integer value according to the rounding control specified by the second
    217 ///    argument and returns the rounded values in a 128-bit vector of
    218 ///    [4 x float].
    219 ///
    220 /// \headerfile <x86intrin.h>
    221 ///
    222 /// \code
    223 /// __m128 _mm_round_ps(__m128 X, const int M);
    224 /// \endcode
    225 ///
    226 /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
    227 ///
    228 /// \param X
    229 ///    A 128-bit vector of [4 x float].
    230 /// \param M
    231 ///    An integer value that specifies the rounding operation. \n
    232 ///    Bits [7:4] are reserved. \n
    233 ///    Bit [3] is a precision exception value: \n
    234 ///      0: A normal PE exception is used \n
    235 ///      1: The PE field is not updated \n
    236 ///    Bit [2] is the rounding control source: \n
    237 ///      0: Use bits [1:0] of \a M \n
    238 ///      1: Use the current MXCSR setting \n
    239 ///    Bits [1:0] contain the rounding control definition: \n
    240 ///      00: Nearest \n
    241 ///      01: Downward (toward negative infinity) \n
    242 ///      10: Upward (toward positive infinity) \n
    243 ///      11: Truncated
    244 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
    245 #define _mm_round_ps(X, M)                                                     \
    246   ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
    247 
    248 /// Copies three upper elements of the first 128-bit vector operand to
    249 ///    the corresponding three upper elements of the 128-bit result vector of
    250 ///    [4 x float]. Rounds the lowest element of the second 128-bit vector
    251 ///    operand to an integer value according to the rounding control specified
    252 ///    by the third argument and copies it to the lowest element of the 128-bit
    253 ///    result vector of [4 x float].
    254 ///
    255 /// \headerfile <x86intrin.h>
    256 ///
    257 /// \code
    258 /// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
    259 /// \endcode
    260 ///
    261 /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
    262 ///
    263 /// \param X
    264 ///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
    265 ///    copied to the corresponding bits of the result.
    266 /// \param Y
    267 ///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
    268 ///    rounded to an integer value using the specified rounding control and
    269 ///    copied to the corresponding bits of the result.
    270 /// \param M
    271 ///    An integer value that specifies the rounding operation. \n
    272 ///    Bits [7:4] are reserved. \n
    273 ///    Bit [3] is a precision exception value: \n
    274 ///      0: A normal PE exception is used \n
    275 ///      1: The PE field is not updated \n
    276 ///    Bit [2] is the rounding control source: \n
    277 ///      0: Use bits [1:0] of \a M \n
    278 ///      1: Use the current MXCSR setting \n
    279 ///    Bits [1:0] contain the rounding control definition: \n
    280 ///      00: Nearest \n
    281 ///      01: Downward (toward negative infinity) \n
    282 ///      10: Upward (toward positive infinity) \n
    283 ///      11: Truncated
    284 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
    285 ///    values.
    286 #define _mm_round_ss(X, Y, M)                                                  \
    287   ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
    288                                   (M)))
    289 
    290 /// Rounds each element of the 128-bit vector of [2 x double] to an
    291 ///    integer value according to the rounding control specified by the second
    292 ///    argument and returns the rounded values in a 128-bit vector of
    293 ///    [2 x double].
    294 ///
    295 /// \headerfile <x86intrin.h>
    296 ///
    297 /// \code
    298 /// __m128d _mm_round_pd(__m128d X, const int M);
    299 /// \endcode
    300 ///
    301 /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
    302 ///
    303 /// \param X
    304 ///    A 128-bit vector of [2 x double].
    305 /// \param M
    306 ///    An integer value that specifies the rounding operation. \n
    307 ///    Bits [7:4] are reserved. \n
    308 ///    Bit [3] is a precision exception value: \n
    309 ///      0: A normal PE exception is used \n
    310 ///      1: The PE field is not updated \n
    311 ///    Bit [2] is the rounding control source: \n
    312 ///      0: Use bits [1:0] of \a M \n
    313 ///      1: Use the current MXCSR setting \n
    314 ///    Bits [1:0] contain the rounding control definition: \n
    315 ///      00: Nearest \n
    316 ///      01: Downward (toward negative infinity) \n
    317 ///      10: Upward (toward positive infinity) \n
    318 ///      11: Truncated
    319 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
    320 #define _mm_round_pd(X, M)                                                     \
    321   ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
    322 
    323 /// Copies the upper element of the first 128-bit vector operand to the
    324 ///    corresponding upper element of the 128-bit result vector of [2 x double].
    325 ///    Rounds the lower element of the second 128-bit vector operand to an
    326 ///    integer value according to the rounding control specified by the third
    327 ///    argument and copies it to the lower element of the 128-bit result vector
    328 ///    of [2 x double].
    329 ///
    330 /// \headerfile <x86intrin.h>
    331 ///
    332 /// \code
    333 /// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
    334 /// \endcode
    335 ///
    336 /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
    337 ///
    338 /// \param X
    339 ///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
    340 ///    copied to the corresponding bits of the result.
    341 /// \param Y
    342 ///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
    343 ///    rounded to an integer value using the specified rounding control and
    344 ///    copied to the corresponding bits of the result.
    345 /// \param M
    346 ///    An integer value that specifies the rounding operation. \n
    347 ///    Bits [7:4] are reserved. \n
    348 ///    Bit [3] is a precision exception value: \n
    349 ///      0: A normal PE exception is used \n
    350 ///      1: The PE field is not updated \n
    351 ///    Bit [2] is the rounding control source: \n
    352 ///      0: Use bits [1:0] of \a M \n
    353 ///      1: Use the current MXCSR setting \n
    354 ///    Bits [1:0] contain the rounding control definition: \n
    355 ///      00: Nearest \n
    356 ///      01: Downward (toward negative infinity) \n
    357 ///      10: Upward (toward positive infinity) \n
    358 ///      11: Truncated
    359 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
    360 ///    values.
    361 #define _mm_round_sd(X, Y, M)                                                  \
    362   ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
    363                                    (M)))
    364 
    365 /* SSE4 Packed Blending Intrinsics.  */
    366 /// Returns a 128-bit vector of [2 x double] where the values are
    367 ///    selected from either the first or second operand as specified by the
    368 ///    third operand, the control mask.
    369 ///
    370 /// \headerfile <x86intrin.h>
    371 ///
    372 /// \code
    373 /// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
    374 /// \endcode
    375 ///
    376 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
    377 ///
    378 /// \param V1
    379 ///    A 128-bit vector of [2 x double].
    380 /// \param V2
    381 ///    A 128-bit vector of [2 x double].
    382 /// \param M
    383 ///    An immediate integer operand, with mask bits [1:0] specifying how the
    384 ///    values are to be copied. The position of the mask bit corresponds to the
    385 ///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
    386 ///    element in operand \a V1 is copied to the same position in the result.
    387 ///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
    388 ///    is copied to the same position in the result.
    389 /// \returns A 128-bit vector of [2 x double] containing the copied values.
    390 #define _mm_blend_pd(V1, V2, M)                                                \
    391   ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
    392                                    (__v2df)(__m128d)(V2), (int)(M)))
    393 
    394 /// Returns a 128-bit vector of [4 x float] where the values are selected
    395 ///    from either the first or second operand as specified by the third
    396 ///    operand, the control mask.
    397 ///
    398 /// \headerfile <x86intrin.h>
    399 ///
    400 /// \code
    401 /// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
    402 /// \endcode
    403 ///
    404 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
    405 ///
    406 /// \param V1
    407 ///    A 128-bit vector of [4 x float].
    408 /// \param V2
    409 ///    A 128-bit vector of [4 x float].
    410 /// \param M
    411 ///    An immediate integer operand, with mask bits [3:0] specifying how the
    412 ///    values are to be copied. The position of the mask bit corresponds to the
    413 ///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
    414 ///    element in operand \a V1 is copied to the same position in the result.
    415 ///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
    416 ///    is copied to the same position in the result.
    417 /// \returns A 128-bit vector of [4 x float] containing the copied values.
    418 #define _mm_blend_ps(V1, V2, M)                                                \
    419   ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
    420                                   (int)(M)))
    421 
    422 /// Returns a 128-bit vector of [2 x double] where each element is
    423 ///    selected from either the first or second operand as specified by the
    424 ///    third operand, the control mask.
    425 ///
    426 /// \headerfile <x86intrin.h>
    427 ///
    428 /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
    429 ///
    430 /// \param __V1
    431 ///    A 128-bit vector of [2 x double]; selected where the mask bit is 0.
    432 /// \param __V2
    433 ///    A 128-bit vector of [2 x double]; selected where the mask bit is 1.
    434 /// \param __M
    435 ///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
    436 ///    values are to be copied. The position of the mask bit corresponds to the
    437 ///    most significant bit of a copied value. When a mask bit is 0, the
    438 ///    corresponding 64-bit element in operand \a __V1 is copied to the same
    439 ///    position in the result. When a mask bit is 1, the corresponding 64-bit
    440 ///    element in operand \a __V2 is copied to the same position in the result.
    441 /// \returns A 128-bit vector of [2 x double] containing the copied values.
    442 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
    443                                                            __m128d __V2,
    444                                                            __m128d __M) {
    445   return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
    446                                           (__v2df)__M);
    447 }
    448 
    449 /// Returns a 128-bit vector of [4 x float] where each element is
    450 ///    selected from either the first or second operand as specified by the
    451 ///    third operand, the control mask.
    452 ///
    453 /// \headerfile <x86intrin.h>
    454 ///
    455 /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
    456 ///
    457 /// \param __V1
    458 ///    A 128-bit vector of [4 x float]; selected where the mask bit is 0.
    459 /// \param __V2
    460 ///    A 128-bit vector of [4 x float]; selected where the mask bit is 1.
    461 /// \param __M
    462 ///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
    463 ///    how the values are to be copied. The position of the mask bit corresponds
    464 ///    to the most significant bit of a copied value. When a mask bit is 0, the
    465 ///    corresponding 32-bit element in operand \a __V1 is copied to the same
    466 ///    position in the result. When a mask bit is 1, the corresponding 32-bit
    467 ///    element in operand \a __V2 is copied to the same position in the result.
    468 /// \returns A 128-bit vector of [4 x float] containing the copied values.
    469 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
    470                                                           __m128 __V2,
    471                                                           __m128 __M) {
    472   return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
    473                                          (__v4sf)__M);
    474 }
    475 
    476 /// Returns a 128-bit vector of [16 x i8] where each element is selected
    477 ///    from either the first or second operand as specified by the third
    478 ///    operand, the control mask.
    479 ///
    480 /// \headerfile <x86intrin.h>
    481 ///
    482 /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
    483 ///
    484 /// \param __V1
    485 ///    A 128-bit vector of [16 x i8]; selected where the mask bit is 0.
    486 /// \param __V2
    487 ///    A 128-bit vector of [16 x i8]; selected where the mask bit is 1.
    488 /// \param __M
    489 ///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
    490 ///    how the values are to be copied. The position of the mask bit corresponds
    491 ///    to the most significant bit of a copied value. When a mask bit is 0, the
    492 ///    corresponding 8-bit element in operand \a __V1 is copied to the same
    493 ///    position in the result. When a mask bit is 1, the corresponding 8-bit
    494 ///    element in operand \a __V2 is copied to the same position in the result.
    495 /// \returns A 128-bit vector of [16 x i8] containing the copied values.
    496 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
    497                                                              __m128i __V2,
    498                                                              __m128i __M) {
    499   return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
    500                                              (__v16qi)__M);
    501 }
    502 
    503 /// Returns a 128-bit vector of [8 x i16] where the values are selected
    504 ///    from either the first or second operand as specified by the third
    505 ///    operand, the control mask.
    506 ///
    507 /// \headerfile <x86intrin.h>
    508 ///
    509 /// \code
    510 /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
    511 /// \endcode
    512 ///
    513 /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
    514 ///
    515 /// \param V1
    516 ///    A 128-bit vector of [8 x i16].
    517 /// \param V2
    518 ///    A 128-bit vector of [8 x i16].
    519 /// \param M
    520 ///    An immediate integer operand, with mask bits [7:0] specifying how the
    521 ///    values are to be copied. The position of the mask bit corresponds to the
    522 ///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
    523 ///    element in operand \a V1 is copied to the same position in the result.
    524 ///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
    525 ///    is copied to the same position in the result.
    526 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
    527 #define _mm_blend_epi16(V1, V2, M)                                             \
    528   ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
    529                                       (__v8hi)(__m128i)(V2), (int)(M)))
    530 
    531 /* SSE4 Dword Multiply Instructions.  */
    532 /// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
    533 ///    and returns the lower 32 bits of each product in a 128-bit vector of
    534 ///    [4 x i32].
    535 ///
    536 /// \headerfile <x86intrin.h>
    537 ///
    538 /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
    539 ///
    540 /// \param __V1
    541 ///    A 128-bit vector of [4 x i32].
    542 /// \param __V2
    543 ///    A 128-bit vector of [4 x i32].
    544 /// \returns A 128-bit vector of [4 x i32] containing the truncated products.
    545 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
    546                                                              __m128i __V2) {
    547   return (__m128i)((__v4su)__V1 * (__v4su)__V2);
    548 }
    549 
    550 /// Multiplies corresponding even-indexed elements (0 and 2) of two
    551 ///    128-bit vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
    552 ///    containing the products.
    553 ///
    554 /// \headerfile <x86intrin.h>
    555 ///
    556 /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
    557 ///
    558 /// \param __V1
    559 ///    A 128-bit vector of [4 x i32].
    560 /// \param __V2
    561 ///    A 128-bit vector of [4 x i32].
    562 /// \returns A 128-bit vector of [2 x i64] containing the products of both
    563 ///    operands.
    564 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
    565                                                            __m128i __V2) {
    566   return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
    567 }
    568 
    569 /* SSE4 Floating Point Dot Product Instructions.  */
    570 /// Computes the dot product of the two 128-bit vectors of [4 x float]
    571 ///    and returns it in the elements of the 128-bit result vector of
    572 ///    [4 x float].
    573 ///
    574 ///    The immediate integer operand controls which input elements
    575 ///    will contribute to the dot product, and where the final results are
    576 ///    returned.
    577 ///
    578 /// \headerfile <x86intrin.h>
    579 ///
    580 /// \code
    581 /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
    582 /// \endcode
    583 ///
    584 /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
    585 ///
    586 /// \param X
    587 ///    A 128-bit vector of [4 x float].
    588 /// \param Y
    589 ///    A 128-bit vector of [4 x float].
    590 /// \param M
    591 ///    An immediate integer operand. Mask bits [7:4] determine which elements
    592 ///    of the input vectors are used, with bit [4] corresponding to the lowest
    593 ///    element and bit [7] corresponding to the highest element of each [4 x
    594 ///    float] vector. If a bit is set, the corresponding elements from the two
    595 ///    input vectors are used as an input for dot product; otherwise that input
    596 ///    is treated as zero. Bits [3:0] determine which elements of the result
    597 ///    will receive a copy of the final dot product, with bit [0] corresponding
    598 ///    to the lowest element and bit [3] corresponding to the highest element of
    599 ///    the [4 x float] result. If a bit is set, the dot product is returned
    600 ///    in the corresponding element; otherwise that element is set to zero.
    601 /// \returns A 128-bit vector of [4 x float] containing the dot product.
    602 #define _mm_dp_ps(X, Y, M)                                                     \
    603   ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
    604 
    605 /// Computes the dot product of the two 128-bit vectors of [2 x double]
    606 ///    and returns it in the elements of the 128-bit result vector of
    607 ///    [2 x double].
    608 ///
    609 ///    The immediate integer operand controls which input
    610 ///    elements will contribute to the dot product, and where the final results
    611 ///    are returned.
    612 ///
    613 /// \headerfile <x86intrin.h>
    614 ///
    615 /// \code
    616 /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
    617 /// \endcode
    618 ///
    619 /// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
    620 ///
    621 /// \param X
    622 ///    A 128-bit vector of [2 x double].
    623 /// \param Y
    624 ///    A 128-bit vector of [2 x double].
    625 /// \param M
    626 ///    An immediate integer operand. Mask bits [5:4] determine which elements
    627 ///    of the input vectors are used, with bit [4] corresponding to the lowest
    628 ///    element and bit [5] corresponding to the highest element of each [2 x
    629 ///    double] vector. If a bit is set, the corresponding elements from the two
    630 ///    input vectors are used as an input for dot product; otherwise that input
    631 ///    is treated as zero. Bits [1:0] determine which elements of the result
    632 ///    will receive a copy of the final dot product, with bit [0] corresponding
    633 ///    to the lowest element and bit [1] corresponding to the highest element of
    634 ///    each [2 x double] vector. If a bit is set, the dot product is returned in
    635 ///    the corresponding element; otherwise that element is set to zero.
    636 #define _mm_dp_pd(X, Y, M)                                                     \
    637   ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
    638                                 (M)))
    639 
    640 /* SSE4 Streaming Load Hint Instruction.  */
    641 /// Loads integer values from a 128-bit aligned memory location to a
    642 ///    128-bit integer vector using a non-temporal (streaming) load.
    643 ///
    644 /// \headerfile <x86intrin.h>
    645 ///
    646 /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
    647 ///
    648 /// \param __V
    649 ///    A pointer to a 128-bit aligned memory location that contains the integer
    650 ///    values.
    651 /// \returns A 128-bit integer vector containing the data stored at the
    652 ///    specified memory location.
    653 static __inline__ __m128i __DEFAULT_FN_ATTRS
    654 _mm_stream_load_si128(const void *__V) {
    655   return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
    656 }
    657 
    658 /* SSE4 Packed Integer Min/Max Instructions.  */
    659 /// Compares the corresponding signed elements of two 128-bit vectors of
    660 ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
    661 ///    of the two values.
    662 ///
    663 /// \headerfile <x86intrin.h>
    664 ///
    665 /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
    666 ///
    667 /// \param __V1
    668 ///    A 128-bit vector of [16 x i8].
    669 /// \param __V2
    670 ///    A 128-bit vector of [16 x i8].
    671 /// \returns A 128-bit vector of [16 x i8] containing the lesser values.
    672 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
    673                                                           __m128i __V2) {
    674   return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
    675 }
    676 
    677 /// Compares the corresponding signed elements of two 128-bit vectors of
    678 ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
    679 ///    greater value of the two.
    680 ///
    681 /// \headerfile <x86intrin.h>
    682 ///
    683 /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
    684 ///
    685 /// \param __V1
    686 ///    A 128-bit vector of [16 x i8].
    687 /// \param __V2
    688 ///    A 128-bit vector of [16 x i8].
    689 /// \returns A 128-bit vector of [16 x i8] containing the greater values.
    690 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
    691                                                           __m128i __V2) {
    692   return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
    693 }
    694 
    695 /// Compares the corresponding unsigned elements of two 128-bit vectors of
    696 ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
    697 ///    value of the two.
    698 ///
    699 /// \headerfile <x86intrin.h>
    700 ///
    701 /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
    702 ///
    703 /// \param __V1
    704 ///    A 128-bit vector of [8 x u16].
    705 /// \param __V2
    706 ///    A 128-bit vector of [8 x u16].
    707 /// \returns A 128-bit vector of [8 x u16] containing the lesser values.
    708 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
    709                                                            __m128i __V2) {
    710   return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
    711 }
    712 
    713 /// Compares the corresponding unsigned elements of two 128-bit vectors of
    714 ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
    715 ///    greater value of the two.
    716 ///
    717 /// \headerfile <x86intrin.h>
    718 ///
    719 /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
    720 ///
    721 /// \param __V1
    722 ///    A 128-bit vector of [8 x u16].
    723 /// \param __V2
    724 ///    A 128-bit vector of [8 x u16].
    725 /// \returns A 128-bit vector of [8 x u16] containing the greater values.
    726 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
    727                                                            __m128i __V2) {
    728   return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
    729 }
    730 
    731 /// Compares the corresponding signed elements of two 128-bit vectors of
    732 ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
    733 ///    value of the two.
    734 ///
    735 /// \headerfile <x86intrin.h>
    736 ///
    737 /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
    738 ///
    739 /// \param __V1
    740 ///    A 128-bit vector of [4 x i32].
    741 /// \param __V2
    742 ///    A 128-bit vector of [4 x i32].
    743 /// \returns A 128-bit vector of [4 x i32] containing the lesser values.
    744 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
    745                                                            __m128i __V2) {
    746   return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
    747 }
    748 
    749 /// Compares the corresponding signed elements of two 128-bit vectors of
    750 ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
    751 ///    greater value of the two.
    752 ///
    753 /// \headerfile <x86intrin.h>
    754 ///
    755 /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
    756 ///
    757 /// \param __V1
    758 ///    A 128-bit vector of [4 x i32].
    759 /// \param __V2
    760 ///    A 128-bit vector of [4 x i32].
    761 /// \returns A 128-bit vector of [4 x i32] containing the greater values.
    762 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
    763                                                            __m128i __V2) {
    764   return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
    765 }
    766 
    767 /// Compares the corresponding unsigned elements of two 128-bit vectors of
    768 ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
    769 ///    value of the two.
    770 ///
    771 /// \headerfile <x86intrin.h>
    772 ///
    773 /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
    774 ///
    775 /// \param __V1
    776 ///    A 128-bit vector of [4 x u32].
    777 /// \param __V2
    778 ///    A 128-bit vector of [4 x u32].
    779 /// \returns A 128-bit vector of [4 x u32] containing the lesser values.
    780 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
    781                                                            __m128i __V2) {
    782   return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
    783 }
    784 
    785 /// Compares the corresponding unsigned elements of two 128-bit vectors of
    786 ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
    787 ///    greater value of the two.
    788 ///
    789 /// \headerfile <x86intrin.h>
    790 ///
    791 /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
    792 ///
    793 /// \param __V1
    794 ///    A 128-bit vector of [4 x u32].
    795 /// \param __V2
    796 ///    A 128-bit vector of [4 x u32].
    797 /// \returns A 128-bit vector of [4 x u32] containing the greater values.
    798 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
    799                                                            __m128i __V2) {
    800   return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
    801 }
    802 
    803 /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
    804 /// Takes the first argument \a X and inserts an element from the second
    805 ///    argument \a Y as selected by the third argument \a N. That result then
    806 ///    has elements zeroed out also as selected by the third argument \a N. The
    807 ///    resulting 128-bit vector of [4 x float] is then returned.
    808 ///
    809 /// \headerfile <x86intrin.h>
    810 ///
    811 /// \code
    812 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
    813 /// \endcode
    814 ///
    815 /// This intrinsic corresponds to the <c> VINSERTPS / INSERTPS </c> instruction.
    816 ///
    817 /// \param X
    818 ///    A 128-bit vector source operand of [4 x float]. With the exception of
    819 ///    those bits in the result copied from parameter \a Y and zeroed by bits
    820 ///    [3:0] of \a N, all bits from this parameter are copied to the result.
    821 /// \param Y
    822 ///    A 128-bit vector source operand of [4 x float]. One single-precision
    823 ///    floating-point element from this source, as determined by the immediate
    824 ///    parameter, is copied to the result.
    825 /// \param N
    826 ///    Specifies which bits from operand \a Y will be copied, which bits in the
    827 ///    result they will be copied to, and which bits in the result will be
    828 ///    cleared. The following assignments are made: \n
    829 ///    Bits [7:6] specify the bits to copy from operand \a Y: \n
    830 ///      00: Selects bits [31:0] from operand \a Y. \n
    831 ///      01: Selects bits [63:32] from operand \a Y. \n
    832 ///      10: Selects bits [95:64] from operand \a Y. \n
    833 ///      11: Selects bits [127:96] from operand \a Y. \n
    834 ///    Bits [5:4] specify the bits in the result to which the selected bits
    835 ///    from operand \a Y are copied: \n
    836 ///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
    837 ///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
    838 ///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
    839 ///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
    840 ///    Bits [3:0]: If any of these bits are set, the corresponding result
    841 ///    element is cleared.
    842 /// \returns A 128-bit vector of [4 x float] containing the copied
    843 ///    single-precision floating point elements from the operands.
    844 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
    845 
    846 /// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
    847 ///    returns it, using the immediate value parameter \a N as a selector.
    848 ///
    849 /// \headerfile <x86intrin.h>
    850 ///
    851 /// \code
    852 /// int _mm_extract_ps(__m128 X, const int N);
    853 /// \endcode
    854 ///
    855 /// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
    856 /// instruction.
    857 ///
    858 /// \param X
    859 ///    A 128-bit vector of [4 x float].
    860 /// \param N
    861 ///    An immediate value. Bits [1:0] determine which bits from the argument
    862 ///    \a X are extracted and returned: \n
    863 ///    00: Bits [31:0] of parameter \a X are returned. \n
    864 ///    01: Bits [63:32] of parameter \a X are returned. \n
    865 ///    10: Bits [95:64] of parameter \a X are returned. \n
    866 ///    11: Bits [127:96] of parameter \a X are returned.
    867 /// \returns A 32-bit integer containing the extracted 32 bits of float data.
    868 #define _mm_extract_ps(X, N)                                                   \
    869   __builtin_bit_cast(                                                          \
    870       int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
    871 
    872 /* Miscellaneous insert and extract macros.  */
    873 /* Extract the single-precision float at index N of X and assign it to D.  */
    874 #define _MM_EXTRACT_FLOAT(D, X, N)                                             \
    875   do {                                                                         \
    876     (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
    877   } while (0)
    878 
    879 /* Combine index X (placed in bits [7:6]), index Y (placed in bits [5:4]) and
    880    the zeroing bits Z into an immediate suitable for _mm_insert_ps.  */
    881 #define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
    882 
    883 /* Return a vector whose element 0 is element N of X; the rest are zero.  */
    884 #define _MM_PICK_OUT_PS(X, N)                                                  \
    885   _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
    886 
    887 /* Insert int into packed integer array at index.  */
    888 /// Constructs a 128-bit vector of [16 x i8] by first making a copy of
    889 ///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
    890 ///    of an integer parameter \a I into an offset specified by the immediate
    891 ///    value parameter \a N.
    892 ///
    893 /// \headerfile <x86intrin.h>
    894 ///
    895 /// \code
    896 /// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
    897 /// \endcode
    898 ///
    899 /// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
    900 ///
    901 /// \param X
    902 ///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
    903 ///    result and then one of the sixteen elements in the result vector is
    904 ///    replaced by the lower 8 bits of \a I.
    905 /// \param I
    906 ///    An integer. The lower 8 bits of this operand are written to the result
    907 ///    beginning at the offset specified by \a N.
    908 /// \param N
    909 ///    An immediate value. Bits [3:0] select the 8-bit element of the result
    910 ///    into which the lower 8 bits of \a I are written. \n
    911 ///    0000: Bits [7:0] of the result are used for insertion. \n
    912 ///    0001: Bits [15:8] of the result are used for insertion. \n
    913 ///    0010: Bits [23:16] of the result are used for insertion. \n
    914 ///    0011: Bits [31:24] of the result are used for insertion. \n
    915 ///    0100: Bits [39:32] of the result are used for insertion. \n
    916 ///    0101: Bits [47:40] of the result are used for insertion. \n
    917 ///    0110: Bits [55:48] of the result are used for insertion. \n
    918 ///    0111: Bits [63:56] of the result are used for insertion. \n
    919 ///    1000: Bits [71:64] of the result are used for insertion. \n
    920 ///    1001: Bits [79:72] of the result are used for insertion. \n
    921 ///    1010: Bits [87:80] of the result are used for insertion. \n
    922 ///    1011: Bits [95:88] of the result are used for insertion. \n
    923 ///    1100: Bits [103:96] of the result are used for insertion. \n
    924 ///    1101: Bits [111:104] of the result are used for insertion. \n
    925 ///    1110: Bits [119:112] of the result are used for insertion. \n
    926 ///    1111: Bits [127:120] of the result are used for insertion.
    927 /// \returns A 128-bit integer vector containing the constructed values.
    928 #define _mm_insert_epi8(X, I, N)                                               \
    929   ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
    930                                          (int)(N)))
    931 
    932 /// Constructs a 128-bit vector of [4 x i32] by first making a copy of
    933 ///    the 128-bit integer vector parameter, and then inserting the 32-bit
    934 ///    integer parameter \a I at the offset specified by the immediate value
    935 ///    parameter \a N.
    936 ///
    937 /// \headerfile <x86intrin.h>
    938 ///
    939 /// \code
    940 /// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
    941 /// \endcode
    942 ///
    943 /// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
    944 ///
    945 /// \param X
    946 ///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
    947 ///    result and then one of the four elements in the result vector is
    948 ///    replaced by \a I.
    949 /// \param I
    950 ///    A 32-bit integer that is written to the result beginning at the offset
    951 ///    specified by \a N.
    952 /// \param N
    953 ///    An immediate value. Bits [1:0] select the 32-bit element of the result
    954 ///    into which the integer \a I is written. \n
    955 ///    00: Bits [31:0] of the result are used for insertion. \n
    956 ///    01: Bits [63:32] of the result are used for insertion. \n
    957 ///    10: Bits [95:64] of the result are used for insertion. \n
    958 ///    11: Bits [127:96] of the result are used for insertion.
    959 /// \returns A 128-bit integer vector containing the constructed values.
    960 #define _mm_insert_epi32(X, I, N)                                              \
    961   ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
    962                                         (int)(N)))
    963 
    964 #ifdef __x86_64__
    965 /// Constructs a 128-bit vector of [2 x i64] by first making a copy of
    966 ///    the 128-bit integer vector parameter, and then inserting the 64-bit
    967 ///    integer parameter \a I, using the immediate value parameter \a N as an
    968 ///    insertion location selector.
    969 ///
    970 /// \headerfile <x86intrin.h>
    971 ///
    972 /// \code
    973 /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
    974 /// \endcode
    975 ///
    976 /// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
    977 ///
    978 /// \param X
    979 ///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
    980 ///    result and then one of the two elements in the result vector is replaced
    981 ///    by \a I.
    982 /// \param I
    983 ///    A 64-bit integer that is written to the result beginning at the offset
    984 ///    specified by \a N.
    985 /// \param N
    986 ///    An immediate value. Bit [0] selects the 64-bit element of the result
    987 ///    into which the integer \a I is written. \n
    988 ///    0: Bits [63:0] of the result are used for insertion. \n
    989 ///    1: Bits [127:64] of the result are used for insertion.
    990 /// \returns A 128-bit integer vector containing the constructed values.
    991 #define _mm_insert_epi64(X, I, N)                                              \
    992   ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
    993                                         (int)(N)))
    994 #endif /* __x86_64__ */
    995 
    996 /* Extract int from packed integer array at index.  This returns the element
    997  * as a zero extended value, so it is unsigned.
    998  */
    999 /// Extracts an 8-bit element from the 128-bit integer vector of
   1000 ///    [16 x i8], using the immediate value parameter \a N as a selector.
   1001 ///
   1002 /// \headerfile <x86intrin.h>
   1003 ///
   1004 /// \code
   1005 /// int _mm_extract_epi8(__m128i X, const int N);
   1006 /// \endcode
   1007 ///
   1008 /// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
   1009 ///
   1010 /// \param X
   1011 ///    A 128-bit integer vector.
   1012 /// \param N
   1013 ///    An immediate value. Bits [3:0] specify which 8-bit vector element from
   1014 ///    the argument \a X to extract and copy to the result. \n
   1015 ///    0000: Bits [7:0] of the parameter \a X are extracted. \n
   1016 ///    0001: Bits [15:8] of the parameter \a X are extracted. \n
   1017 ///    0010: Bits [23:16] of the parameter \a X are extracted. \n
   1018 ///    0011: Bits [31:24] of the parameter \a X are extracted. \n
   1019 ///    0100: Bits [39:32] of the parameter \a X are extracted. \n
   1020 ///    0101: Bits [47:40] of the parameter \a X are extracted. \n
   1021 ///    0110: Bits [55:48] of the parameter \a X are extracted. \n
   1022 ///    0111: Bits [63:56] of the parameter \a X are extracted. \n
   1023 ///    1000: Bits [71:64] of the parameter \a X are extracted. \n
   1024 ///    1001: Bits [79:72] of the parameter \a X are extracted. \n
   1025 ///    1010: Bits [87:80] of the parameter \a X are extracted. \n
   1026 ///    1011: Bits [95:88] of the parameter \a X are extracted. \n
   1027 ///    1100: Bits [103:96] of the parameter \a X are extracted. \n
   1028 ///    1101: Bits [111:104] of the parameter \a X are extracted. \n
   1029 ///    1110: Bits [119:112] of the parameter \a X are extracted. \n
   1030 ///    1111: Bits [127:120] of the parameter \a X are extracted.
   1031 /// \returns An unsigned integer, whose lower 8 bits are selected from the
   1032 ///    128-bit integer vector parameter and the remaining bits are assigned
   1033 ///    zeros.
   1034 #define _mm_extract_epi8(X, N)                                                 \
   1035   ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
   1036                                                     (int)(N)))
   1037 
   1038 /// Extracts a 32-bit element from the 128-bit integer vector of
   1039 ///    [4 x i32], using the immediate value parameter \a N as a selector.
   1040 ///
   1041 /// \headerfile <x86intrin.h>
   1042 ///
   1043 /// \code
   1044 /// int _mm_extract_epi32(__m128i X, const int N);
   1045 /// \endcode
   1046 ///
   1047 /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
   1048 ///
   1049 /// \param X
   1050 ///    A 128-bit integer vector.
   1051 /// \param N
   1052 ///    An immediate value. Bits [1:0] specify which 32-bit vector element from
   1053 ///    the argument \a X to extract and copy to the result. \n
   1054 ///    00: Bits [31:0] of the parameter \a X are extracted. \n
   1055 ///    01: Bits [63:32] of the parameter \a X are extracted. \n
   1056 ///    10: Bits [95:64] of the parameter \a X are extracted. \n
   1057 ///    11: Bits [127:96] of the parameter \a X are extracted.
   1058 /// \returns An integer holding the 32-bit element selected from the 128-bit
   1059 ///    integer vector parameter.
   1060 #define _mm_extract_epi32(X, N)                                                \
   1061   ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
   1062 
   1063 /// Extracts a 64-bit element from the 128-bit integer vector of
   1064 ///    [2 x i64], using the immediate value parameter \a N as a selector.
   1065 ///
   1066 /// \headerfile <x86intrin.h>
   1067 ///
   1068 /// \code
   1069 /// long long _mm_extract_epi64(__m128i X, const int N);
   1070 /// \endcode
   1071 ///
   1072 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
   1073 /// in 64-bit mode.
   1074 ///
   1075 /// \param X
   1076 ///    A 128-bit integer vector.
   1077 /// \param N
   1078 ///    An immediate value. Bit [0] specifies which 64-bit vector element from
   1079 ///    the argument \a X to return. \n
   1080 ///    0: Bits [63:0] are returned. \n
   1081 ///    1: Bits [127:64] are returned.
   1082 /// \returns A 64-bit integer.
   1083 #define _mm_extract_epi64(X, N)                                                \
   1084   ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
   1085 
   1086 /* SSE4 128-bit Packed Integer Comparisons.  */
   1087 /// Tests whether the bits of the 128-bit integer vector \a __M selected by
   1088 ///    \a __V are all zeros.
   1089 ///
   1090 /// \headerfile <x86intrin.h>
   1091 ///
   1092 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
   1093 ///
   1094 /// \param __M
   1095 ///    A 128-bit integer vector containing the bits to be tested.
   1096 /// \param __V
   1097 ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
   1098 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
   1099 static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
   1100                                                          __m128i __V) {
   1101   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
   1102 }
   1103 
   1104 /// Tests whether the bits of the 128-bit integer vector \a __M selected by
   1105 ///    \a __V are all ones.
   1106 ///
   1107 /// \headerfile <x86intrin.h>
   1108 ///
   1109 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
   1110 ///
   1111 /// \param __M
   1112 ///    A 128-bit integer vector containing the bits to be tested.
   1113 /// \param __V
   1114 ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
   1115 /// \returns TRUE if the specified bits are all ones; FALSE otherwise.
   1116 static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
   1117                                                          __m128i __V) {
   1118   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
   1119 }
   1120 
   1121 /// Tests whether the bits of the 128-bit integer vector \a __M selected by
   1122 ///    \a __V are neither all zeros nor all ones.
   1123 ///
   1124 /// \headerfile <x86intrin.h>
   1125 ///
   1126 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
   1127 ///
   1128 /// \param __M
   1129 ///    A 128-bit integer vector containing the bits to be tested.
   1130 /// \param __V
   1131 ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
   1132 /// \returns TRUE if the specified bits are neither all zeros nor all ones;
   1133 ///    FALSE otherwise.
   1134 static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
   1135                                                            __m128i __V) {
   1136   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
   1137 }
   1138 
   1139 /// Tests whether all of the bits in a 128-bit integer vector are set to
   1140 ///    ones.
   1141 ///
   1142 /// \headerfile <x86intrin.h>
   1143 ///
   1144 /// \code
   1145 /// int _mm_test_all_ones(__m128i V);
   1146 /// \endcode
   1147 ///
   1148 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
   1149 ///
   1150 /// \param V
   1151 ///    A 128-bit integer vector containing the bits to be tested.
   1152 /// \returns TRUE if all of the bits in the operand are set to 1; FALSE
   1153 ///    otherwise.
   1154 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
   1155 
   1156 /// Tests whether the bits of the 128-bit integer vector \a M selected by
   1157 ///    \a V are neither all zeros nor all ones.
   1158 ///
   1159 /// \headerfile <x86intrin.h>
   1160 ///
   1161 /// \code
   1162 /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
   1163 /// \endcode
   1164 ///
   1165 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
   1166 ///
   1167 /// \param M
   1168 ///    A 128-bit integer vector containing the bits to be tested.
   1169 /// \param V
   1170 ///    A 128-bit integer vector selecting which bits to test in operand \a M.
   1171 /// \returns TRUE if the specified bits are neither all zeros nor all ones;
   1172 ///    FALSE otherwise.
   1173 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
   1174 
   1175 /// Tests whether the bits of the 128-bit integer vector \a M selected by
   1176 ///    \a V are all zeros.
   1177 ///
   1178 /// \headerfile <x86intrin.h>
   1179 ///
   1180 /// \code
   1181 /// int _mm_test_all_zeros(__m128i M, __m128i V);
   1182 /// \endcode
   1183 ///
   1184 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
   1185 ///
   1186 /// \param M
   1187 ///    A 128-bit integer vector containing the bits to be tested.
   1188 /// \param V
   1189 ///    A 128-bit integer vector selecting which bits to test in operand \a M.
   1190 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
   1191 #define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
   1192 
   1193 /* SSE4 64-bit Packed Integer Comparisons.  */
   1194 /// Compares each of the corresponding 64-bit values of the 128-bit
   1195 ///    integer vectors for equality.
   1196 ///
   1197 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
   1198 ///
   1199 /// \headerfile <x86intrin.h>
   1200 ///
   1201 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
   1202 ///
   1203 /// \param __V1
   1204 ///    A 128-bit integer vector, treated as [2 x i64].
   1205 /// \param __V2
   1206 ///    A 128-bit integer vector, treated as [2 x i64].
   1207 /// \returns A 128-bit vector of [2 x i64] containing the comparison results.
   1208 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
   1209                                                              __m128i __V2) {
   1210   return (__m128i)((__v2di)__V1 == (__v2di)__V2);
   1211 }
   1212 
   1213 /* SSE4 Packed Integer Sign-Extension.  */
   1214 /// Sign-extends each of the lower eight 8-bit integer elements of a
   1215 ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
   1216 ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
   1217 ///    are unused.
   1218 ///
   1219 /// \headerfile <x86intrin.h>
   1220 ///
   1221 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
   1222 ///
   1223 /// \param __V
   1224 ///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
   1225 ///    sign-extended to 16-bit values.
   1226 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
   1227 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
   1228   /* The extension must always be signed, but __v16qi is a vector of char,
   1229      which may be signed or unsigned, so __v16qs is used instead. */
   1230   return (__m128i) __builtin_convertvector(
   1231       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
   1232                               7),
   1233       __v8hi);
   1234 }
   1235 
   1236 /// Sign-extends each of the lower four 8-bit integer elements of a
   1237 ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
   1238 ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
   1239 ///    vector are unused.
   1240 ///
   1241 /// \headerfile <x86intrin.h>
   1242 ///
   1243 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
   1244 ///
   1245 /// \param __V
   1246 ///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
   1247 ///    sign-extended to 32-bit values.
   1248 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
   1249 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
   1250   /* The extension must always be signed, but __v16qi is a vector of char,
   1251      which may be signed or unsigned, so __v16qs is used instead. */
   1252   return (__m128i) __builtin_convertvector(
   1253       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
   1254 }
   1255 
   1256 /// Sign-extends each of the lower two 8-bit integer elements of a
   1257 ///    128-bit vector of [16 x i8] to 64-bit values and returns them in
   1258 ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
   1259 ///    vector are unused.
   1260 ///
   1261 /// \headerfile <x86intrin.h>
   1262 ///
   1263 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
   1264 ///
   1265 /// \param __V
   1266 ///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
   1267 ///    sign-extended to 64-bit values.
   1268 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
   1269 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
   1270   /* The extension must always be signed, but __v16qi is a vector of char,
   1271      which may be signed or unsigned, so __v16qs is used instead. */
   1272   return (__m128i) __builtin_convertvector(
   1273       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
   1274 }
   1275 
   1276 /// Widens the four lowest 16-bit elements of a 128-bit vector of [8 x i16]
   1277 ///    to 32 bits with sign extension and returns the results as a 128-bit
   1278 ///    vector of [4 x i32]. The four upper input elements are ignored.
   1279 ///
   1280 /// \headerfile <x86intrin.h>
   1281 ///
   1282 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
   1283 ///
   1284 /// \param __V
   1285 ///    A 128-bit vector of [8 x i16]. Only elements 0 through 3 are read; each
   1286 ///    is sign-extended from 16 to 32 bits.
   1287 /// \returns A 128-bit vector of [4 x i32] holding the sign-extended elements.
   1288 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
   1289   __v8hi __words = (__v8hi)__V;
   1290   return (__m128i) __builtin_convertvector(
   1291       __builtin_shufflevector(__words, __words, 0, 1, 2, 3), __v4si);
   1292 }
   1293 
   1294 /// Widens the two lowest 16-bit elements of a 128-bit vector of [8 x i16]
   1295 ///    to 64 bits with sign extension and returns the results as a 128-bit
   1296 ///    vector of [2 x i64]. The six upper input elements are ignored.
   1297 ///
   1298 /// \headerfile <x86intrin.h>
   1299 ///
   1300 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
   1301 ///
   1302 /// \param __V
   1303 ///    A 128-bit vector of [8 x i16]. Only elements 0 and 1 are read; each is
   1304 ///    sign-extended from 16 to 64 bits.
   1305 /// \returns A 128-bit vector of [2 x i64] holding the sign-extended elements.
   1306 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
   1307   __v8hi __words = (__v8hi)__V;
   1308   return (__m128i) __builtin_convertvector(
   1309       __builtin_shufflevector(__words, __words, 0, 1), __v2di);
   1310 }
   1311 
   1312 /// Widens the two lowest 32-bit elements of a 128-bit vector of [4 x i32]
   1313 ///    to 64 bits with sign extension and returns the results as a 128-bit
   1314 ///    vector of [2 x i64]. The two upper input elements are ignored.
   1315 ///
   1316 /// \headerfile <x86intrin.h>
   1317 ///
   1318 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
   1319 ///
   1320 /// \param __V
   1321 ///    A 128-bit vector of [4 x i32]. Only elements 0 and 1 are read; each is
   1322 ///    sign-extended from 32 to 64 bits.
   1323 /// \returns A 128-bit vector of [2 x i64] holding the sign-extended elements.
   1324 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
   1325   __v4si __dwords = (__v4si)__V;
   1326   return (__m128i) __builtin_convertvector(
   1327       __builtin_shufflevector(__dwords, __dwords, 0, 1), __v2di);
   1328 }
   1329 
   1330 /* SSE4 Packed Integer Zero-Extension.  */
   1331 /// Widens the eight lowest 8-bit elements of a 128-bit vector of [16 x i8]
   1332 ///    to 16 bits with zero extension and returns the results as a 128-bit
   1333 ///    vector of [8 x i16]. The eight upper input elements do not contribute
   1334 ///    to the result.
   1335 ///
   1336 /// \headerfile <x86intrin.h>
   1337 ///
   1338 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
   1339 ///
   1340 /// \param __V
   1341 ///    A 128-bit vector of [16 x i8]. Only elements 0 through 7 are read; each
   1342 ///    is zero-extended from 8 to 16 bits.
   1343 /// \returns A 128-bit vector of [8 x i16] holding the zero-extended elements.
   1344 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
   1345   __v16qu __bytes = (__v16qu)__V;
   1346   return (__m128i) __builtin_convertvector(
   1347       __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
   1348       __v8hi);
   1349 }
   1350 
   1351 /// Widens the four lowest 8-bit elements of a 128-bit vector of [16 x i8]
   1352 ///    to 32 bits with zero extension and returns the results as a 128-bit
   1353 ///    vector of [4 x i32]. The twelve upper input elements are ignored.
   1354 ///
   1355 /// \headerfile <x86intrin.h>
   1356 ///
   1357 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
   1358 ///
   1359 /// \param __V
   1360 ///    A 128-bit vector of [16 x i8]. Only elements 0 through 3 are read; each
   1361 ///    is zero-extended from 8 to 32 bits.
   1362 /// \returns A 128-bit vector of [4 x i32] holding the zero-extended elements.
   1363 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
   1364   __v16qu __bytes = (__v16qu)__V;
   1365   return (__m128i) __builtin_convertvector(
   1366       __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3), __v4si);
   1367 }
   1368 
   1369 /// Widens the two lowest 8-bit elements of a 128-bit vector of [16 x i8]
   1370 ///    to 64 bits with zero extension and returns the results as a 128-bit
   1371 ///    vector of [2 x i64]. The fourteen upper input elements are ignored.
   1372 ///
   1373 /// \headerfile <x86intrin.h>
   1374 ///
   1375 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
   1376 ///
   1377 /// \param __V
   1378 ///    A 128-bit vector of [16 x i8]. Only elements 0 and 1 are read; each is
   1379 ///    zero-extended from 8 to 64 bits.
   1380 /// \returns A 128-bit vector of [2 x i64] holding the zero-extended elements.
   1381 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
   1382   __v16qu __bytes = (__v16qu)__V;
   1383   return (__m128i) __builtin_convertvector(
   1384       __builtin_shufflevector(__bytes, __bytes, 0, 1), __v2di);
   1385 }
   1386 
   1387 /// Widens the four lowest 16-bit elements of a 128-bit vector of [8 x i16]
   1388 ///    to 32 bits with zero extension and returns the results as a 128-bit
   1389 ///    vector of [4 x i32]. The four upper input elements are ignored.
   1390 ///
   1391 /// \headerfile <x86intrin.h>
   1392 ///
   1393 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
   1394 ///
   1395 /// \param __V
   1396 ///    A 128-bit vector of [8 x i16]. Only elements 0 through 3 are read; each
   1397 ///    is zero-extended from 16 to 32 bits.
   1398 /// \returns A 128-bit vector of [4 x i32] holding the zero-extended elements.
   1399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
   1400   __v8hu __words = (__v8hu)__V;
   1401   return (__m128i) __builtin_convertvector(
   1402       __builtin_shufflevector(__words, __words, 0, 1, 2, 3), __v4si);
   1403 }
   1404 
   1405 /// Widens the two lowest 16-bit elements of a 128-bit vector of [8 x i16]
   1406 ///    to 64 bits with zero extension and returns the results as a 128-bit
   1407 ///    vector of [2 x i64]. The six upper input elements are ignored.
   1408 ///
   1409 /// \headerfile <x86intrin.h>
   1410 ///
   1411 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
   1412 ///
   1413 /// \param __V
   1414 ///    A 128-bit vector of [8 x i16]. Only elements 0 and 1 are read; each is
   1415 ///    zero-extended from 16 to 64 bits.
   1416 /// \returns A 128-bit vector of [2 x i64] holding the zero-extended elements.
   1417 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
   1418   __v8hu __words = (__v8hu)__V;
   1419   return (__m128i) __builtin_convertvector(
   1420       __builtin_shufflevector(__words, __words, 0, 1), __v2di);
   1421 }
   1422 
   1423 /// Widens the two lowest 32-bit elements of a 128-bit vector of [4 x i32]
   1424 ///    to 64 bits with zero extension and returns the results as a 128-bit
   1425 ///    vector of [2 x i64]. The two upper input elements are ignored.
   1426 ///
   1427 /// \headerfile <x86intrin.h>
   1428 ///
   1429 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
   1430 ///
   1431 /// \param __V
   1432 ///    A 128-bit vector of [4 x i32]. Only elements 0 and 1 are read; each is
   1433 ///    zero-extended from 32 to 64 bits.
   1434 /// \returns A 128-bit vector of [2 x i64] holding the zero-extended elements.
   1435 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
   1436   __v4su __dwords = (__v4su)__V;
   1437   return (__m128i) __builtin_convertvector(
   1438       __builtin_shufflevector(__dwords, __dwords, 0, 1), __v2di);
   1439 }
   1440 
   1441 /* SSE4 Pack with Unsigned Saturation.  */
   1442 /// Packs the 32-bit signed integers from two 128-bit integer vectors into
   1443 ///    16-bit unsigned integers, with saturation, and returns the result.
   1444 ///
   1445 ///    Values above 0xFFFF saturate to 0xFFFF; values below 0x0000 to 0x0000.
   1446 ///
   1447 /// \headerfile <x86intrin.h>
   1448 ///
   1449 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
   1450 ///
   1451 /// \param __V1
   1452 ///    A 128-bit vector of [4 x i32] whose converted [4 x i16] values occupy
   1453 ///    the lower 64 bits of the result.
   1454 /// \param __V2
   1455 ///    A 128-bit vector of [4 x i32] whose converted [4 x i16] values occupy
   1456 ///    the higher 64 bits of the result.
   1457 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
   1458 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
   1459                                                               __m128i __V2) {
   1460   __v4si __lo = (__v4si)__V1;
   1461   __v4si __hi = (__v4si)__V2;
   1462   return (__m128i)__builtin_ia32_packusdw128(__lo, __hi);
   1463 }
   1464 
   1465 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
   1466 /// Computes eight sums of absolute differences of 8-bit unsigned integers.
   1467 ///    Each sum is taken over four consecutive elements of \a X against a
   1468 ///    fixed group of four elements of \a Y; the starting offsets into both
   1469 ///    operands are selected by bit fields in the immediate operand \a M.
   1470 ///
   1471 /// \headerfile <x86intrin.h>
   1472 ///
   1473 /// \code
   1474 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
   1475 /// \endcode
   1476 ///
   1477 /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
   1478 ///
   1479 /// \param X
   1480 ///    A 128-bit vector of [16 x i8].
   1481 /// \param Y
   1482 ///    A 128-bit vector of [16 x i8].
   1483 /// \param M
   1484 ///    An 8-bit immediate operand specifying how the absolute differences are to
   1485 ///    be calculated, according to the following algorithm:
   1486 ///    \code
   1487 ///    // M2 represents bit 2 of the immediate operand
   1488 ///    // M10 represents bits [1:0] of the immediate operand
   1489 ///    i = M2 * 4;
   1490 ///    j = M10 * 4;
   1491 ///    for (k = 0; k < 8; k = k + 1) {
   1492 ///      d0 = abs(X[i + k + 0] - Y[j + 0]);
   1493 ///      d1 = abs(X[i + k + 1] - Y[j + 1]);
   1494 ///      d2 = abs(X[i + k + 2] - Y[j + 2]);
   1495 ///      d3 = abs(X[i + k + 3] - Y[j + 3]);
   1496 ///      r[k] = d0 + d1 + d2 + d3;
   1497 ///    }
   1498 ///    \endcode
   1499 /// \returns A 128-bit integer vector containing the sums of the sets of
   1500 ///    absolute differences between both operands.
   1501 #define _mm_mpsadbw_epu8(X, Y, M)                                              \
   1502   ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X),                   \
   1503                                       (__v16qi)(__m128i)(Y), (M)))
   1504 
   1505 /// Locates the smallest unsigned 16-bit element of the 128-bit input vector
   1506 ///    of [8 x u16] and returns that value together with its index.
   1507 ///
   1508 /// \headerfile <x86intrin.h>
   1509 ///
   1510 /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
   1511 /// instruction.
   1512 ///
   1513 /// \param __V
   1514 ///    A 128-bit vector of [8 x u16].
   1515 /// \returns A 128-bit value whose bits [15:0] hold the minimum element of
   1516 ///    \a __V and whose bits [18:16] hold its index; all other bits are 0.
   1517 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
   1518   __v8hi __words = (__v8hi)__V;
   1519   return (__m128i)__builtin_ia32_phminposuw128(__words);
   1520 }
   1521 
   1522 /* Handle the sse4.2 definitions here. */
   1523 
   1524 /* These definitions are normally in nmmintrin.h, but gcc puts them in here
   1525    so we'll do the same.  */
   1526 
   1527 #undef __DEFAULT_FN_ATTRS /* discard the sse4.1 attribute set used above */
   1528 #define __DEFAULT_FN_ATTRS                                                     \
   1529   __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
   1530 
   1531 /* These specify the type of data being compared (immediate bits [1:0]).  */
   1532 #define _SIDD_UBYTE_OPS 0x00
   1533 #define _SIDD_UWORD_OPS 0x01
   1534 #define _SIDD_SBYTE_OPS 0x02
   1535 #define _SIDD_SWORD_OPS 0x03
   1536 
   1537 /* These specify the type of comparison operation (immediate bits [3:2]).  */
   1538 #define _SIDD_CMP_EQUAL_ANY 0x00
   1539 #define _SIDD_CMP_RANGES 0x04
   1540 #define _SIDD_CMP_EQUAL_EACH 0x08
   1541 #define _SIDD_CMP_EQUAL_ORDERED 0x0c
   1542 
   1543 /* These specify the polarity of the operation (immediate bits [5:4]).  */
   1544 #define _SIDD_POSITIVE_POLARITY 0x00
   1545 #define _SIDD_NEGATIVE_POLARITY 0x10
   1546 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
   1547 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
   1548 
   1549 /* Used in _mm_cmpXstri() to select which index is returned (bit [6]).  */
   1550 #define _SIDD_LEAST_SIGNIFICANT 0x00
   1551 #define _SIDD_MOST_SIGNIFICANT 0x40
   1552 
   1553 /* Used in _mm_cmpXstrm() to select the returned mask format (bit [6]).  */
   1554 #define _SIDD_BIT_MASK 0x00
   1555 #define _SIDD_UNIT_MASK 0x40
   1556 
   1557 /* SSE4.2 Packed Comparison Intrinsics.  */
   1558 /// Uses the immediate operand \a M to perform a comparison of string
   1559 ///    data with implicitly defined lengths that is contained in source operands
   1560 ///    \a A and \a B. Returns a 128-bit integer vector representing the result
   1561 ///    mask of the comparison.
   1562 ///
   1563 /// \headerfile <x86intrin.h>
   1564 ///
   1565 /// \code
   1566 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
   1567 /// \endcode
   1568 ///
   1569 /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
   1570 /// instruction.
   1571 ///
   1572 /// \param A
   1573 ///    A 128-bit integer vector containing one of the source operands to be
   1574 ///    compared.
   1575 /// \param B
   1576 ///    A 128-bit integer vector containing one of the source operands to be
   1577 ///    compared.
   1578 /// \param M
   1579 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1580 ///    words, the type of comparison to perform, and the format of the return
   1581 ///    value. \n
   1582 ///    Bits [1:0]: Determine source data format. \n
   1583 ///      00: 16 unsigned bytes \n
   1584 ///      01: 8 unsigned words \n
   1585 ///      10: 16 signed bytes \n
   1586 ///      11: 8 signed words \n
   1587 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1588 ///      00: Subset: Each character in \a B is compared for equality with all
   1589 ///          the characters in \a A. \n
   1590 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1591 ///          basis is greater than or equal for even-indexed elements in \a A,
   1592 ///          and less than or equal for odd-indexed elements in \a A. \n
   1593 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1594 ///          \a B for equality. \n
   1595 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1596 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1597 ///                mask of the comparison results. \n
   1598 ///      00: No effect. \n
   1599 ///      01: Negate the bit mask. \n
   1600 ///      10: No effect. \n
   1601 ///      11: Negate the bit mask only for bits with an index less than or equal
   1602 ///          to the size of \a A or \a B. \n
   1603 ///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
   1604 ///             bytes. \n
   1605 ///      0: The result is zero-extended to 16 bytes. \n
   1606 ///      1: The result is expanded to 16 bytes (this expansion is performed by
   1607 ///         repeating each bit 8 or 16 times).
   1608 /// \returns A 128-bit integer vector representing the result mask of
   1609 ///    the comparison.
   1610 #define _mm_cmpistrm(A, B, M)                                                  \
   1611   ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A),                 \
   1612                                         (__v16qi)(__m128i)(B), (int)(M)))
   1613 
   1614 /// Uses the immediate operand \a M to perform a comparison of string
   1615 ///    data with implicitly defined lengths that is contained in source operands
   1616 ///    \a A and \a B. Returns an integer representing the result index of the
   1617 ///    comparison.
   1618 ///
   1619 /// \headerfile <x86intrin.h>
   1620 ///
   1621 /// \code
   1622 /// int _mm_cmpistri(__m128i A, __m128i B, const int M);
   1623 /// \endcode
   1624 ///
   1625 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
   1626 /// instruction.
   1627 ///
   1628 /// \param A
   1629 ///    A 128-bit integer vector containing one of the source operands to be
   1630 ///    compared.
   1631 /// \param B
   1632 ///    A 128-bit integer vector containing one of the source operands to be
   1633 ///    compared.
   1634 /// \param M
   1635 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1636 ///    words, the type of comparison to perform, and the format of the return
   1637 ///    value. \n
   1638 ///    Bits [1:0]: Determine source data format. \n
   1639 ///      00: 16 unsigned bytes \n
   1640 ///      01: 8 unsigned words \n
   1641 ///      10: 16 signed bytes \n
   1642 ///      11: 8 signed words \n
   1643 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1644 ///      00: Subset: Each character in \a B is compared for equality with all
   1645 ///          the characters in \a A. \n
   1646 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1647 ///          basis is greater than or equal for even-indexed elements in \a A,
   1648 ///          and less than or equal for odd-indexed elements in \a A. \n
   1649 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1650 ///          \a B for equality. \n
   1651 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1652 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1653 ///                mask of the comparison results. \n
   1654 ///      00: No effect. \n
   1655 ///      01: Negate the bit mask. \n
   1656 ///      10: No effect. \n
   1657 ///      11: Negate the bit mask only for bits with an index less than or equal
   1658 ///          to the size of \a A or \a B. \n
   1659 ///    Bit [6]: Determines whether the index of the lowest set bit or the
   1660 ///             highest set bit is returned. \n
   1661 ///      0: The index of the least significant set bit. \n
   1662 ///      1: The index of the most significant set bit.
   1663 /// \returns An integer representing the result index of the comparison.
   1664 #define _mm_cmpistri(A, B, M)                                                  \
   1665   ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A),                     \
   1666                                     (__v16qi)(__m128i)(B), (int)(M)))
   1667 
   1668 /// Uses the immediate operand \a M to perform a comparison of string
   1669 ///    data with explicitly defined lengths that is contained in source operands
   1670 ///    \a A and \a B. Returns a 128-bit integer vector representing the result
   1671 ///    mask of the comparison.
   1672 ///
   1673 /// \headerfile <x86intrin.h>
   1674 ///
   1675 /// \code
   1676 /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
   1677 /// \endcode
   1678 ///
   1679 /// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
   1680 /// instruction.
   1681 ///
   1682 /// \param A
   1683 ///    A 128-bit integer vector containing one of the source operands to be
   1684 ///    compared.
   1685 /// \param LA
   1686 ///    An integer that specifies the length of the string in \a A.
   1687 /// \param B
   1688 ///    A 128-bit integer vector containing one of the source operands to be
   1689 ///    compared.
   1690 /// \param LB
   1691 ///    An integer that specifies the length of the string in \a B.
   1692 /// \param M
   1693 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1694 ///    words, the type of comparison to perform, and the format of the return
   1695 ///    value. \n
   1696 ///    Bits [1:0]: Determine source data format. \n
   1697 ///      00: 16 unsigned bytes \n
   1698 ///      01: 8 unsigned words \n
   1699 ///      10: 16 signed bytes \n
   1700 ///      11: 8 signed words \n
   1701 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1702 ///      00: Subset: Each character in \a B is compared for equality with all
   1703 ///          the characters in \a A. \n
   1704 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1705 ///          basis is greater than or equal for even-indexed elements in \a A,
   1706 ///          and less than or equal for odd-indexed elements in \a A. \n
   1707 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1708 ///          \a B for equality. \n
   1709 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1710 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1711 ///                mask of the comparison results. \n
   1712 ///      00: No effect. \n
   1713 ///      01: Negate the bit mask. \n
   1714 ///      10: No effect. \n
   1715 ///      11: Negate the bit mask only for bits with an index less than or equal
   1716 ///          to the size of \a A or \a B. \n
   1717 ///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
   1718 ///             bytes. \n
   1719 ///      0: The result is zero-extended to 16 bytes. \n
   1720 ///      1: The result is expanded to 16 bytes (this expansion is performed by
   1721 ///         repeating each bit 8 or 16 times).
   1722 /// \returns A 128-bit integer vector representing the result mask of
   1723 ///    the comparison.
   1724 #define _mm_cmpestrm(A, LA, B, LB, M)                                          \
   1725   ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),      \
   1726                                         (__v16qi)(__m128i)(B), (int)(LB),      \
   1727                                         (int)(M)))
   1728 
   1729 /// Uses the immediate operand \a M to perform a comparison of string
   1730 ///    data with explicitly defined lengths that is contained in source operands
   1731 ///    \a A and \a B. Returns an integer representing the result index of the
   1732 ///    comparison.
   1733 ///
   1734 /// \headerfile <x86intrin.h>
   1735 ///
   1736 /// \code
   1737 /// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
   1738 /// \endcode
   1739 ///
   1740 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
   1741 /// instruction.
   1742 ///
   1743 /// \param A
   1744 ///    A 128-bit integer vector containing one of the source operands to be
   1745 ///    compared.
   1746 /// \param LA
   1747 ///    An integer that specifies the length of the string in \a A.
   1748 /// \param B
   1749 ///    A 128-bit integer vector containing one of the source operands to be
   1750 ///    compared.
   1751 /// \param LB
   1752 ///    An integer that specifies the length of the string in \a B.
   1753 /// \param M
   1754 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1755 ///    words, the type of comparison to perform, and the format of the return
   1756 ///    value. \n
   1757 ///    Bits [1:0]: Determine source data format. \n
   1758 ///      00: 16 unsigned bytes \n
   1759 ///      01: 8 unsigned words \n
   1760 ///      10: 16 signed bytes \n
   1761 ///      11: 8 signed words \n
   1762 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1763 ///      00: Subset: Each character in \a B is compared for equality with all
   1764 ///          the characters in \a A. \n
   1765 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1766 ///          basis is greater than or equal for even-indexed elements in \a A,
   1767 ///          and less than or equal for odd-indexed elements in \a A. \n
   1768 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1769 ///          \a B for equality. \n
   1770 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1771 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1772 ///                mask of the comparison results. \n
   1773 ///      00: No effect. \n
   1774 ///      01: Negate the bit mask. \n
   1775 ///      10: No effect. \n
   1776 ///      11: Negate the bit mask only for bits with an index less than or equal
   1777 ///          to the size of \a A or \a B. \n
   1778 ///    Bit [6]: Determines whether the index of the lowest set bit or the
   1779 ///             highest set bit is returned. \n
   1780 ///      0: The index of the least significant set bit. \n
   1781 ///      1: The index of the most significant set bit.
   1782 /// \returns An integer representing the result index of the comparison.
   1783 #define _mm_cmpestri(A, LA, B, LB, M)                                          \
   1784   ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA),          \
   1785                                     (__v16qi)(__m128i)(B), (int)(LB),          \
   1786                                     (int)(M)))
   1787 
   1788 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
   1789 /// Uses the immediate operand \a M to perform a comparison of string
   1790 ///    data with implicitly defined lengths that is contained in source operands
   1791 ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
   1792 ///    string in \a B is the maximum, otherwise, returns 0.
   1793 ///
   1794 /// \headerfile <x86intrin.h>
   1795 ///
   1796 /// \code
   1797 /// int _mm_cmpistra(__m128i A, __m128i B, const int M);
   1798 /// \endcode
   1799 ///
   1800 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
   1801 /// instruction.
   1802 ///
   1803 /// \param A
   1804 ///    A 128-bit integer vector containing one of the source operands to be
   1805 ///    compared.
   1806 /// \param B
   1807 ///    A 128-bit integer vector containing one of the source operands to be
   1808 ///    compared.
   1809 /// \param M
   1810 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1811 ///    words and the type of comparison to perform. \n
   1812 ///    Bits [1:0]: Determine source data format. \n
   1813 ///      00: 16 unsigned bytes \n
   1814 ///      01: 8 unsigned words \n
   1815 ///      10: 16 signed bytes \n
   1816 ///      11: 8 signed words \n
   1817 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1818 ///      00: Subset: Each character in \a B is compared for equality with all
   1819 ///          the characters in \a A. \n
   1820 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1821 ///          basis is greater than or equal for even-indexed elements in \a A,
   1822 ///          and less than or equal for odd-indexed elements in \a A. \n
   1823 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1824 ///          \a B for equality. \n
   1825 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1826 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1827 ///                mask of the comparison results. \n
   1828 ///      00: No effect. \n
   1829 ///      01: Negate the bit mask. \n
   1830 ///      10: No effect. \n
   1831 ///      11: Negate the bit mask only for bits with an index less than or equal
   1832 ///          to the size of \a A or \a B.
   1833 /// \returns 1 if the bit mask is zero and the length of the string in
   1834 ///    \a B is the maximum; otherwise, returns 0.
   1835 #define _mm_cmpistra(A, B, M)                                                  \
   1836   ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                    \
   1837                                      (__v16qi)(__m128i)(B), (int)(M)))
   1838 
   1839 /// Uses the immediate operand \a M to perform a comparison of string
   1840 ///    data with implicitly defined lengths that is contained in source operands
   1841 ///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
   1842 ///    0.
   1843 ///
   1844 /// \headerfile <x86intrin.h>
   1845 ///
   1846 /// \code
   1847 /// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
   1848 /// \endcode
   1849 ///
   1850 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
   1851 /// instruction.
   1852 ///
   1853 /// \param A
   1854 ///    A 128-bit integer vector containing one of the source operands to be
   1855 ///    compared.
   1856 /// \param B
   1857 ///    A 128-bit integer vector containing one of the source operands to be
   1858 ///    compared.
   1859 /// \param M
   1860 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1861 ///    words and the type of comparison to perform. \n
   1862 ///    Bits [1:0]: Determine source data format. \n
   1863 ///      00: 16 unsigned bytes \n
   1864 ///      01: 8 unsigned words \n
   1865 ///      10: 16 signed bytes \n
   1866 ///      11: 8 signed words \n
   1867 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1868 ///      00: Subset: Each character in \a B is compared for equality with all
   1869 ///          the characters in \a A. \n
   1870 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1871 ///          basis is greater than or equal for even-indexed elements in \a A,
   1872 ///          and less than or equal for odd-indexed elements in \a A. \n
   1873 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1874 ///          \a B for equality. \n
   1875 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1876 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1877 ///                mask of the comparison results. \n
   1878 ///      00: No effect. \n
   1879 ///      01: Negate the bit mask. \n
   1880 ///      10: No effect. \n
   1881 ///      11: Negate the bit mask only for bits with an index less than or equal
   1882 ///          to the size of \a A or \a B.
   1883 /// \returns 1 if the bit mask is non-zero; otherwise, returns 0.
   1884 #define _mm_cmpistrc(A, B, M)                                                  \
   1885   ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A),                    \
   1886                                      (__v16qi)(__m128i)(B), (int)(M)))
   1887 
   1888 /// Uses the immediate operand \a M to perform a comparison of string
   1889 ///    data with implicitly defined lengths that is contained in source operands
   1890 ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
   1891 ///
   1892 /// \headerfile <x86intrin.h>
   1893 ///
   1894 /// \code
   1895 /// int _mm_cmpistro(__m128i A, __m128i B, const int M);
   1896 /// \endcode
   1897 ///
   1898 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
   1899 /// instruction.
   1900 ///
   1901 /// \param A
   1902 ///    A 128-bit integer vector containing one of the source operands to be
   1903 ///    compared.
   1904 /// \param B
   1905 ///    A 128-bit integer vector containing one of the source operands to be
   1906 ///    compared.
   1907 /// \param M
   1908 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1909 ///    words and the type of comparison to perform. \n
   1910 ///    Bits [1:0]: Determine source data format. \n
   1911 ///      00: 16 unsigned bytes \n
   1912 ///      01: 8 unsigned words \n
   1913 ///      10: 16 signed bytes \n
   1914 ///      11: 8 signed words \n
   1915 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1916 ///      00: Subset: Each character in \a B is compared for equality with all
   1917 ///          the characters in \a A. \n
   1918 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1919 ///          basis is greater than or equal for even-indexed elements in \a A,
   1920 ///          and less than or equal for odd-indexed elements in \a A. \n
   1921 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1922 ///          \a B for equality. \n
   1923 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1924 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1925 ///                mask of the comparison results. \n
   1926 ///      00: No effect. \n
   1927 ///      01: Negate the bit mask. \n
   1928 ///      10: No effect. \n
   1929 ///      11: Negate the bit mask only for bits with an index less than or equal
   1930 ///          to the size of \a A or \a B.
   1931 /// \returns Bit 0 of the resulting bit mask.
   1932 #define _mm_cmpistro(A, B, M)                                                  \
   1933   ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A),                    \
   1934                                      (__v16qi)(__m128i)(B), (int)(M)))
   1935 
   1936 /// Uses the immediate operand \a M to perform a comparison of string
   1937 ///    data with implicitly defined lengths that is contained in source operands
   1938 ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
   1939 ///    the maximum, otherwise, returns 0.
   1940 ///
   1941 /// \headerfile <x86intrin.h>
   1942 ///
   1943 /// \code
   1944 /// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
   1945 /// \endcode
   1946 ///
   1947 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
   1948 /// instruction.
   1949 ///
   1950 /// \param A
   1951 ///    A 128-bit integer vector containing one of the source operands to be
   1952 ///    compared.
   1953 /// \param B
   1954 ///    A 128-bit integer vector containing one of the source operands to be
   1955 ///    compared.
   1956 /// \param M
   1957 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   1958 ///    words and the type of comparison to perform. \n
   1959 ///    Bits [1:0]: Determine source data format. \n
   1960 ///      00: 16 unsigned bytes \n
   1961 ///      01: 8 unsigned words \n
   1962 ///      10: 16 signed bytes \n
   1963 ///      11: 8 signed words \n
   1964 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   1965 ///      00: Subset: Each character in \a B is compared for equality with all
   1966 ///          the characters in \a A. \n
   1967 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   1968 ///          basis is greater than or equal for even-indexed elements in \a A,
   1969 ///          and less than or equal for odd-indexed elements in \a A. \n
   1970 ///      10: Match: Compare each pair of corresponding characters in \a A and
   1971 ///          \a B for equality. \n
   1972 ///      11: Substring: Search \a B for substring matches of \a A. \n
   1973 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   1974 ///                mask of the comparison results. \n
   1975 ///      00: No effect. \n
   1976 ///      01: Negate the bit mask. \n
   1977 ///      10: No effect. \n
   1978 ///      11: Negate the bit mask only for bits with an index less than or equal
   1979 ///          to the size of \a A or \a B.
   1980 /// \returns 1 if the length of the string in \a A is less than the maximum
   1981 ///    (16 bytes or 8 words); otherwise, 0.
   1982 #define _mm_cmpistrs(A, B, M)                                                  \
   1983   ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
   1984                                      (__v16qi)(__m128i)(B), (int)(M)))
   1985 
   1986 /// Uses the immediate operand \a M to perform a comparison of string
   1987 ///    data with implicitly defined lengths that is contained in source operands
   1988 ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
   1989 ///    the maximum, otherwise, returns 0.
   1990 ///
   1991 /// \headerfile <x86intrin.h>
   1992 ///
   1993 /// \code
   1994 /// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
   1995 /// \endcode
   1996 ///
   1997 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
   1998 /// instruction.
   1999 ///
   2000 /// \param A
   2001 ///    A 128-bit integer vector containing one of the source operands to be
   2002 ///    compared.
   2003 /// \param B
   2004 ///    A 128-bit integer vector containing one of the source operands to be
   2005 ///    compared.
   2006 /// \param M
   2007 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   2008 ///    words and the type of comparison to perform. \n
   2009 ///    Bits [1:0]: Determine source data format. \n
   2010 ///      00: 16 unsigned bytes \n
   2011 ///      01: 8 unsigned words \n
   2012 ///      10: 16 signed bytes \n
   2013 ///      11: 8 signed words \n
   2014 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   2015 ///      00: Subset: Each character in \a B is compared for equality with all
   2016 ///          the characters in \a A. \n
   2017 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   2018 ///          basis is greater than or equal for even-indexed elements in \a A,
   2019 ///          and less than or equal for odd-indexed elements in \a A. \n
   2020 ///      10: Match: Compare each pair of corresponding characters in \a A and
   2021 ///          \a B for equality. \n
   2022 ///      11: Substring: Search \a B for substring matches of \a A. \n
   2023 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   2024 ///                mask of the comparison results. \n
   2025 ///      00: No effect. \n
   2026 ///      01: Negate the bit mask. \n
   2027 ///      10: No effect. \n
   2028 ///      11: Negate the bit mask only for bits with an index less than or equal
   2029 ///          to the size of \a A or \a B.
   2030 /// \returns 1 if the length of the string in \a B is less than the maximum
   2031 ///    (16 bytes or 8 words); otherwise, 0.
   2032 #define _mm_cmpistrz(A, B, M)                                                  \
   2033   ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A),                    \
   2034                                      (__v16qi)(__m128i)(B), (int)(M)))
   2035 
   2036 /// Uses the immediate operand \a M to perform a comparison of string
   2037 ///    data with explicitly defined lengths that is contained in source operands
   2038 ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
   2039 ///    string in \a B is the maximum, otherwise, returns 0.
   2040 ///
   2041 /// \headerfile <x86intrin.h>
   2042 ///
   2043 /// \code
   2044 /// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
   2045 /// \endcode
   2046 ///
   2047 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
   2048 /// instruction.
   2049 ///
   2050 /// \param A
   2051 ///    A 128-bit integer vector containing one of the source operands to be
   2052 ///    compared.
   2053 /// \param LA
   2054 ///    An integer that specifies the length of the string in \a A.
   2055 /// \param B
   2056 ///    A 128-bit integer vector containing one of the source operands to be
   2057 ///    compared.
   2058 /// \param LB
   2059 ///    An integer that specifies the length of the string in \a B.
   2060 /// \param M
   2061 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   2062 ///    words and the type of comparison to perform. \n
   2063 ///    Bits [1:0]: Determine source data format. \n
   2064 ///      00: 16 unsigned bytes \n
   2065 ///      01: 8 unsigned words \n
   2066 ///      10: 16 signed bytes \n
   2067 ///      11: 8 signed words \n
   2068 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   2069 ///      00: Subset: Each character in \a B is compared for equality with all
   2070 ///          the characters in \a A. \n
   2071 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   2072 ///          basis is greater than or equal for even-indexed elements in \a A,
   2073 ///          and less than or equal for odd-indexed elements in \a A. \n
   2074 ///      10: Match: Compare each pair of corresponding characters in \a A and
   2075 ///          \a B for equality. \n
   2076 ///      11: Substring: Search \a B for substring matches of \a A. \n
   2077 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   2078 ///                mask of the comparison results. \n
   2079 ///      00: No effect. \n
   2080 ///      01: Negate the bit mask. \n
   2081 ///      10: No effect. \n
   2082 ///      11: Negate the bit mask only for bits with an index less than or equal
   2083 ///          to the size of \a A or \a B.
   2084 /// \returns 1 if the bit mask is zero and the length of the string in \a B
   2085 ///    is the maximum (16 bytes or 8 words); otherwise, 0.
   2086 #define _mm_cmpestra(A, LA, B, LB, M)                                          \
   2087   ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
   2088                                      (__v16qi)(__m128i)(B), (int)(LB),         \
   2089                                      (int)(M)))
   2090 
   2091 /// Uses the immediate operand \a M to perform a comparison of string
   2092 ///    data with explicitly defined lengths that is contained in source operands
   2093 ///    \a A and \a B. Returns 1 if the resulting bit mask is non-zero, otherwise,
   2094 ///    returns 0.
   2095 ///
   2096 /// \headerfile <x86intrin.h>
   2097 ///
   2098 /// \code
   2099 /// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
   2100 /// \endcode
   2101 ///
   2102 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
   2103 /// instruction.
   2104 ///
   2105 /// \param A
   2106 ///    A 128-bit integer vector containing one of the source operands to be
   2107 ///    compared.
   2108 /// \param LA
   2109 ///    An integer that specifies the length of the string in \a A.
   2110 /// \param B
   2111 ///    A 128-bit integer vector containing one of the source operands to be
   2112 ///    compared.
   2113 /// \param LB
   2114 ///    An integer that specifies the length of the string in \a B.
   2115 /// \param M
   2116 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   2117 ///    words and the type of comparison to perform. \n
   2118 ///    Bits [1:0]: Determine source data format. \n
   2119 ///      00: 16 unsigned bytes \n
   2120 ///      01: 8 unsigned words \n
   2121 ///      10: 16 signed bytes \n
   2122 ///      11: 8 signed words \n
   2123 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   2124 ///      00: Subset: Each character in \a B is compared for equality with all
   2125 ///          the characters in \a A. \n
   2126 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   2127 ///          basis is greater than or equal for even-indexed elements in \a A,
   2128 ///          and less than or equal for odd-indexed elements in \a A. \n
   2129 ///      10: Match: Compare each pair of corresponding characters in \a A and
   2130 ///          \a B for equality. \n
   2131 ///      11: Substring: Search \a B for substring matches of \a A. \n
   2132 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   2133 ///                mask of the comparison results. \n
   2134 ///      00: No effect. \n
   2135 ///      01: Negate the bit mask. \n
   2136 ///      10: No effect. \n
   2137 ///      11: Negate the bit mask only for bits with an index less than or equal
   2138 ///          to the size of \a A or \a B.
   2139 /// \returns 1 if the resulting bit mask is non-zero; otherwise, 0.
   2140 #define _mm_cmpestrc(A, LA, B, LB, M)                                          \
   2141   ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
   2142                                      (__v16qi)(__m128i)(B), (int)(LB),         \
   2143                                      (int)(M)))
   2144 
   2145 /// Uses the immediate operand \a M to perform a comparison of string
   2146 ///    data with explicitly defined lengths that is contained in source operands
   2147 ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
   2148 ///
   2149 /// \headerfile <x86intrin.h>
   2150 ///
   2151 /// \code
   2152 /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
   2153 /// \endcode
   2154 ///
   2155 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
   2156 /// instruction.
   2157 ///
   2158 /// \param A
   2159 ///    A 128-bit integer vector containing one of the source operands to be
   2160 ///    compared.
   2161 /// \param LA
   2162 ///    An integer that specifies the length of the string in \a A.
   2163 /// \param B
   2164 ///    A 128-bit integer vector containing one of the source operands to be
   2165 ///    compared.
   2166 /// \param LB
   2167 ///    An integer that specifies the length of the string in \a B.
   2168 /// \param M
   2169 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   2170 ///    words and the type of comparison to perform. \n
   2171 ///    Bits [1:0]: Determine source data format. \n
   2172 ///      00: 16 unsigned bytes \n
   2173 ///      01: 8 unsigned words \n
   2174 ///      10: 16 signed bytes \n
   2175 ///      11: 8 signed words \n
   2176 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   2177 ///      00: Subset: Each character in \a B is compared for equality with all
   2178 ///          the characters in \a A. \n
   2179 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   2180 ///          basis is greater than or equal for even-indexed elements in \a A,
   2181 ///          and less than or equal for odd-indexed elements in \a A. \n
   2182 ///      10: Match: Compare each pair of corresponding characters in \a A and
   2183 ///          \a B for equality. \n
   2184 ///      11: Substring: Search \a B for substring matches of \a A. \n
   2185 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   2186 ///                mask of the comparison results. \n
   2187 ///      00: No effect. \n
   2188 ///      01: Negate the bit mask. \n
   2189 ///      10: No effect. \n
   2190 ///      11: Negate the bit mask only for bits with an index less than or equal
   2191 ///          to the size of \a A or \a B.
   2192 /// \returns Bit 0 of the resulting bit mask.
   2193 #define _mm_cmpestro(A, LA, B, LB, M)                                          \
   2194   ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA),         \
   2195                                      (__v16qi)(__m128i)(B), (int)(LB),         \
   2196                                      (int)(M)))
   2197 
   2198 /// Uses the immediate operand \a M to perform a comparison of string
   2199 ///    data with explicitly defined lengths that is contained in source operands
   2200 ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
   2201 ///    the maximum, otherwise, returns 0.
   2202 ///
   2203 /// \headerfile <x86intrin.h>
   2204 ///
   2205 /// \code
   2206 /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
   2207 /// \endcode
   2208 ///
   2209 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
   2210 /// instruction.
   2211 ///
   2212 /// \param A
   2213 ///    A 128-bit integer vector containing one of the source operands to be
   2214 ///    compared.
   2215 /// \param LA
   2216 ///    An integer that specifies the length of the string in \a A.
   2217 /// \param B
   2218 ///    A 128-bit integer vector containing one of the source operands to be
   2219 ///    compared.
   2220 /// \param LB
   2221 ///    An integer that specifies the length of the string in \a B.
   2222 /// \param M
   2223 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   2224 ///    words and the type of comparison to perform. \n
   2225 ///    Bits [1:0]: Determine source data format. \n
   2226 ///      00: 16 unsigned bytes \n
   2227 ///      01: 8 unsigned words \n
   2228 ///      10: 16 signed bytes \n
   2229 ///      11: 8 signed words \n
   2230 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   2231 ///      00: Subset: Each character in \a B is compared for equality with all
   2232 ///          the characters in \a A. \n
   2233 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   2234 ///          basis is greater than or equal for even-indexed elements in \a A,
   2235 ///          and less than or equal for odd-indexed elements in \a A. \n
   2236 ///      10: Match: Compare each pair of corresponding characters in \a A and
   2237 ///          \a B for equality. \n
   2238 ///      11: Substring: Search \a B for substring matches of \a A. \n
   2239 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   2240 ///                mask of the comparison results. \n
   2241 ///      00: No effect. \n
   2242 ///      01: Negate the bit mask. \n
   2243 ///      10: No effect. \n
   2244 ///      11: Negate the bit mask only for bits with an index less than or equal
   2245 ///          to the size of \a A or \a B.
   2246 /// \returns 1 if the length of the string in \a A is less than the maximum
   2247 ///    (16 bytes or 8 words); otherwise, 0.
   2248 #define _mm_cmpestrs(A, LA, B, LB, M)                                          \
   2249   ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA),         \
   2250                                      (__v16qi)(__m128i)(B), (int)(LB),         \
   2251                                      (int)(M)))
   2252 
   2253 /// Uses the immediate operand \a M to perform a comparison of string
   2254 ///    data with explicitly defined lengths that is contained in source operands
   2255 ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
   2256 ///    the maximum, otherwise, returns 0.
   2257 ///
   2258 /// \headerfile <x86intrin.h>
   2259 ///
   2260 /// \code
   2261 /// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
   2262 /// \endcode
   2263 ///
   2264 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
   2265 ///
   2266 /// \param A
   2267 ///    A 128-bit integer vector containing one of the source operands to be
   2268 ///    compared.
   2269 /// \param LA
   2270 ///    An integer that specifies the length of the string in \a A.
   2271 /// \param B
   2272 ///    A 128-bit integer vector containing one of the source operands to be
   2273 ///    compared.
   2274 /// \param LB
   2275 ///    An integer that specifies the length of the string in \a B.
   2276 /// \param M
   2277 ///    An 8-bit immediate operand specifying whether the characters are bytes or
   2278 ///    words and the type of comparison to perform. \n
   2279 ///    Bits [1:0]: Determine source data format. \n
   2280 ///      00: 16 unsigned bytes \n
   2281 ///      01: 8 unsigned words \n
   2282 ///      10: 16 signed bytes \n
   2283 ///      11: 8 signed words \n
   2284 ///    Bits [3:2]: Determine comparison type and aggregation method. \n
   2285 ///      00: Subset: Each character in \a B is compared for equality with all
   2286 ///          the characters in \a A. \n
   2287 ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
   2288 ///          basis is greater than or equal for even-indexed elements in \a A,
   2289 ///          and less than or equal for odd-indexed elements in \a A. \n
   2290 ///      10: Match: Compare each pair of corresponding characters in \a A and
   2291 ///          \a B for equality. \n
   2292 ///      11: Substring: Search \a B for substring matches of \a A. \n
   2293 ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
   2294 ///                mask of the comparison results. \n
   2295 ///      00: No effect. \n
   2296 ///      01: Negate the bit mask. \n
   2297 ///      10: No effect. \n
   2298 ///      11: Negate the bit mask only for bits with an index less than or equal
   2299 ///          to the size of \a A or \a B.
   2300 /// \returns 1 if the length of the string in \a B is less than the maximum
   2301 ///    (16 bytes or 8 words); otherwise, 0.
   2302 #define _mm_cmpestrz(A, LA, B, LB, M)                                          \
   2303   ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA),         \
   2304                                      (__v16qi)(__m128i)(B), (int)(LB),         \
   2305                                      (int)(M)))
   2306 
   2307 /* SSE4.2 Compare Packed Data -- Greater Than. */
   2308 /// Compares each of the corresponding 64-bit values of the 128-bit
   2309 ///    integer vectors to determine if the values in the first operand
   2310 ///    (\a __V1) are greater than those in the second operand (\a __V2).
   2311 ///
   2312 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
   2313 ///
   2314 /// \headerfile <x86intrin.h>
   2315 ///
   2316 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
   2317 ///
   2318 /// \param __V1
   2319 ///    A 128-bit integer vector; the left-hand side of the comparison.
   2320 /// \param __V2
   2321 ///    A 128-bit integer vector; the right-hand side of the comparison.
   2322 /// \returns A 128-bit integer vector containing the two 64-bit comparison results.
   2323 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
   2324                                                              __m128i __V2) {
   2325   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
   2326 }
   2327 
   2328 #undef __DEFAULT_FN_ATTRS
   2329 
   2330 #include <popcntintrin.h>
   2331 
   2332 #include <crc32intrin.h>
   2333 
   2334 #endif /* __SMMINTRIN_H */