zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx2intrin.h (192637B) - Raw


      1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
     12 #endif
     13 
     14 #ifndef __AVX2INTRIN_H
     15 #define __AVX2INTRIN_H
     16 
     17 /* Default attributes for the functions in this file; the 256 and 128 variants differ only in __min_vector_width__. */
     18 #if defined(__EVEX512__) && !defined(__AVX10_1_512__) /* also pass "no-evex512" in the target string */
     19 #define __DEFAULT_FN_ATTRS256                                                  \
     20   __attribute__((__always_inline__, __nodebug__,                               \
     21                  __target__("avx2,no-evex512"), __min_vector_width__(256)))
     22 #define __DEFAULT_FN_ATTRS128                                                  \
     23   __attribute__((__always_inline__, __nodebug__,                               \
     24                  __target__("avx2,no-evex512"), __min_vector_width__(128)))
     25 #else /* plain "avx2" target string */
     26 #define __DEFAULT_FN_ATTRS256                                                  \
     27   __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
     28                  __min_vector_width__(256)))
     29 #define __DEFAULT_FN_ATTRS128                                                  \
     30   __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
     31                  __min_vector_width__(128)))
     32 #endif /* defined(__EVEX512__) && !defined(__AVX10_1_512__) */
     33 
     34 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
     35 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
     36 ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
     37 ///    \a Y.
     38 ///
     39 ///    Eight SAD results are computed using the lower half of the input
     40 ///    vectors, and another eight using the upper half. These 16-bit values
     41 ///    are returned in the lower and upper halves of the 256-bit result,
     42 ///    respectively.
     43 ///
     44 ///    A single SAD operation selects four bytes from \a X and four bytes from
     45 ///    \a Y as input. It computes the differences between each \a X byte and
     46 ///    the corresponding \a Y byte, takes the absolute value of each
     47 ///    difference, and sums these four values to form one 16-bit result. The
     48 ///    intrinsic computes 16 of these results with different sets of input
     49 ///    bytes.
     50 ///
     51 ///    For each set of eight results, the SAD operations use the same four
     52 ///    bytes from \a Y; the starting bit position for these four bytes is
     53 ///    specified by \a M[1:0] times 32. The eight operations use successive
     54 ///    sets of four bytes from \a X; the starting bit position for the first
     55 ///    set of four bytes is specified by \a M[2] times 32. These bit positions
     56 ///    are all relative to the 128-bit lane for each set of eight operations.
     57 ///
     58 /// \code{.operation}
     59 /// r := 0
     60 /// FOR i := 0 TO 1
     61 ///   j := i*3
     62 ///   Ybase := M[j+1:j]*32 + i*128
     63 ///   Xbase := M[j+2]*32 + i*128
     64 ///   FOR k := 0 TO 7
     65 ///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
     66 ///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
     67 ///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
     68 ///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
     69 ///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
     70 ///     Xbase := Xbase + 8
     71 ///     r := r + 16
     72 ///   ENDFOR
     73 /// ENDFOR
     74 /// \endcode
     75 ///
     76 /// \headerfile <immintrin.h>
     77 ///
     78 /// \code
     79 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
     80 /// \endcode
     81 ///
     82 /// This intrinsic corresponds to the \c VMPSADBW instruction.
     83 ///
     84 /// \param X
     85 ///    A 256-bit integer vector containing one of the inputs.
     86 /// \param Y
     87 ///    A 256-bit integer vector containing one of the inputs.
     88 /// \param M
     89 ///     An unsigned immediate value specifying the starting positions of the
     90 ///     bytes to operate on.
     91 /// \returns A 256-bit vector of [16 x i16] containing the result.
     92 #define _mm256_mpsadbw_epu8(X, Y, M) \
     93   ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
     94                                       (__v32qi)(__m256i)(Y), (int)(M)))
     95 
     96 /// Computes the absolute value of each signed byte in the 256-bit integer
     97 ///    vector \a __a and returns each value in the corresponding byte of
     98 ///    the result.
     99 ///
    100 /// \headerfile <immintrin.h>
    101 ///
    102 /// This intrinsic corresponds to the \c VPABSB instruction.
    103 ///
    104 /// \param __a
    105 ///    A 256-bit integer vector.
    106 /// \returns A 256-bit integer vector containing the result.
    107 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    108 _mm256_abs_epi8(__m256i __a) {
    109   __v32qs __src = (__v32qs)__a;
    110   return (__m256i)__builtin_elementwise_abs(__src);
    111 }
    112 
    113 /// Computes the absolute value of each signed 16-bit element in the 256-bit
    114 ///    vector of [16 x i16] in \a __a and returns each value in the
    115 ///    corresponding element of the result.
    116 ///
    117 /// \headerfile <immintrin.h>
    118 ///
    119 /// This intrinsic corresponds to the \c VPABSW instruction.
    120 ///
    121 /// \param __a
    122 ///    A 256-bit vector of [16 x i16].
    123 /// \returns A 256-bit vector of [16 x i16] containing the result.
    124 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    125 _mm256_abs_epi16(__m256i __a) {
    126   __v16hi __src = (__v16hi)__a;
    127   return (__m256i)__builtin_elementwise_abs(__src);
    128 }
    129 
    130 /// Computes the absolute value of each signed 32-bit element in the 256-bit
    131 ///    vector of [8 x i32] in \a __a and returns each value in the
    132 ///    corresponding element of the result.
    133 ///
    134 /// \headerfile <immintrin.h>
    135 ///
    136 /// This intrinsic corresponds to the \c VPABSD instruction.
    137 ///
    138 /// \param __a
    139 ///    A 256-bit vector of [8 x i32].
    140 /// \returns A 256-bit vector of [8 x i32] containing the result.
    141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    142 _mm256_abs_epi32(__m256i __a) {
    143   __v8si __src = (__v8si)__a;
    144   return (__m256i)__builtin_elementwise_abs(__src);
    145 }
    146 
    147 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
    148 ///    integers using signed saturation, and returns the 256-bit result.
    149 ///
    150 /// \code{.operation}
    151 /// FOR i := 0 TO 7
    152 ///   j := i*16
    153 ///   k := i*8
    154 ///   result[7+k:k] := SATURATE8(__a[15+j:j])
    155 ///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
    156 ///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
    157 ///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
    158 /// ENDFOR
    159 /// \endcode
    160 ///
    161 /// \headerfile <immintrin.h>
    162 ///
    163 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
    164 ///
    165 /// \param __a
    166 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
    167 ///    result[191:128].
    168 /// \param __b
    169 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
    170 ///    result[255:192].
    171 /// \returns A 256-bit integer vector containing the result.
    172 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    173 _mm256_packs_epi16(__m256i __a, __m256i __b) {
    174   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
    175   return (__m256i)__builtin_ia32_packsswb256(__lhs, __rhs);
    176 }
    177 
    178 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
    179 ///    integers using signed saturation, and returns the resulting 256-bit
    180 ///    vector of [16 x i16].
    181 ///
    182 /// \code{.operation}
    183 /// FOR i := 0 TO 3
    184 ///   j := i*32
    185 ///   k := i*16
    186 ///   result[15+k:k] := SATURATE16(__a[31+j:j])
    187 ///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
    188 ///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
    189 ///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
    190 /// ENDFOR
    191 /// \endcode
    192 ///
    193 /// \headerfile <immintrin.h>
    194 ///
    195 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
    196 ///
    197 /// \param __a
    198 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
    199 ///    result[191:128].
    200 /// \param __b
    201 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
    202 ///    result[255:192].
    203 /// \returns A 256-bit vector of [16 x i16] containing the result.
    204 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    205 _mm256_packs_epi32(__m256i __a, __m256i __b) {
    206   __v8si __lhs = (__v8si)__a, __rhs = (__v8si)__b;
    207   return (__m256i)__builtin_ia32_packssdw256(__lhs, __rhs);
    208 }
    209 
    210 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
    211 ///    using unsigned saturation, and returns the 256-bit result.
    212 ///
    213 /// \code{.operation}
    214 /// FOR i := 0 TO 7
    215 ///   j := i*16
    216 ///   k := i*8
    217 ///   result[7+k:k] := SATURATE8U(__a[15+j:j])
    218 ///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
    219 ///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
    220 ///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
    221 /// ENDFOR
    222 /// \endcode
    223 ///
    224 /// \headerfile <immintrin.h>
    225 ///
    226 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
    227 ///
    228 /// \param __a
    229 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
    230 ///    result[191:128].
    231 /// \param __b
    232 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
    233 ///    result[255:192].
    234 /// \returns A 256-bit integer vector containing the result.
    235 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    236 _mm256_packus_epi16(__m256i __a, __m256i __b) {
    237   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
    238   return (__m256i)__builtin_ia32_packuswb256(__lhs, __rhs);
    239 }
    240 
    241 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
    242 ///    using unsigned saturation, and returns the resulting 256-bit vector of
    243 ///    [16 x i16].
    244 ///
    245 /// \code{.operation}
    246 /// FOR i := 0 TO 3
    247 ///   j := i*32
    248 ///   k := i*16
    249 ///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
    250 ///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
    251 ///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
    252 ///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
    253 /// ENDFOR
    254 /// \endcode
    255 ///
    256 /// \headerfile <immintrin.h>
    257 ///
    258 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
    259 ///
    260 /// \param __V1
    261 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
    262 ///    result[191:128].
    263 /// \param __V2
    264 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
    265 ///    result[255:192].
    266 /// \returns A 256-bit vector of [16 x i16] containing the result.
    267 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    268 _mm256_packus_epi32(__m256i __V1, __m256i __V2) {
    269   __v8si __lhs = (__v8si)__V1, __rhs = (__v8si)__V2;
    270   return (__m256i)__builtin_ia32_packusdw256(__lhs, __rhs);
    271 }
    272 
    273 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
    274 ///    vectors and returns the lower 8 bits of each sum in the corresponding
    275 ///    byte of the 256-bit integer vector result (overflow is ignored).
    276 ///
    277 /// \headerfile <immintrin.h>
    278 ///
    279 /// This intrinsic corresponds to the \c VPADDB instruction.
    280 ///
    281 /// \param __a
    282 ///    A 256-bit integer vector containing one of the source operands.
    283 /// \param __b
    284 ///    A 256-bit integer vector containing one of the source operands.
    285 /// \returns A 256-bit integer vector containing the sums.
    286 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    287 _mm256_add_epi8(__m256i __a, __m256i __b) {
    288   __v32qu __lhs = (__v32qu)__a, __rhs = (__v32qu)__b;
    289   return (__m256i)(__lhs + __rhs);
    290 }
    291 
    292 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
    293 ///    [16 x i16] and returns the lower 16 bits of each sum in the
    294 ///    corresponding element of the [16 x i16] result (overflow is ignored).
    295 ///
    296 /// \headerfile <immintrin.h>
    297 ///
    298 /// This intrinsic corresponds to the \c VPADDW instruction.
    299 ///
    300 /// \param __a
    301 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    302 /// \param __b
    303 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    304 /// \returns A 256-bit vector of [16 x i16] containing the sums.
    305 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    306 _mm256_add_epi16(__m256i __a, __m256i __b) {
    307   __v16hu __lhs = (__v16hu)__a, __rhs = (__v16hu)__b;
    308   return (__m256i)(__lhs + __rhs);
    309 }
    310 
    311 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
    312 ///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
    313 ///    element of the [8 x i32] result (overflow is ignored).
    314 ///
    315 /// \headerfile <immintrin.h>
    316 ///
    317 /// This intrinsic corresponds to the \c VPADDD instruction.
    318 ///
    319 /// \param __a
    320 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
    321 /// \param __b
    322 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
    323 /// \returns A 256-bit vector of [8 x i32] containing the sums.
    324 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    325 _mm256_add_epi32(__m256i __a, __m256i __b) {
    326   __v8su __lhs = (__v8su)__a, __rhs = (__v8su)__b;
    327   return (__m256i)(__lhs + __rhs);
    328 }
    329 
    330 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
    331 ///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
    332 ///    element of the [4 x i64] result (overflow is ignored).
    333 ///
    334 /// \headerfile <immintrin.h>
    335 ///
    336 /// This intrinsic corresponds to the \c VPADDQ instruction.
    337 ///
    338 /// \param __a
    339 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
    340 /// \param __b
    341 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
    342 /// \returns A 256-bit vector of [4 x i64] containing the sums.
    343 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    344 _mm256_add_epi64(__m256i __a, __m256i __b) {
    345   __v4du __lhs = (__v4du)__a, __rhs = (__v4du)__b;
    346   return (__m256i)(__lhs + __rhs);
    347 }
    348 
    349 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
    350 ///    vectors using signed saturation, and returns each sum in the
    351 ///    corresponding byte of the 256-bit integer vector result.
    352 ///
    353 /// \headerfile <immintrin.h>
    354 ///
    355 /// This intrinsic corresponds to the \c VPADDSB instruction.
    356 ///
    357 /// \param __a
    358 ///    A 256-bit integer vector containing one of the source operands.
    359 /// \param __b
    360 ///    A 256-bit integer vector containing one of the source operands.
    361 /// \returns A 256-bit integer vector containing the sums.
    362 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    363 _mm256_adds_epi8(__m256i __a, __m256i __b) {
    364   __v32qs __lhs = (__v32qs)__a, __rhs = (__v32qs)__b;
    365   return (__m256i)__builtin_elementwise_add_sat(__lhs, __rhs);
    366 }
    367 
    368 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
    369 ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
    370 ///
    371 /// \headerfile <immintrin.h>
    372 ///
    373 /// This intrinsic corresponds to the \c VPADDSW instruction.
    374 ///
    375 /// \param __a
    376 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    377 /// \param __b
    378 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    379 /// \returns A 256-bit vector of [16 x i16] containing the sums.
    380 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    381 _mm256_adds_epi16(__m256i __a, __m256i __b) {
    382   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
    383   return (__m256i)__builtin_elementwise_add_sat(__lhs, __rhs);
    384 }
    385 
    386 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
    387 ///    vectors using unsigned saturation, and returns each sum in the
    388 ///    corresponding byte of the 256-bit integer vector result.
    389 ///
    390 /// \headerfile <immintrin.h>
    391 ///
    392 /// This intrinsic corresponds to the \c VPADDUSB instruction.
    393 ///
    394 /// \param __a
    395 ///    A 256-bit integer vector containing one of the source operands.
    396 /// \param __b
    397 ///    A 256-bit integer vector containing one of the source operands.
    398 /// \returns A 256-bit integer vector containing the sums.
    399 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    400 _mm256_adds_epu8(__m256i __a, __m256i __b) {
    401   __v32qu __lhs = (__v32qu)__a, __rhs = (__v32qu)__b;
    402   return (__m256i)__builtin_elementwise_add_sat(__lhs, __rhs);
    403 }
    404 
    405 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
    406 ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
    407 ///
    408 /// \headerfile <immintrin.h>
    409 ///
    410 /// This intrinsic corresponds to the \c VPADDUSW instruction.
    411 ///
    412 /// \param __a
    413 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    414 /// \param __b
    415 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    416 /// \returns A 256-bit vector of [16 x i16] containing the sums.
    417 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    418 _mm256_adds_epu16(__m256i __a, __m256i __b) {
    419   __v16hu __lhs = (__v16hu)__a, __rhs = (__v16hu)__b;
    420   return (__m256i)__builtin_elementwise_add_sat(__lhs, __rhs);
    421 }
    422 
    423 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
    424 ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
    425 ///    as the lower half of the temporary value. Right-shifts the temporary
    426 ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
    427 ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
    428 ///    \a b to make another temporary value, right-shifts it by \a n, and uses
    429 ///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
    430 ///    result.
    431 ///
    432 /// \headerfile <immintrin.h>
    433 ///
    434 /// \code
    435 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
    436 /// \endcode
    437 ///
    438 /// This intrinsic corresponds to the \c VPALIGNR instruction.
    439 ///
    440 /// \param a
    441 ///    A 256-bit integer vector supplying the upper half of each temporary.
    442 /// \param b
    443 ///    A 256-bit integer vector supplying the lower half of each temporary.
    444 /// \param n
    445 ///    An immediate value specifying the number of bytes to shift.
    446 /// \returns A 256-bit integer vector containing the result.
    447 #define _mm256_alignr_epi8(a, b, n) \
    448   ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
    449                                       (__v32qi)(__m256i)(b), (n)))
    450 
    451 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
    452 ///    \a __b.
    453 ///
    454 /// \headerfile <immintrin.h>
    455 ///
    456 /// This intrinsic corresponds to the \c VPAND instruction.
    457 ///
    458 /// \param __a
    459 ///    A 256-bit integer vector.
    460 /// \param __b
    461 ///    A 256-bit integer vector.
    462 /// \returns A 256-bit integer vector containing the result.
    463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    464 _mm256_and_si256(__m256i __a, __m256i __b) {
    465   __v4du __lhs = (__v4du)__a, __rhs = (__v4du)__b;
    466   return (__m256i)(__lhs & __rhs);
    467 }
    468 
    469 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
    470 ///    the bitwise NOT of the 256-bit integer vector in \a __a.
    471 ///
    472 /// \headerfile <immintrin.h>
    473 ///
    474 /// This intrinsic corresponds to the \c VPANDN instruction.
    475 ///
    476 /// \param __a
    477 ///    A 256-bit integer vector.
    478 /// \param __b
    479 ///    A 256-bit integer vector.
    480 /// \returns A 256-bit integer vector containing the result.
    481 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    482 _mm256_andnot_si256(__m256i __a, __m256i __b) {
    483   __v4du __inv = ~(__v4du)__a, __rhs = (__v4du)__b;
    484   return (__m256i)(__inv & __rhs);
    485 }
    486 
    487 /// Computes the averages of the corresponding unsigned bytes in the two
    488 ///    256-bit integer vectors in \a __a and \a __b and returns each
    489 ///    average in the corresponding byte of the 256-bit result.
    490 ///
    491 /// \code{.operation}
    492 /// FOR i := 0 TO 31
    493 ///   j := i*8
    494 ///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
    495 /// ENDFOR
    496 /// \endcode
    497 ///
    498 /// \headerfile <immintrin.h>
    499 ///
    500 /// This intrinsic corresponds to the \c VPAVGB instruction.
    501 ///
    502 /// \param __a
    503 ///    A 256-bit integer vector.
    504 /// \param __b
    505 ///    A 256-bit integer vector.
    506 /// \returns A 256-bit integer vector containing the result.
    507 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    508 _mm256_avg_epu8(__m256i __a, __m256i __b) {
    509   __v32qi __lhs = (__v32qi)__a, __rhs = (__v32qi)__b;
    510   return (__m256i)__builtin_ia32_pavgb256(__lhs, __rhs);
    511 }
    512 
    513 /// Computes the averages of the corresponding unsigned 16-bit integers in
    514 ///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
    515 ///    each average in the corresponding element of the 256-bit result.
    516 ///
    517 /// \code{.operation}
    518 /// FOR i := 0 TO 15
    519 ///   j := i*16
    520 ///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
    521 /// ENDFOR
    522 /// \endcode
    523 ///
    524 /// \headerfile <immintrin.h>
    525 ///
    526 /// This intrinsic corresponds to the \c VPAVGW instruction.
    527 ///
    528 /// \param __a
    529 ///    A 256-bit vector of [16 x i16].
    530 /// \param __b
    531 ///    A 256-bit vector of [16 x i16].
    532 /// \returns A 256-bit vector of [16 x i16] containing the result.
    533 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    534 _mm256_avg_epu16(__m256i __a, __m256i __b) {
    535   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
    536   return (__m256i)__builtin_ia32_pavgw256(__lhs, __rhs);
    537 }
    538 
    539 /// Merges 8-bit integer values from either of the two 256-bit vectors
    540 ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
    541 ///    the resulting 256-bit integer vector.
    542 ///
    543 /// \code{.operation}
    544 /// FOR i := 0 TO 31
    545 ///   j := i*8
    546 ///   IF __M[7+j] == 0
    547 ///     result[7+j:j] := __V1[7+j:j]
    548 ///   ELSE
    549 ///     result[7+j:j] := __V2[7+j:j]
    550 ///   FI
    551 /// ENDFOR
    552 /// \endcode
    553 ///
    554 /// \headerfile <immintrin.h>
    555 ///
    556 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
    557 ///
    558 /// \param __V1
    559 ///    A 256-bit integer vector containing source values.
    560 /// \param __V2
    561 ///    A 256-bit integer vector containing source values.
    562 /// \param __M
    563 ///    A 256-bit integer vector, with bit [7] of each byte specifying the
    564 ///    source for each corresponding byte of the result. When the mask bit
    565 ///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
    566 ///    \a __V2.
    567 /// \returns A 256-bit integer vector containing the result.
    568 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    569 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
    570 {
    571   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
    572                                               (__v32qi)__M);
    573 }
    574 
    575 /// Merges 16-bit integer values from either of the two 256-bit vectors
    576 ///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
    577 ///    and returns the resulting 256-bit vector of [16 x i16].
    578 ///
    579 /// \code{.operation}
    580 /// FOR i := 0 TO 7
    581 ///   j := i*16
    582 ///   IF M[i] == 0
    583 ///     result[15+j:j] := V1[15+j:j]
    584 ///     result[143+j:128+j] := V1[143+j:128+j]
    585 ///   ELSE
    586 ///     result[15+j:j] := V2[15+j:j]
    587 ///     result[143+j:128+j] := V2[143+j:128+j]
    588 ///   FI
    589 /// ENDFOR
    590 /// \endcode
    591 ///
    592 /// \headerfile <immintrin.h>
    593 ///
    594 /// \code
    595 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
    596 /// \endcode
    597 ///
    598 /// This intrinsic corresponds to the \c VPBLENDW instruction.
    599 ///
    600 /// \param V1
    601 ///    A 256-bit vector of [16 x i16] containing source values.
    602 /// \param V2
    603 ///    A 256-bit vector of [16 x i16] containing source values.
    604 /// \param M
    605 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
    606 ///    source for each element of the result. The position of the mask bit
    607 ///    corresponds to the index of a copied value. When a mask bit is 0, the
    608 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
    609 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
    610 ///    elements 1 and 9, and so forth.
    611 /// \returns A 256-bit vector of [16 x i16] containing the result.
    612 #define _mm256_blend_epi16(V1, V2, M) \
    613   ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
    614                                       (__v16hi)(__m256i)(V2), (int)(M)))
    615 
    616 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
    617 ///    \a __b for equality and returns the outcomes in the corresponding
    618 ///    bytes of the 256-bit result.
    619 ///
    620 /// \code{.operation}
    621 /// FOR i := 0 TO 31
    622 ///   j := i*8
    623 ///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
    624 /// ENDFOR
    625 /// \endcode
    626 ///
    627 /// \headerfile <immintrin.h>
    628 ///
    629 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
    630 ///
    631 /// \param __a
    632 ///    A 256-bit integer vector containing one of the inputs.
    633 /// \param __b
    634 ///    A 256-bit integer vector containing one of the inputs.
    635 /// \returns A 256-bit integer vector containing the result.
    636 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    637 _mm256_cmpeq_epi8(__m256i __a, __m256i __b) {
    638   __v32qi __lhs = (__v32qi)__a, __rhs = (__v32qi)__b;
    639   return (__m256i)(__lhs == __rhs);
    640 }
    641 
    642 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
    643 ///    \a __a and \a __b for equality and returns the outcomes in the
    644 ///    corresponding elements of the 256-bit result.
    645 ///
    646 /// \code{.operation}
    647 /// FOR i := 0 TO 15
    648 ///   j := i*16
    649 ///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
    650 /// ENDFOR
    651 /// \endcode
    652 ///
    653 /// \headerfile <immintrin.h>
    654 ///
    655 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
    656 ///
    657 /// \param __a
    658 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
    659 /// \param __b
    660 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
    661 /// \returns A 256-bit vector of [16 x i16] containing the result.
    662 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    663 _mm256_cmpeq_epi16(__m256i __a, __m256i __b) {
    664   __v16hi __lhs = (__v16hi)__a, __rhs = (__v16hi)__b;
    665   return (__m256i)(__lhs == __rhs);
    666 }
    667 
    668 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
    669 ///    \a __a and \a __b for equality and returns the outcomes in the
    670 ///    corresponding elements of the 256-bit result.
    671 ///
    672 /// \code{.operation}
    673 /// FOR i := 0 TO 7
    674 ///   j := i*32
    675 ///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
    676 /// ENDFOR
    677 /// \endcode
    678 ///
    679 /// \headerfile <immintrin.h>
    680 ///
    681 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
    682 ///
    683 /// \param __a
    684 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
    685 /// \param __b
    686 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
    687 /// \returns A 256-bit vector of [8 x i32] containing the result.
    688 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    689 _mm256_cmpeq_epi32(__m256i __a, __m256i __b) {
    690   __v8si __lhs = (__v8si)__a, __rhs = (__v8si)__b;
    691   return (__m256i)(__lhs == __rhs);
    692 }
    693 
    694 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
    695 ///    \a __a and \a __b for equality and returns the outcomes in the
    696 ///    corresponding elements of the 256-bit result.
    697 ///
    698 /// \code{.operation}
    699 /// FOR i := 0 TO 3
    700 ///   j := i*64
    701 ///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
    702 /// ENDFOR
    703 /// \endcode
    704 ///
    705 /// \headerfile <immintrin.h>
    706 ///
    707 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
    708 ///
    709 /// \param __a
    710 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
    711 /// \param __b
    712 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
    713 /// \returns A 256-bit vector of [4 x i64] containing the result.
    714 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    715 _mm256_cmpeq_epi64(__m256i __a, __m256i __b) {
    716   __v4di __lhs = (__v4di)__a, __rhs = (__v4di)__b;
    717   return (__m256i)(__lhs == __rhs);
    718 }
    719 
    720 /// Compares corresponding signed bytes in the 256-bit integer vectors in
    721 ///    \a __a and \a __b for greater-than and returns the outcomes in the
    722 ///    corresponding bytes of the 256-bit result.
    723 ///
    724 /// \code{.operation}
    725 /// FOR i := 0 TO 31
    726 ///   j := i*8
    727 ///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
    728 /// ENDFOR
    729 /// \endcode
    730 ///
    731 /// \headerfile <immintrin.h>
    732 ///
    733 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
    734 ///
    735 /// \param __a
    736 ///    A 256-bit integer vector containing one of the inputs.
    737 /// \param __b
    738 ///    A 256-bit integer vector containing one of the inputs.
    739 /// \returns A 256-bit integer vector containing the result.
    740 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    741 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
    742 {
    743   /* This function always performs a signed comparison, but __v32qi is a char
    744      which may be signed or unsigned, so use __v32qs. */
    745   return (__m256i)((__v32qs)__a > (__v32qs)__b);
    746 }
    747 
    748 /// Compares corresponding signed elements in the 256-bit vectors of
    749 ///    [16 x i16] in \a __a and \a __b for greater-than and returns the
    750 ///    outcomes in the corresponding elements of the 256-bit result.
    751 ///
    752 /// \code{.operation}
    753 /// FOR i := 0 TO 15
    754 ///   j := i*16
    755 ///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
    756 /// ENDFOR
    757 /// \endcode
    758 ///
    759 /// \headerfile <immintrin.h>
    760 ///
    761 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
    762 ///
    763 /// \param __a
    764 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
    765 /// \param __b
    766 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
    767 /// \returns A 256-bit vector of [16 x i16] containing the result.
    768 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    769 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
    770 {
    771   return (__m256i)((__v16hi)__a > (__v16hi)__b);
    772 }
    773 
    774 /// Compares corresponding signed elements in the 256-bit vectors of
    775 ///    [8 x i32] in \a __a and \a __b for greater-than and returns the
    776 ///    outcomes in the corresponding elements of the 256-bit result.
    777 ///
    778 /// \code{.operation}
    779 /// FOR i := 0 TO 7
    780 ///   j := i*32
    781 ///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
    782 /// ENDFOR
    783 /// \endcode
    784 ///
    785 /// \headerfile <immintrin.h>
    786 ///
    787 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
    788 ///
    789 /// \param __a
    790 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
    791 /// \param __b
    792 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
    793 /// \returns A 256-bit vector of [8 x i32] containing the result.
    794 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    795 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
    796 {
    797   return (__m256i)((__v8si)__a > (__v8si)__b);
    798 }
    799 
    800 /// Compares corresponding signed elements in the 256-bit vectors of
    801 ///    [4 x i64] in \a __a and \a __b for greater-than and returns the
    802 ///    outcomes in the corresponding elements of the 256-bit result.
    803 ///
    804 /// \code{.operation}
    805 /// FOR i := 0 TO 3
    806 ///   j := i*64
    807 ///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
    808 /// ENDFOR
    809 /// \endcode
    810 ///
    811 /// \headerfile <immintrin.h>
    812 ///
    813 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
    814 ///
    815 /// \param __a
    816 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
    817 /// \param __b
    818 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
    819 /// \returns A 256-bit vector of [4 x i64] containing the result.
    820 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    821 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
    822 {
    823   return (__m256i)((__v4di)__a > (__v4di)__b);
    824 }
    825 
    826 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
    827 ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
    828 ///    element of the [16 x i16] result (overflow is ignored). Sums from
    829 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
    830 ///    result; sums from \a __b are returned in the upper 64 bits of each
    831 ///    128-bit half of the result.
    832 ///
    833 /// \code{.operation}
    834 /// FOR i := 0 TO 1
    835 ///   j := i*128
    836 ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
    837 ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
    838 ///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
    839 ///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
    840 ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
    841 ///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
    842 ///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
    843 ///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
    844 /// ENDFOR
    845 /// \endcode
    846 ///
    847 /// \headerfile <immintrin.h>
    848 ///
    849 /// This intrinsic corresponds to the \c VPHADDW instruction.
    850 ///
    851 /// \param __a
    852 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    853 /// \param __b
    854 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    855 /// \returns A 256-bit vector of [16 x i16] containing the sums.
    856 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    857 _mm256_hadd_epi16(__m256i __a, __m256i __b)
    858 {
    859     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
    860 }
    861 
    862 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
    863 ///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
    864 ///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
    865 ///    are returned in the lower 64 bits of each 128-bit half of the result;
    866 ///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
    867 ///    of the result.
    868 ///
    869 /// \code{.operation}
    870 /// FOR i := 0 TO 1
    871 ///   j := i*128
    872 ///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
    873 ///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
    874 ///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
    875 ///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
    876 /// ENDFOR
    877 /// \endcode
    878 ///
    879 /// \headerfile <immintrin.h>
    880 ///
    881 /// This intrinsic corresponds to the \c VPHADDD instruction.
    882 ///
    883 /// \param __a
    884 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
    885 /// \param __b
    886 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
    887 /// \returns A 256-bit vector of [8 x i32] containing the sums.
    888 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    889 _mm256_hadd_epi32(__m256i __a, __m256i __b)
    890 {
    891     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
    892 }
    893 
    894 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
    895 ///    vectors of [16 x i16] using signed saturation and returns each sum in
    896 ///    an element of the [16 x i16] result. Sums from \a __a are returned in
    897 ///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
    898 ///    are returned in the upper 64 bits of each 128-bit half of the result.
    899 ///
    900 /// \code{.operation}
    901 /// FOR i := 0 TO 1
    902 ///   j := i*128
    903 ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
    904 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
    905 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
    906 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
    907 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
    908 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
    909 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
    910 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
    911 /// ENDFOR
    912 /// \endcode
    913 ///
    914 /// \headerfile <immintrin.h>
    915 ///
    916 /// This intrinsic corresponds to the \c VPHADDSW instruction.
    917 ///
    918 /// \param __a
    919 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    920 /// \param __b
    921 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    922 /// \returns A 256-bit vector of [16 x i16] containing the sums.
    923 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    924 _mm256_hadds_epi16(__m256i __a, __m256i __b)
    925 {
    926     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
    927 }
    928 
    929 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
    930 ///    vectors of [16 x i16] and returns the lower 16 bits of each difference
    931 ///    in an element of the [16 x i16] result (overflow is ignored).
    932 ///    Differences from \a __a are returned in the lower 64 bits of each
    933 ///    128-bit half of the result; differences from \a __b are returned in the
    934 ///    upper 64 bits of each 128-bit half of the result.
    935 ///
    936 /// \code{.operation}
    937 /// FOR i := 0 TO 1
    938 ///   j := i*128
    939 ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
    940 ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
    941 ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
    942 ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
    943 ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
    944 ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
    945 ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
    946 ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
    947 /// ENDFOR
    948 /// \endcode
    949 ///
    950 /// \headerfile <immintrin.h>
    951 ///
    952 /// This intrinsic corresponds to the \c VPHSUBW instruction.
    953 ///
    954 /// \param __a
    955 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    956 /// \param __b
    957 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
    958 /// \returns A 256-bit vector of [16 x i16] containing the differences.
    959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    960 _mm256_hsub_epi16(__m256i __a, __m256i __b)
    961 {
    962     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
    963 }
    964 
    965 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
    966 ///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
    967 ///    an element of the [8 x i32] result (overflow is ignored). Differences
    968 ///    from \a __a are returned in the lower 64 bits of each 128-bit half of
    969 ///    the result; differences from \a __b are returned in the upper 64 bits
    970 ///    of each 128-bit half of the result.
    971 ///
    972 /// \code{.operation}
    973 /// FOR i := 0 TO 1
    974 ///   j := i*128
    975 ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
    976 ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
    977 ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
    978 ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
    979 /// ENDFOR
    980 /// \endcode
    981 ///
    982 /// \headerfile <immintrin.h>
    983 ///
    984 /// This intrinsic corresponds to the \c VPHSUBD instruction.
    985 ///
    986 /// \param __a
    987 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
    988 /// \param __b
    989 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
    990 /// \returns A 256-bit vector of [8 x i32] containing the differences.
    991 static __inline__ __m256i __DEFAULT_FN_ATTRS256
    992 _mm256_hsub_epi32(__m256i __a, __m256i __b)
    993 {
    994     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
    995 }
    996 
    997 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
    998 ///    vectors of [16 x i16] using signed saturation and returns each sum in
    999 ///    an element of the [16 x i16] result. Differences from \a __a are
   1000 ///    returned in the lower 64 bits of each 128-bit half of the result;
   1001 ///    differences from \a __b are returned in the upper 64 bits of each
   1002 ///    128-bit half of the result.
   1003 ///
   1004 /// \code{.operation}
   1005 /// FOR i := 0 TO 1
   1006 ///   j := i*128
   1007 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
   1008 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
   1009 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
   1010 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
   1011 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
   1012 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
   1013 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
   1014 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
   1015 /// ENDFOR
   1016 /// \endcode
   1017 ///
   1018 /// \headerfile <immintrin.h>
   1019 ///
   1020 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
   1021 ///
   1022 /// \param __a
   1023 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1024 /// \param __b
   1025 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1026 /// \returns A 256-bit vector of [16 x i16] containing the differences.
   1027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1028 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
   1029 {
   1030     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
   1031 }
   1032 
   1033 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
   1034 ///    with the corresponding signed byte from the 256-bit integer vector in
   1035 ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
   1036 ///    pairs of those products using signed saturation to form 16-bit sums
   1037 ///    returned as elements of the [16 x i16] result.
   1038 ///
   1039 /// \code{.operation}
   1040 /// FOR i := 0 TO 15
   1041 ///   j := i*16
   1042 ///   temp1 := __a[j+7:j] * __b[j+7:j]
   1043 ///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
   1044 ///   result[j+15:j] := SATURATE16(temp1 + temp2)
   1045 /// ENDFOR
   1046 /// \endcode
   1047 ///
   1048 /// \headerfile <immintrin.h>
   1049 ///
   1050 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
   1051 ///
   1052 /// \param __a
   1053 ///    A 256-bit vector containing one of the source operands.
   1054 /// \param __b
   1055 ///    A 256-bit vector containing one of the source operands.
   1056 /// \returns A 256-bit vector of [16 x i16] containing the result.
   1057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1058 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
   1059 {
   1060     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
   1061 }
   1062 
   1063 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
   1064 ///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
   1065 ///    those products to form 32-bit sums returned as elements of the
   1066 ///    [8 x i32] result.
   1067 ///
   1068 ///    There is only one wraparound case: when all four of the 16-bit sources
   1069 ///    are \c 0x8000, the result will be \c 0x80000000.
   1070 ///
   1071 /// \code{.operation}
   1072 /// FOR i := 0 TO 7
   1073 ///   j := i*32
   1074 ///   temp1 := __a[j+15:j] * __b[j+15:j]
   1075 ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
   1076 ///   result[j+31:j] := temp1 + temp2
   1077 /// ENDFOR
   1078 /// \endcode
   1079 ///
   1080 /// \headerfile <immintrin.h>
   1081 ///
   1082 /// This intrinsic corresponds to the \c VPMADDWD instruction.
   1083 ///
   1084 /// \param __a
   1085 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1086 /// \param __b
   1087 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1088 /// \returns A 256-bit vector of [8 x i32] containing the result.
   1089 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1090 _mm256_madd_epi16(__m256i __a, __m256i __b)
   1091 {
   1092   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
   1093 }
   1094 
   1095 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
   1096 ///     in \a __a and \a __b and returns the larger of each pair in the
   1097 ///     corresponding byte of the 256-bit result.
   1098 ///
   1099 /// \headerfile <immintrin.h>
   1100 ///
   1101 /// This intrinsic corresponds to the \c VPMAXSB instruction.
   1102 ///
   1103 /// \param __a
   1104 ///    A 256-bit integer vector.
   1105 /// \param __b
   1106 ///    A 256-bit integer vector.
   1107 /// \returns A 256-bit integer vector containing the result.
   1108 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1109 _mm256_max_epi8(__m256i __a, __m256i __b)
   1110 {
   1111   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
   1112 }
   1113 
   1114 /// Compares the corresponding signed 16-bit integers in the two 256-bit
   1115 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
   1116 ///    each pair in the corresponding element of the 256-bit result.
   1117 ///
   1118 /// \headerfile <immintrin.h>
   1119 ///
   1120 /// This intrinsic corresponds to the \c VPMAXSW instruction.
   1121 ///
   1122 /// \param __a
   1123 ///    A 256-bit vector of [16 x i16].
   1124 /// \param __b
   1125 ///    A 256-bit vector of [16 x i16].
   1126 /// \returns A 256-bit vector of [16 x i16] containing the result.
   1127 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1128 _mm256_max_epi16(__m256i __a, __m256i __b)
   1129 {
   1130   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
   1131 }
   1132 
   1133 /// Compares the corresponding signed 32-bit integers in the two 256-bit
   1134 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
   1135 ///    each pair in the corresponding element of the 256-bit result.
   1136 ///
   1137 /// \headerfile <immintrin.h>
   1138 ///
   1139 /// This intrinsic corresponds to the \c VPMAXSD instruction.
   1140 ///
   1141 /// \param __a
   1142 ///    A 256-bit vector of [8 x i32].
   1143 /// \param __b
   1144 ///    A 256-bit vector of [8 x i32].
   1145 /// \returns A 256-bit vector of [8 x i32] containing the result.
   1146 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1147 _mm256_max_epi32(__m256i __a, __m256i __b)
   1148 {
   1149   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
   1150 }
   1151 
   1152 /// Compares the corresponding unsigned bytes in the two 256-bit integer
   1153 ///     vectors in \a __a and \a __b and returns the larger of each pair in
   1154 ///     the corresponding byte of the 256-bit result.
   1155 ///
   1156 /// \headerfile <immintrin.h>
   1157 ///
   1158 /// This intrinsic corresponds to the \c VPMAXUB instruction.
   1159 ///
   1160 /// \param __a
   1161 ///    A 256-bit integer vector.
   1162 /// \param __b
   1163 ///    A 256-bit integer vector.
   1164 /// \returns A 256-bit integer vector containing the result.
   1165 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1166 _mm256_max_epu8(__m256i __a, __m256i __b)
   1167 {
   1168   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
   1169 }
   1170 
   1171 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
   1172 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
   1173 ///    each pair in the corresponding element of the 256-bit result.
   1174 ///
   1175 /// \headerfile <immintrin.h>
   1176 ///
   1177 /// This intrinsic corresponds to the \c VPMAXUW instruction.
   1178 ///
   1179 /// \param __a
   1180 ///    A 256-bit vector of [16 x i16].
   1181 /// \param __b
   1182 ///    A 256-bit vector of [16 x i16].
   1183 /// \returns A 256-bit vector of [16 x i16] containing the result.
   1184 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1185 _mm256_max_epu16(__m256i __a, __m256i __b)
   1186 {
   1187   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
   1188 }
   1189 
   1190 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
   1191 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
   1192 ///    each pair in the corresponding element of the 256-bit result.
   1193 ///
   1194 /// \headerfile <immintrin.h>
   1195 ///
   1196 /// This intrinsic corresponds to the \c VPMAXUD instruction.
   1197 ///
   1198 /// \param __a
   1199 ///    A 256-bit vector of [8 x i32].
   1200 /// \param __b
   1201 ///    A 256-bit vector of [8 x i32].
   1202 /// \returns A 256-bit vector of [8 x i32] containing the result.
   1203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1204 _mm256_max_epu32(__m256i __a, __m256i __b)
   1205 {
   1206   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
   1207 }
   1208 
   1209 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
   1210 ///     in \a __a and \a __b and returns the smaller of each pair in the
   1211 ///     corresponding byte of the 256-bit result.
   1212 ///
   1213 /// \headerfile <immintrin.h>
   1214 ///
   1215 /// This intrinsic corresponds to the \c VPMINSB instruction.
   1216 ///
   1217 /// \param __a
   1218 ///    A 256-bit integer vector.
   1219 /// \param __b
   1220 ///    A 256-bit integer vector.
   1221 /// \returns A 256-bit integer vector containing the result.
   1222 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1223 _mm256_min_epi8(__m256i __a, __m256i __b)
   1224 {
   1225   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
   1226 }
   1227 
   1228 /// Compares the corresponding signed 16-bit integers in the two 256-bit
   1229 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
   1230 ///    each pair in the corresponding element of the 256-bit result.
   1231 ///
   1232 /// \headerfile <immintrin.h>
   1233 ///
   1234 /// This intrinsic corresponds to the \c VPMINSW instruction.
   1235 ///
   1236 /// \param __a
   1237 ///    A 256-bit vector of [16 x i16].
   1238 /// \param __b
   1239 ///    A 256-bit vector of [16 x i16].
   1240 /// \returns A 256-bit vector of [16 x i16] containing the result.
   1241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1242 _mm256_min_epi16(__m256i __a, __m256i __b)
   1243 {
   1244   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
   1245 }
   1246 
   1247 /// Compares the corresponding signed 32-bit integers in the two 256-bit
   1248 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
   1249 ///    each pair in the corresponding element of the 256-bit result.
   1250 ///
   1251 /// \headerfile <immintrin.h>
   1252 ///
   1253 /// This intrinsic corresponds to the \c VPMINSD instruction.
   1254 ///
   1255 /// \param __a
   1256 ///    A 256-bit vector of [8 x i32].
   1257 /// \param __b
   1258 ///    A 256-bit vector of [8 x i32].
   1259 /// \returns A 256-bit vector of [8 x i32] containing the result.
   1260 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1261 _mm256_min_epi32(__m256i __a, __m256i __b)
   1262 {
   1263   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
   1264 }
   1265 
   1266 /// Compares the corresponding unsigned bytes in the two 256-bit integer
   1267 ///     vectors in \a __a and \a __b and returns the smaller of each pair in
   1268 ///     the corresponding byte of the 256-bit result.
   1269 ///
   1270 /// \headerfile <immintrin.h>
   1271 ///
   1272 /// This intrinsic corresponds to the \c VPMINUB instruction.
   1273 ///
   1274 /// \param __a
   1275 ///    A 256-bit integer vector.
   1276 /// \param __b
   1277 ///    A 256-bit integer vector.
   1278 /// \returns A 256-bit integer vector containing the result.
   1279 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1280 _mm256_min_epu8(__m256i __a, __m256i __b)
   1281 {
   1282   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
   1283 }
   1284 
   1285 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
   1286 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
   1287 ///    each pair in the corresponding element of the 256-bit result.
   1288 ///
   1289 /// \headerfile <immintrin.h>
   1290 ///
   1291 /// This intrinsic corresponds to the \c VPMINUW instruction.
   1292 ///
   1293 /// \param __a
   1294 ///    A 256-bit vector of [16 x i16].
   1295 /// \param __b
   1296 ///    A 256-bit vector of [16 x i16].
   1297 /// \returns A 256-bit vector of [16 x i16] containing the result.
   1298 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1299 _mm256_min_epu16(__m256i __a, __m256i __b)
   1300 {
   1301   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
   1302 }
   1303 
   1304 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
   1305 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
   1306 ///    each pair in the corresponding element of the 256-bit result.
   1307 ///
   1308 /// \headerfile <immintrin.h>
   1309 ///
   1310 /// This intrinsic corresponds to the \c VPMINUD instruction.
   1311 ///
   1312 /// \param __a
   1313 ///    A 256-bit vector of [8 x i32].
   1314 /// \param __b
   1315 ///    A 256-bit vector of [8 x i32].
   1316 /// \returns A 256-bit vector of [8 x i32] containing the result.
   1317 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1318 _mm256_min_epu32(__m256i __a, __m256i __b)
   1319 {
   1320   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
   1321 }
   1322 
   1323 /// Creates a 32-bit integer mask from the most significant bit of each byte
   1324 ///    in the 256-bit integer vector in \a __a and returns the result.
   1325 ///
   1326 /// \code{.operation}
   1327 /// FOR i := 0 TO 31
   1328 ///   j := i*8
   1329 ///   result[i] := __a[j+7]
   1330 /// ENDFOR
   1331 /// \endcode
   1332 ///
   1333 /// \headerfile <immintrin.h>
   1334 ///
   1335 /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
   1336 ///
   1337 /// \param __a
   1338 ///    A 256-bit integer vector containing the source bytes.
   1339 /// \returns The 32-bit integer mask.
   1340 static __inline__ int __DEFAULT_FN_ATTRS256
   1341 _mm256_movemask_epi8(__m256i __a)
   1342 {
   1343   return __builtin_ia32_pmovmskb256((__v32qi)__a);
   1344 }
   1345 
   1346 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
   1347 ///    the 16-bit values in the corresponding elements of a 256-bit vector
   1348 ///    of [16 x i16].
   1349 ///
   1350 /// \code{.operation}
   1351 /// FOR i := 0 TO 15
   1352 ///   j := i*8
   1353 ///   k := i*16
   1354 ///   result[k+15:k] := SignExtend(__V[j+7:j])
   1355 /// ENDFOR
   1356 /// \endcode
   1357 ///
   1358 /// \headerfile <immintrin.h>
   1359 ///
   1360 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
   1361 ///
   1362 /// \param __V
   1363 ///    A 128-bit integer vector containing the source bytes.
   1364 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
   1365 ///    values.
   1366 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1367 _mm256_cvtepi8_epi16(__m128i __V)
   1368 {
   1369   /* This function always performs a signed extension, but __v16qi is a char
   1370      which may be signed or unsigned, so use __v16qs. */
   1371   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
   1372 }
   1373 
   1374 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
   1375 ///    \a __V and returns the 32-bit values in the corresponding elements of a
   1376 ///    256-bit vector of [8 x i32].
   1377 ///
   1378 /// \code{.operation}
   1379 /// FOR i := 0 TO 7
   1380 ///   j := i*8
   1381 ///   k := i*32
   1382 ///   result[k+31:k] := SignExtend(__V[j+7:j])
   1383 /// ENDFOR
   1384 /// \endcode
   1385 ///
   1386 /// \headerfile <immintrin.h>
   1387 ///
   1388 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
   1389 ///
   1390 /// \param __V
   1391 ///    A 128-bit integer vector containing the source bytes.
   1392 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
   1393 ///    values.
   1394 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1395 _mm256_cvtepi8_epi32(__m128i __V)
   1396 {
   1397   /* This function always performs a signed extension, but __v16qi is a char
   1398      which may be signed or unsigned, so use __v16qs. */
   1399   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
   1400 }
   1401 
   1402 /// Sign-extends the first four bytes from the 128-bit integer vector in
   1403 ///    \a __V and returns the 64-bit values in the corresponding elements of a
   1404 ///    256-bit vector of [4 x i64].
   1405 ///
   1406 /// \code{.operation}
   1407 /// result[63:0] := SignExtend(__V[7:0])
   1408 /// result[127:64] := SignExtend(__V[15:8])
   1409 /// result[191:128] := SignExtend(__V[23:16])
   1410 /// result[255:192] := SignExtend(__V[31:24])
   1411 /// \endcode
   1412 ///
   1413 /// \headerfile <immintrin.h>
   1414 ///
   1415 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
   1416 ///
   1417 /// \param __V
   1418 ///    A 128-bit integer vector containing the source bytes.
   1419 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
   1420 ///    values.
   1421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1422 _mm256_cvtepi8_epi64(__m128i __V)
   1423 {
   1424   /* This function always performs a signed extension, but __v16qi is a char
   1425      which may be signed or unsigned, so use __v16qs. */
   1426   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
   1427 }
   1428 
   1429 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
   1430 ///    \a __V and returns the 32-bit values in the corresponding elements of a
   1431 ///    256-bit vector of [8 x i32].
   1432 ///
   1433 /// \code{.operation}
   1434 /// FOR i := 0 TO 7
   1435 ///   j := i*16
   1436 ///   k := i*32
   1437 ///   result[k+31:k] := SignExtend(__V[j+15:j])
   1438 /// ENDFOR
   1439 /// \endcode
   1440 ///
   1441 /// \headerfile <immintrin.h>
   1442 ///
   1443 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
   1444 ///
   1445 /// \param __V
   1446 ///    A 128-bit vector of [8 x i16] containing the source values.
   1447 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
   1448 ///    values.
   1449 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1450 _mm256_cvtepi16_epi32(__m128i __V)
   1451 {
   1452   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
   1453 }
   1454 
   1455 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
   1456 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
   1457 ///    elements of a 256-bit vector of [4 x i64].
   1458 ///
   1459 /// \code{.operation}
   1460 /// result[63:0] := SignExtend(__V[15:0])
   1461 /// result[127:64] := SignExtend(__V[31:16])
   1462 /// result[191:128] := SignExtend(__V[47:32])
   1463 /// result[255:192] := SignExtend(__V[63:48])
   1464 /// \endcode
   1465 ///
   1466 /// \headerfile <immintrin.h>
   1467 ///
   1468 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
   1469 ///
   1470 /// \param __V
   1471 ///    A 128-bit vector of [8 x i16] containing the source values.
   1472 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
   1473 ///    values.
   1474 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1475 _mm256_cvtepi16_epi64(__m128i __V)
   1476 {
   1477   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
   1478 }
   1479 
   1480 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
   1481 ///    \a __V and returns the 64-bit values in the corresponding elements of a
   1482 ///    256-bit vector of [4 x i64].
   1483 ///
   1484 /// \code{.operation}
   1485 /// result[63:0] := SignExtend(__V[31:0])
   1486 /// result[127:64] := SignExtend(__V[63:32])
   1487 /// result[191:128] := SignExtend(__V[95:64])
   1488 /// result[255:192] := SignExtend(__V[127:96])
   1489 /// \endcode
   1490 ///
   1491 /// \headerfile <immintrin.h>
   1492 ///
   1493 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
   1494 ///
   1495 /// \param __V
   1496 ///    A 128-bit vector of [4 x i32] containing the source values.
   1497 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
   1498 ///    values.
   1499 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1500 _mm256_cvtepi32_epi64(__m128i __V)
   1501 {
   1502   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
   1503 }
   1504 
   1505 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
   1506 ///    the 16-bit values in the corresponding elements of a 256-bit vector
   1507 ///    of [16 x i16].
   1508 ///
   1509 /// \code{.operation}
   1510 /// FOR i := 0 TO 15
   1511 ///   j := i*8
   1512 ///   k := i*16
   1513 ///   result[k+15:k] := ZeroExtend(__V[j+7:j])
   1514 /// ENDFOR
   1515 /// \endcode
   1516 ///
   1517 /// \headerfile <immintrin.h>
   1518 ///
   1519 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
   1520 ///
   1521 /// \param __V
   1522 ///    A 128-bit integer vector containing the source bytes.
   1523 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
   1524 ///    values.
   1525 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1526 _mm256_cvtepu8_epi16(__m128i __V)
   1527 {
   1528   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
   1529 }
   1530 
   1531 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
   1532 ///    \a __V and returns the 32-bit values in the corresponding elements of a
   1533 ///    256-bit vector of [8 x i32].
   1534 ///
   1535 /// \code{.operation}
   1536 /// FOR i := 0 TO 7
   1537 ///   j := i*8
   1538 ///   k := i*32
   1539 ///   result[k+31:k] := ZeroExtend(__V[j+7:j])
   1540 /// ENDFOR
   1541 /// \endcode
   1542 ///
   1543 /// \headerfile <immintrin.h>
   1544 ///
   1545 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
   1546 ///
   1547 /// \param __V
   1548 ///    A 128-bit integer vector containing the source bytes.
   1549 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
   1550 ///    values.
   1551 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1552 _mm256_cvtepu8_epi32(__m128i __V)
   1553 {
   1554   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
   1555 }
   1556 
   1557 /// Zero-extends the first four bytes from the 128-bit integer vector in
   1558 ///    \a __V and returns the 64-bit values in the corresponding elements of a
   1559 ///    256-bit vector of [4 x i64].
   1560 ///
   1561 /// \code{.operation}
   1562 /// result[63:0] := ZeroExtend(__V[7:0])
   1563 /// result[127:64] := ZeroExtend(__V[15:8])
   1564 /// result[191:128] := ZeroExtend(__V[23:16])
   1565 /// result[255:192] := ZeroExtend(__V[31:24])
   1566 /// \endcode
   1567 ///
   1568 /// \headerfile <immintrin.h>
   1569 ///
   1570 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
   1571 ///
   1572 /// \param __V
   1573 ///    A 128-bit integer vector containing the source bytes.
   1574 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
   1575 ///    values.
   1576 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1577 _mm256_cvtepu8_epi64(__m128i __V)
   1578 {
   1579   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
   1580 }
   1581 
   1582 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
   1583 ///    \a __V and returns the 32-bit values in the corresponding elements of a
   1584 ///    256-bit vector of [8 x i32].
   1585 ///
   1586 /// \code{.operation}
   1587 /// FOR i := 0 TO 7
   1588 ///   j := i*16
   1589 ///   k := i*32
   1590 ///   result[k+31:k] := ZeroExtend(__V[j+15:j])
   1591 /// ENDFOR
   1592 /// \endcode
   1593 ///
   1594 /// \headerfile <immintrin.h>
   1595 ///
   1596 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
   1597 ///
   1598 /// \param __V
   1599 ///    A 128-bit vector of [8 x i16] containing the source values.
   1600 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
   1601 ///    values.
   1602 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1603 _mm256_cvtepu16_epi32(__m128i __V)
   1604 {
   1605   return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
   1606 }
   1607 
   1608 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
   1609 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
   1610 ///    elements of a 256-bit vector of [4 x i64].
   1611 ///
   1612 /// \code{.operation}
   1613 /// result[63:0] := ZeroExtend(__V[15:0])
   1614 /// result[127:64] := ZeroExtend(__V[31:16])
   1615 /// result[191:128] := ZeroExtend(__V[47:32])
   1616 /// result[255:192] := ZeroExtend(__V[63:48])
   1617 /// \endcode
   1618 ///
   1619 /// \headerfile <immintrin.h>
   1620 ///
   1621 /// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
   1622 ///
   1623 /// \param __V
   1624 ///    A 128-bit vector of [8 x i16] containing the source values.
   1625 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
   1626 ///    values.
   1627 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1628 _mm256_cvtepu16_epi64(__m128i __V)
   1629 {
   1630   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
   1631 }
   1632 
   1633 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
   1634 ///    \a __V and returns the 64-bit values in the corresponding elements of a
   1635 ///    256-bit vector of [4 x i64].
   1636 ///
   1637 /// \code{.operation}
   1638 /// result[63:0] := ZeroExtend(__V[31:0])
   1639 /// result[127:64] := ZeroExtend(__V[63:32])
   1640 /// result[191:128] := ZeroExtend(__V[95:64])
   1641 /// result[255:192] := ZeroExtend(__V[127:96])
   1642 /// \endcode
   1643 ///
   1644 /// \headerfile <immintrin.h>
   1645 ///
   1646 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
   1647 ///
   1648 /// \param __V
   1649 ///    A 128-bit vector of [4 x i32] containing the source values.
   1650 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
   1651 ///    values.
   1652 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1653 _mm256_cvtepu32_epi64(__m128i __V)
   1654 {
   1655   return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
   1656 }
   1657 
   1658 /// Multiplies signed 32-bit integers from even-numbered elements of two
   1659 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
   1660 ///    [4 x i64] result.
   1661 ///
   1662 /// \code{.operation}
   1663 /// result[63:0] := __a[31:0] * __b[31:0]
   1664 /// result[127:64] := __a[95:64] * __b[95:64]
   1665 /// result[191:128] := __a[159:128] * __b[159:128]
   1666 /// result[255:192] := __a[223:192] * __b[223:192]
   1667 /// \endcode
   1668 ///
   1669 /// \headerfile <immintrin.h>
   1670 ///
   1671 /// This intrinsic corresponds to the \c VPMULDQ instruction.
   1672 ///
   1673 /// \param __a
   1674 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
   1675 /// \param __b
   1676 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
   1677 /// \returns A 256-bit vector of [4 x i64] containing the products.
   1678 static __inline__  __m256i __DEFAULT_FN_ATTRS256
   1679 _mm256_mul_epi32(__m256i __a, __m256i __b)
   1680 {
   1681   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
   1682 }
   1683 
   1684 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
   1685 ///    [16 x i16], truncates the 32-bit results to the most significant 18
   1686 ///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
   1687 ///    product in the [16 x i16] result.
   1688 ///
   1689 /// \code{.operation}
   1690 /// FOR i := 0 TO 15
   1691 ///   j := i*16
   1692 ///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
   1693 ///   result[j+15:j] := temp[16:1]
        /// ENDFOR
   1694 /// \endcode
   1695 ///
   1696 /// \headerfile <immintrin.h>
   1697 ///
   1698 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
   1699 ///
   1700 /// \param __a
   1701 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1702 /// \param __b
   1703 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1704 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
   1705 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1706 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
   1707 {
   1708   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
   1709 }
   1710 
   1711 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
   1712 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
   1713 ///    [16 x i16] result.
   1714 ///
   1715 /// \headerfile <immintrin.h>
   1716 ///
   1717 /// This intrinsic corresponds to the \c VPMULHUW instruction.
   1718 ///
   1719 /// \param __a
   1720 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1721 /// \param __b
   1722 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1723 /// \returns A 256-bit vector of [16 x i16] containing the products.
   1724 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1725 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
   1726 {
   1727   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
   1728 }
   1729 
   1730 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
   1731 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
   1732 ///    [16 x i16] result.
   1733 ///
   1734 /// \headerfile <immintrin.h>
   1735 ///
   1736 /// This intrinsic corresponds to the \c VPMULHW instruction.
   1737 ///
   1738 /// \param __a
   1739 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1740 /// \param __b
   1741 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1742 /// \returns A 256-bit vector of [16 x i16] containing the products.
   1743 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1744 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
   1745 {
   1746   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
   1747 }
   1748 
   1749 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
   1750 ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
   1751 ///    [16 x i16] result.
   1752 ///
   1753 /// \headerfile <immintrin.h>
   1754 ///
   1755 /// This intrinsic corresponds to the \c VPMULLW instruction.
   1756 ///
   1757 /// \param __a
   1758 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1759 /// \param __b
   1760 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
   1761 /// \returns A 256-bit vector of [16 x i16] containing the products.
   1762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1763 _mm256_mullo_epi16(__m256i __a, __m256i __b)
   1764 {
   1765   return (__m256i)((__v16hu)__a * (__v16hu)__b);
   1766 }
   1767 
   1768 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
   1769 ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
   1770 ///    [8 x i32] result.
   1771 ///
   1772 /// \headerfile <immintrin.h>
   1773 ///
   1774 /// This intrinsic corresponds to the \c VPMULLD instruction.
   1775 ///
   1776 /// \param __a
   1777 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
   1778 /// \param __b
   1779 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
   1780 /// \returns A 256-bit vector of [8 x i32] containing the products.
   1781 static __inline__  __m256i __DEFAULT_FN_ATTRS256
   1782 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
   1783 {
   1784   return (__m256i)((__v8su)__a * (__v8su)__b);
   1785 }
   1786 
   1787 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
   1788 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
   1789 ///    [4 x i64] result.
   1790 ///
   1791 /// \code{.operation}
   1792 /// result[63:0] := __a[31:0] * __b[31:0]
   1793 /// result[127:64] := __a[95:64] * __b[95:64]
   1794 /// result[191:128] := __a[159:128] * __b[159:128]
   1795 /// result[255:192] := __a[223:192] * __b[223:192]
   1796 /// \endcode
   1797 ///
   1798 /// \headerfile <immintrin.h>
   1799 ///
   1800 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
   1801 ///
   1802 /// \param __a
   1803 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
   1804 /// \param __b
   1805 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
   1806 /// \returns A 256-bit vector of [4 x i64] containing the products.
   1807 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1808 _mm256_mul_epu32(__m256i __a, __m256i __b)
   1809 {
   1810   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
   1811 }
   1812 
   1813 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
   1814 ///    \a __b.
   1815 ///
   1816 /// \headerfile <immintrin.h>
   1817 ///
   1818 /// This intrinsic corresponds to the \c VPOR instruction.
   1819 ///
   1820 /// \param __a
   1821 ///    A 256-bit integer vector.
   1822 /// \param __b
   1823 ///    A 256-bit integer vector.
   1824 /// \returns A 256-bit integer vector containing the result.
   1825 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1826 _mm256_or_si256(__m256i __a, __m256i __b)
   1827 {
   1828   return (__m256i)((__v4du)__a | (__v4du)__b);
   1829 }
   1830 
   1831 /// Computes four sum of absolute difference (SAD) operations on sets of eight
   1832 ///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
   1833 ///    \a __b.
   1834 ///
   1835 ///    One SAD result is computed for each set of eight bytes from \a __a and
   1836 ///    eight bytes from \a __b. The zero-extended SAD value is returned in the
   1837 ///    corresponding 64-bit element of the result.
   1838 ///
   1839 ///    A single SAD operation takes the differences between the corresponding
   1840 ///    bytes of \a __a and \a __b, takes the absolute value of each difference,
   1841 ///    and sums these eight values to form one 16-bit result. This operation
   1842 ///    is repeated four times with successive sets of eight bytes.
   1843 ///
   1844 /// \code{.operation}
   1845 /// FOR i := 0 TO 3
   1846 ///   j := i*64
   1847 ///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
   1848 ///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
   1849 ///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
   1850 ///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
   1851 ///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
   1852 ///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
   1853 ///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
   1854 ///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
   1855 ///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
   1856 ///                     temp4 + temp5 + temp6 + temp7
   1857 ///   result[j+63:j+16] := 0
   1858 /// ENDFOR
   1859 /// \endcode
   1860 ///
   1861 /// \headerfile <immintrin.h>
   1862 ///
   1863 /// This intrinsic corresponds to the \c VPSADBW instruction.
   1864 ///
   1865 /// \param __a
   1866 ///    A 256-bit integer vector.
   1867 /// \param __b
   1868 ///    A 256-bit integer vector.
   1869 /// \returns A 256-bit integer vector containing the result.
   1870 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1871 _mm256_sad_epu8(__m256i __a, __m256i __b)
   1872 {
   1873   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
   1874 }
   1875 
   1876 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
   1877 ///    to control information in the 256-bit integer vector \a __b, and
   1878 ///    returns the 256-bit result. In effect there are two separate 128-bit
   1879 ///    shuffles in the lower and upper halves.
   1880 ///
   1881 /// \code{.operation}
   1882 /// FOR i := 0 TO 31
   1883 ///   j := i*8
   1884 ///   IF __b[j+7] == 1
   1885 ///     result[j+7:j] := 0
   1886 ///   ELSE
   1887 ///     k := __b[j+3:j] * 8
   1888 ///     IF i > 15
   1889 ///       k := k + 128
   1890 ///     FI
   1891 ///     result[j+7:j] := __a[k+7:k]
   1892 ///   FI
   1893 /// ENDFOR
   1894 /// \endcode
   1895 ///
   1896 /// \headerfile <immintrin.h>
   1897 ///
   1898 /// This intrinsic corresponds to the \c VPSHUFB instruction.
   1899 ///
   1900 /// \param __a
   1901 ///    A 256-bit integer vector containing source values.
   1902 /// \param __b
   1903 ///    A 256-bit integer vector containing control information to determine
   1904 ///    what goes into the corresponding byte of the result. If bit 7 of the
   1905 ///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
   1906 ///    control byte specify the index (within the same 128-bit half) of \a __a
   1907 ///    to copy to the result byte.
   1908 /// \returns A 256-bit integer vector containing the result.
   1909 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   1910 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
   1911 {
   1912   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
   1913 }
   1914 
   1915 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
   1916 ///    according to control information in the integer literal \a imm, and
   1917 ///    returns the 256-bit result. In effect there are two parallel 128-bit
   1918 ///    shuffles in the lower and upper halves.
   1919 ///
   1920 /// \code{.operation}
   1921 /// FOR i := 0 to 3
   1922 ///   j := i*32
   1923 ///   k := (imm >> i*2)[1:0] * 32
   1924 ///   result[j+31:j] := a[k+31:k]
   1925 ///   result[128+j+31:128+j] := a[128+k+31:128+k]
   1926 /// ENDFOR
   1927 /// \endcode
   1928 ///
   1929 /// \headerfile <immintrin.h>
   1930 ///
   1931 /// \code
   1932 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
   1933 /// \endcode
   1934 ///
   1935 /// This intrinsic corresponds to the \c VPSHUFD instruction.
   1936 ///
   1937 /// \param a
   1938 ///    A 256-bit vector of [8 x i32] containing source values.
   1939 /// \param imm
   1940 ///    An immediate 8-bit value specifying which elements to copy from \a a.
   1941 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
   1942 ///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
   1943 ///    forth.
   1944 /// \returns A 256-bit vector of [8 x i32] containing the result.
   #define _mm256_shuffle_epi32(a, imm)                      \
     ((__m256i)__builtin_ia32_pshufd256(                     \
         /* [8 x i32] source */ (__v8si)(__m256i)(a),        \
         /* immediate selector */ (int)(imm)))
   1947 
   1948 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
   1949 ///    according to control information in the integer literal \a imm, and
   1950 ///    returns the 256-bit result. The upper 64 bits of each 128-bit half
   1951 ///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
   1952 ///    copied from \a a unchanged.
   1953 ///
   1954 /// \code{.operation}
   1955 /// result[63:0] := a[63:0]
   1956 /// result[191:128] := a[191:128]
   1957 /// FOR i := 0 TO 3
   1958 ///   j := i * 16 + 64
   1959 ///   k := (imm >> i*2)[1:0] * 16 + 64
   1960 ///   result[j+15:j] := a[k+15:k]
   1961 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
   1962 /// ENDFOR
   1963 /// \endcode
   1964 ///
   1965 /// \headerfile <immintrin.h>
   1966 ///
   1967 /// \code
   1968 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
   1969 /// \endcode
   1970 ///
   1971 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
   1972 ///
   1973 /// \param a
   1974 ///    A 256-bit vector of [16 x i16] containing source values.
   1975 /// \param imm
   1976 ///    An immediate 8-bit value specifying which elements to copy from \a a.
   1977 ///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
   1978 ///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
   1979 ///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
   1980 /// \returns A 256-bit vector of [16 x i16] containing the result.
   #define _mm256_shufflehi_epi16(a, imm)                    \
     ((__m256i)__builtin_ia32_pshufhw256(                    \
         /* [16 x i16] source */ (__v16hi)(__m256i)(a),      \
         /* immediate selector */ (int)(imm)))
   1983 
   1984 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
   1985 ///    according to control information in the integer literal \a imm, and
   1986 ///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
   1987 ///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
   1988 ///    copied from \a a unchanged.
   1989 ///
   1990 /// \code{.operation}
   1991 /// result[127:64] := a[127:64]
   1992 /// result[255:192] := a[255:192]
   1993 /// FOR i := 0 TO 3
   1994 ///   j := i * 16
   1995 ///   k := (imm >> i*2)[1:0] * 16
   1996 ///   result[j+15:j] := a[k+15:k]
   1997 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
   1998 /// ENDFOR
   1999 /// \endcode
   2000 ///
   2001 /// \headerfile <immintrin.h>
   2002 ///
   2003 /// \code
   2004 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
   2005 /// \endcode
   2006 ///
   2007 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
   2008 ///
   2009 /// \param a
   2010 ///    A 256-bit vector of [16 x i16] to use as a source of data for the
   2011 ///    result.
   2012 /// \param imm
   2013 ///    An immediate 8-bit value specifying which elements to copy from \a a.
   2014 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
   2015 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
   2016 ///    forth.
   2017 /// \returns A 256-bit vector of [16 x i16] containing the result.
   #define _mm256_shufflelo_epi16(a, imm)                    \
     ((__m256i)__builtin_ia32_pshuflw256(                    \
         /* [16 x i16] source */ (__v16hi)(__m256i)(a),      \
         /* immediate selector */ (int)(imm)))
   2020 
   2021 /// Sets each byte of the result to the corresponding byte of the 256-bit
   2022 ///    integer vector in \a __a, the negative of that byte, or zero, depending
   2023 ///    on whether the corresponding byte of the 256-bit integer vector in
   2024 ///    \a __b is greater than zero, less than zero, or equal to zero,
   2025 ///    respectively.
   2026 ///
   2027 /// \headerfile <immintrin.h>
   2028 ///
   2029 /// This intrinsic corresponds to the \c VPSIGNB instruction.
   2030 ///
   2031 /// \param __a
   2032 ///    A 256-bit integer vector.
   2033 /// \param __b
   2034 ///    A 256-bit integer vector.
   2035 /// \returns A 256-bit integer vector containing the result.
   2036 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2037 _mm256_sign_epi8(__m256i __a, __m256i __b)
   2038 {
   2039     return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
   2040 }
   2041 
   2042 /// Sets each element of the result to the corresponding element of the
   2043 ///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
   2044 ///    or zero, depending on whether the corresponding element of the 256-bit
   2045 ///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
   2046 ///    equal to zero, respectively.
   2047 ///
   2048 /// \headerfile <immintrin.h>
   2049 ///
   2050 /// This intrinsic corresponds to the \c VPSIGNW instruction.
   2051 ///
   2052 /// \param __a
   2053 ///    A 256-bit vector of [16 x i16].
   2054 /// \param __b
   2055 ///    A 256-bit vector of [16 x i16].
   2056 /// \returns A 256-bit vector of [16 x i16] containing the result.
   2057 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2058 _mm256_sign_epi16(__m256i __a, __m256i __b)
   2059 {
   2060     return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
   2061 }
   2062 
   2063 /// Sets each element of the result to the corresponding element of the
   2064 ///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
   2065 ///    zero, depending on whether the corresponding element of the 256-bit
   2066 ///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
   2067 ///    equal to zero, respectively.
   2068 ///
   2069 /// \headerfile <immintrin.h>
   2070 ///
   2071 /// This intrinsic corresponds to the \c VPSIGND instruction.
   2072 ///
   2073 /// \param __a
   2074 ///    A 256-bit vector of [8 x i32].
   2075 /// \param __b
   2076 ///    A 256-bit vector of [8 x i32].
   2077 /// \returns A 256-bit vector of [8 x i32] containing the result.
   2078 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2079 _mm256_sign_epi32(__m256i __a, __m256i __b)
   2080 {
   2081     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
   2082 }
   2083 
   2084 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
   2085 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
   2086 ///    is greater than 15, the returned result is all zeroes.
   2087 ///
   2088 /// \headerfile <immintrin.h>
   2089 ///
   2090 /// \code
   2091 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
   2092 /// \endcode
   2093 ///
   2094 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
   2095 ///
   2096 /// \param a
   2097 ///    A 256-bit integer vector to be shifted.
   2098 /// \param imm
   2099 ///     An unsigned immediate value specifying the shift count (in bytes).
   2100 /// \returns A 256-bit integer vector containing the result.
   #define _mm256_slli_si256(a, imm)                         \
     ((__m256i)__builtin_ia32_pslldqi256_byteshift(          \
         /* vector to shift */ (__v4di)(__m256i)(a),         \
         /* byte count */ (int)(imm)))
   2103 
   2104 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
   2105 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
   2106 ///    is greater than 15, the returned result is all zeroes.
   2107 ///
   2108 /// \headerfile <immintrin.h>
   2109 ///
   2110 /// \code
   2111 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
   2112 /// \endcode
   2113 ///
   2114 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
   2115 ///
   2116 /// \param a
   2117 ///    A 256-bit integer vector to be shifted.
   2118 /// \param imm
   2119 ///    An unsigned immediate value specifying the shift count (in bytes).
   2120 /// \returns A 256-bit integer vector containing the result.
   #define _mm256_bslli_epi128(a, imm)                       \
     ((__m256i)__builtin_ia32_pslldqi256_byteshift(          \
         /* vector to shift */ (__v4di)(__m256i)(a),         \
         /* byte count */ (int)(imm)))
   2123 
   2124 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
   2125 ///    left by \a __count bits, shifting in zero bits, and returns the result.
   2126 ///    If \a __count is greater than 15, the returned result is all zeroes.
   2127 ///
   2128 /// \headerfile <immintrin.h>
   2129 ///
   2130 /// This intrinsic corresponds to the \c VPSLLW instruction.
   2131 ///
   2132 /// \param __a
   2133 ///    A 256-bit vector of [16 x i16] to be shifted.
   2134 /// \param __count
   2135 ///    An unsigned integer value specifying the shift count (in bits).
   2136 /// \returns A 256-bit vector of [16 x i16] containing the result.
   2137 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2138 _mm256_slli_epi16(__m256i __a, int __count)
   2139 {
   2140   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
   2141 }
   2142 
   2143 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
   2144 ///    left by the number of bits specified by the lower 64 bits of \a __count,
   2145 ///    shifting in zero bits, and returns the result. If \a __count is greater
   2146 ///    than 15, the returned result is all zeroes.
   2147 ///
   2148 /// \headerfile <immintrin.h>
   2149 ///
   2150 /// This intrinsic corresponds to the \c VPSLLW instruction.
   2151 ///
   2152 /// \param __a
   2153 ///    A 256-bit vector of [16 x i16] to be shifted.
   2154 /// \param __count
   2155 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2156 ///    shift count (in bits). The upper element is ignored.
   2157 /// \returns A 256-bit vector of [16 x i16] containing the result.
   2158 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2159 _mm256_sll_epi16(__m256i __a, __m128i __count)
   2160 {
   2161   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
   2162 }
   2163 
   2164 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
   2165 ///    left by \a __count bits, shifting in zero bits, and returns the result.
   2166 ///    If \a __count is greater than 31, the returned result is all zeroes.
   2167 ///
   2168 /// \headerfile <immintrin.h>
   2169 ///
   2170 /// This intrinsic corresponds to the \c VPSLLD instruction.
   2171 ///
   2172 /// \param __a
   2173 ///    A 256-bit vector of [8 x i32] to be shifted.
   2174 /// \param __count
   2175 ///    An unsigned integer value specifying the shift count (in bits).
   2176 /// \returns A 256-bit vector of [8 x i32] containing the result.
   2177 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2178 _mm256_slli_epi32(__m256i __a, int __count)
   2179 {
   2180   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
   2181 }
   2182 
   2183 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
   2184 ///    left by the number of bits given in the lower 64 bits of \a __count,
   2185 ///    shifting in zero bits, and returns the result. If that count is greater
   2186 ///    than 31, the returned result is all zeroes.
   2187 ///
   2188 /// \headerfile <immintrin.h>
   2189 ///
   2190 /// This intrinsic corresponds to the \c VPSLLD instruction.
   2191 ///
   2192 /// \param __a
   2193 ///    A 256-bit vector of [8 x i32] to be shifted.
   2194 /// \param __count
   2195 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2196 ///    shift count (in bits). The upper element is ignored.
   2197 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   2198 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2199 _mm256_sll_epi32(__m256i __a, __m128i __count)
   2200 {
   2201   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
   2202 }
   2203 
   2204 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
   2205 ///    left by \a __count bits, shifting in zero bits, and returns the result.
   2206 ///    If \a __count is greater than 63, the returned result is all zeroes.
   2207 ///
   2208 /// \headerfile <immintrin.h>
   2209 ///
   2210 /// This intrinsic corresponds to the \c VPSLLQ instruction.
   2211 ///
   2212 /// \param __a
   2213 ///    A 256-bit vector of [4 x i64] to be shifted.
   2214 /// \param __count
   2215 ///    An unsigned shift count (in bits), passed as an \c int argument.
   2216 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
   2217 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2218 _mm256_slli_epi64(__m256i __a, int __count)
   2219 {
   2220   return __builtin_ia32_psllqi256((__v4di)__a, __count);
   2221 }
   2222 
   2223 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
   2224 ///    left by the number of bits given in the lower 64 bits of \a __count,
   2225 ///    shifting in zero bits, and returns the result. If that count is greater
   2226 ///    than 63, the returned result is all zeroes.
   2227 ///
   2228 /// \headerfile <immintrin.h>
   2229 ///
   2230 /// This intrinsic corresponds to the \c VPSLLQ instruction.
   2231 ///
   2232 /// \param __a
   2233 ///    A 256-bit vector of [4 x i64] to be shifted.
   2234 /// \param __count
   2235 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2236 ///    shift count (in bits). The upper element is ignored.
   2237 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
   2238 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2239 _mm256_sll_epi64(__m256i __a, __m128i __count)
   2240 {
   2241   return __builtin_ia32_psllq256((__v4di)__a, __count);
   2242 }
   2243 
   2244 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
   2245 ///    right by \a __count bits, shifting in sign bits, and returns the result.
   2246 ///    If \a __count is greater than 15, each element of the result is either
   2247 ///    0 or -1 according to the corresponding input sign bit.
   2248 ///
   2249 /// \headerfile <immintrin.h>
   2250 ///
   2251 /// This intrinsic corresponds to the \c VPSRAW instruction.
   2252 ///
   2253 /// \param __a
   2254 ///    A 256-bit vector of [16 x i16] to be shifted.
   2255 /// \param __count
   2256 ///    An unsigned shift count (in bits), passed as an \c int argument.
   2257 /// \returns A 256-bit vector of [16 x i16] containing the shifted elements.
   2258 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2259 _mm256_srai_epi16(__m256i __a, int __count)
   2260 {
   2261   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
   2262 }
   2263 
   2264 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
   2265 ///    right by the number of bits given in the lower 64 bits of \a __count,
   2266 ///    shifting in sign bits, and returns the result. If that count is greater
   2267 ///    than 15, each element of the result is either 0 or -1 according to the
   2268 ///    corresponding input sign bit.
   2269 ///
   2270 /// \headerfile <immintrin.h>
   2271 ///
   2272 /// This intrinsic corresponds to the \c VPSRAW instruction.
   2273 ///
   2274 /// \param __a
   2275 ///    A 256-bit vector of [16 x i16] to be shifted.
   2276 /// \param __count
   2277 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2278 ///    shift count (in bits). The upper element is ignored.
   2279 /// \returns A 256-bit vector of [16 x i16] containing the shifted elements.
   2280 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2281 _mm256_sra_epi16(__m256i __a, __m128i __count)
   2282 {
   2283   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
   2284 }
   2285 
   2286 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
   2287 ///    right by \a __count bits, shifting in sign bits, and returns the result.
   2288 ///    If \a __count is greater than 31, each element of the result is either
   2289 ///    0 or -1 according to the corresponding input sign bit.
   2290 ///
   2291 /// \headerfile <immintrin.h>
   2292 ///
   2293 /// This intrinsic corresponds to the \c VPSRAD instruction.
   2294 ///
   2295 /// \param __a
   2296 ///    A 256-bit vector of [8 x i32] to be shifted.
   2297 /// \param __count
   2298 ///    An unsigned shift count (in bits), passed as an \c int argument.
   2299 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   2300 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2301 _mm256_srai_epi32(__m256i __a, int __count)
   2302 {
   2303   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
   2304 }
   2305 
   2306 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
   2307 ///    right by the number of bits given in the lower 64 bits of \a __count,
   2308 ///    shifting in sign bits, and returns the result. If that count is greater
   2309 ///    than 31, each element of the result is either 0 or -1 according to the
   2310 ///    corresponding input sign bit.
   2311 ///
   2312 /// \headerfile <immintrin.h>
   2313 ///
   2314 /// This intrinsic corresponds to the \c VPSRAD instruction.
   2315 ///
   2316 /// \param __a
   2317 ///    A 256-bit vector of [8 x i32] to be shifted.
   2318 /// \param __count
   2319 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2320 ///    shift count (in bits). The upper element is ignored.
   2321 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   2322 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2323 _mm256_sra_epi32(__m256i __a, __m128i __count)
   2324 {
   2325   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
   2326 }
   2327 
   2328 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
   2329 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
   2330 ///    \a imm is greater than 15, the returned result is all zeroes.
   2331 ///
   2332 /// \headerfile <immintrin.h>
   2333 ///
   2334 /// \code
   2335 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
   2336 /// \endcode
   2337 ///
   2338 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
   2339 ///
   2340 /// \param a
   2341 ///    A 256-bit integer vector to be shifted; the macro casts it to \c __m256i.
   2342 /// \param imm
   2343 ///    An unsigned immediate value specifying the shift count (in bytes).
   2344 /// \returns A 256-bit integer vector containing both shifted 128-bit halves.
   2345 #define _mm256_srli_si256(a, imm) \
   2346   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
   2347 
   2348 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
   2349 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
   2350 ///    \a imm is greater than 15, the returned result is all zeroes.
   2351 ///
   2352 /// \headerfile <immintrin.h>
   2353 ///
   2354 /// \code
   2355 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
   2356 /// \endcode
   2357 ///
   2358 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
   2359 ///
   2360 /// \param a
   2361 ///    A 256-bit integer vector to be shifted; the macro casts it to \c __m256i.
   2362 /// \param imm
   2363 ///    An unsigned immediate value specifying the shift count (in bytes).
   2364 /// \returns A 256-bit integer vector containing both shifted 128-bit halves.
   2365 #define _mm256_bsrli_epi128(a, imm) \
   2366   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
   2367 
   2368 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
   2369 ///    right by \a __count bits, shifting in zero bits, and returns the result.
   2370 ///    If \a __count is greater than 15, the returned result is all zeroes.
   2371 ///
   2372 /// \headerfile <immintrin.h>
   2373 ///
   2374 /// This intrinsic corresponds to the \c VPSRLW instruction.
   2375 ///
   2376 /// \param __a
   2377 ///    A 256-bit vector of [16 x i16] to be shifted.
   2378 /// \param __count
   2379 ///    An unsigned shift count (in bits), passed as an \c int argument.
   2380 /// \returns A 256-bit vector of [16 x i16] containing the shifted elements.
   2381 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2382 _mm256_srli_epi16(__m256i __a, int __count)
   2383 {
   2384   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
   2385 }
   2386 
   2387 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
   2388 ///    right by the number of bits given in the lower 64 bits of \a __count,
   2389 ///    shifting in zero bits, and returns the result. If that count is greater
   2390 ///    than 15, the returned result is all zeroes.
   2391 ///
   2392 /// \headerfile <immintrin.h>
   2393 ///
   2394 /// This intrinsic corresponds to the \c VPSRLW instruction.
   2395 ///
   2396 /// \param __a
   2397 ///    A 256-bit vector of [16 x i16] to be shifted.
   2398 /// \param __count
   2399 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2400 ///    shift count (in bits). The upper element is ignored.
   2401 /// \returns A 256-bit vector of [16 x i16] containing the shifted elements.
   2402 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2403 _mm256_srl_epi16(__m256i __a, __m128i __count)
   2404 {
   2405   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
   2406 }
   2407 
   2408 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
   2409 ///    right by \a __count bits, shifting in zero bits, and returns the result.
   2410 ///    If \a __count is greater than 31, the returned result is all zeroes.
   2411 ///
   2412 /// \headerfile <immintrin.h>
   2413 ///
   2414 /// This intrinsic corresponds to the \c VPSRLD instruction.
   2415 ///
   2416 /// \param __a
   2417 ///    A 256-bit vector of [8 x i32] to be shifted.
   2418 /// \param __count
   2419 ///    An unsigned shift count (in bits), passed as an \c int argument.
   2420 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   2421 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2422 _mm256_srli_epi32(__m256i __a, int __count)
   2423 {
   2424   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
   2425 }
   2426 
   2427 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
   2428 ///    right by the number of bits given in the lower 64 bits of \a __count,
   2429 ///    shifting in zero bits, and returns the result. If that count is greater
   2430 ///    than 31, the returned result is all zeroes.
   2431 ///
   2432 /// \headerfile <immintrin.h>
   2433 ///
   2434 /// This intrinsic corresponds to the \c VPSRLD instruction.
   2435 ///
   2436 /// \param __a
   2437 ///    A 256-bit vector of [8 x i32] to be shifted.
   2438 /// \param __count
   2439 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2440 ///    shift count (in bits). The upper element is ignored.
   2441 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   2442 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2443 _mm256_srl_epi32(__m256i __a, __m128i __count)
   2444 {
   2445   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
   2446 }
   2447 
   2448 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
   2449 ///    right by \a __count bits, shifting in zero bits, and returns the result.
   2450 ///    If \a __count is greater than 63, the returned result is all zeroes.
   2451 ///
   2452 /// \headerfile <immintrin.h>
   2453 ///
   2454 /// This intrinsic corresponds to the \c VPSRLQ instruction.
   2455 ///
   2456 /// \param __a
   2457 ///    A 256-bit vector of [4 x i64] to be shifted.
   2458 /// \param __count
   2459 ///    An unsigned shift count (in bits), passed as an \c int argument.
   2460 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
   2461 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2462 _mm256_srli_epi64(__m256i __a, int __count)
   2463 {
   2464   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
   2465 }
   2466 
   2467 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
   2468 ///    right by the number of bits given in the lower 64 bits of \a __count,
   2469 ///    shifting in zero bits, and returns the result. If that count is greater
   2470 ///    than 63, the returned result is all zeroes.
   2471 ///
   2472 /// \headerfile <immintrin.h>
   2473 ///
   2474 /// This intrinsic corresponds to the \c VPSRLQ instruction.
   2475 ///
   2476 /// \param __a
   2477 ///    A 256-bit vector of [4 x i64] to be shifted.
   2478 /// \param __count
   2479 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
   2480 ///    shift count (in bits). The upper element is ignored.
   2481 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
   2482 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2483 _mm256_srl_epi64(__m256i __a, __m128i __count)
   2484 {
   2485   return __builtin_ia32_psrlq256((__v4di)__a, __count);
   2486 }
   2487 
   2488 /// Subtracts each byte of the 256-bit integer vector \a __b from the
   2489 ///    corresponding byte of \a __a. Returns the lower 8 bits of each
   2490 ///    difference in the corresponding byte of the 256-bit integer vector
   2491 ///    result (overflow is ignored).
   2492 ///
   2493 /// \code{.operation}
   2494 /// FOR i := 0 TO 31
   2495 ///   j := i*8
   2496 ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
   2497 /// ENDFOR
   2498 /// \endcode
   2499 ///
   2500 /// \headerfile <immintrin.h>
   2501 ///
   2502 /// This intrinsic corresponds to the \c VPSUBB instruction.
   2503 ///
   2504 /// \param __a
   2505 ///    A 256-bit integer vector containing the minuends.
   2506 /// \param __b
   2507 ///    A 256-bit integer vector containing the subtrahends.
   2508 /// \returns A 256-bit integer vector containing the differences.
   2509 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2510 _mm256_sub_epi8(__m256i __a, __m256i __b)
   2511 {
   2512   return (__m256i)((__v32qu)__a - (__v32qu)__b);
   2513 }
   2514 
   2515 /// Subtracts each 16-bit element of the 256-bit vector of [16 x i16] in
   2516 ///    \a __b from the corresponding element of \a __a. Returns the lower 16
   2517 ///    bits of each difference in the corresponding element of the
   2518 ///    [16 x i16] result (overflow is ignored).
   2519 ///
   2520 /// \code{.operation}
   2521 /// FOR i := 0 TO 15
   2522 ///   j := i*16
   2523 ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
   2524 /// ENDFOR
   2525 /// \endcode
   2526 ///
   2527 /// \headerfile <immintrin.h>
   2528 ///
   2529 /// This intrinsic corresponds to the \c VPSUBW instruction.
   2530 ///
   2531 /// \param __a
   2532 ///    A 256-bit vector of [16 x i16] containing the minuends.
   2533 /// \param __b
   2534 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
   2535 /// \returns A 256-bit vector of [16 x i16] containing the differences.
   2536 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2537 _mm256_sub_epi16(__m256i __a, __m256i __b)
   2538 {
   2539   return (__m256i)((__v16hu)__a - (__v16hu)__b);
   2540 }
   2541 
   2542 /// Subtracts each 32-bit element of the 256-bit [8 x i32] vector \a __b from
   2543 ///    the corresponding element of \a __a. Returns the lower 32 bits of each
   2544 ///    difference in the matching element of the result (overflow is ignored).
   2545 ///
   2546 /// \code{.operation}
   2547 /// FOR i := 0 TO 7
   2548 ///   j := i*32
   2549 ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
   2550 /// ENDFOR
   2551 /// \endcode
   2552 ///
   2553 /// \headerfile <immintrin.h>
   2554 ///
   2555 /// This intrinsic corresponds to the \c VPSUBD instruction.
   2556 ///
   2557 /// \param __a
   2558 ///    A 256-bit vector of [8 x i32] containing the minuends.
   2559 /// \param __b
   2560 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
   2561 /// \returns A 256-bit vector of [8 x i32] containing the differences.
   2562 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2563 _mm256_sub_epi32(__m256i __a, __m256i __b)
   2564 {
   2565   return (__m256i)((__v8su)__a - (__v8su)__b);
   2566 }
   2567 
   2568 /// Subtracts each 64-bit element of the 256-bit [4 x i64] vector \a __b from
   2569 ///    the corresponding element of \a __a. Returns the lower 64 bits of each
   2570 ///    difference in the matching element of the result (overflow is ignored).
   2571 ///
   2572 /// \code{.operation}
   2573 /// FOR i := 0 TO 3
   2574 ///   j := i*64
   2575 ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
   2576 /// ENDFOR
   2577 /// \endcode
   2578 ///
   2579 /// \headerfile <immintrin.h>
   2580 ///
   2581 /// This intrinsic corresponds to the \c VPSUBQ instruction.
   2582 ///
   2583 /// \param __a
   2584 ///    A 256-bit vector of [4 x i64] containing the minuends.
   2585 /// \param __b
   2586 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
   2587 /// \returns A 256-bit vector of [4 x i64] containing the differences.
   2588 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2589 _mm256_sub_epi64(__m256i __a, __m256i __b)
   2590 {
   2591   return (__m256i)((__v4du)__a - (__v4du)__b);
   2592 }
   2593 
   2594 /// Subtracts each byte of the 256-bit integer vector \a __b from the
   2595 ///    corresponding byte of \a __a using signed saturation, and returns each
   2596 ///    difference in the corresponding byte of the 256-bit integer vector result.
   2597 ///
   2598 /// \code{.operation}
   2599 /// FOR i := 0 TO 31
   2600 ///   j := i*8
   2601 ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
   2602 /// ENDFOR
   2603 /// \endcode
   2604 ///
   2605 /// \headerfile <immintrin.h>
   2606 ///
   2607 /// This intrinsic corresponds to the \c VPSUBSB instruction.
   2608 ///
   2609 /// \param __a
   2610 ///    A 256-bit integer vector containing the minuends.
   2611 /// \param __b
   2612 ///    A 256-bit integer vector containing the subtrahends.
   2613 /// \returns A 256-bit integer vector containing the differences.
   2614 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2615 _mm256_subs_epi8(__m256i __a, __m256i __b)
   2616 {
   2617   return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
   2618 }
   2619 
   2620 /// Subtracts each 16-bit element of the 256-bit vector of [16 x i16] in
   2621 ///    \a __b from the corresponding element of \a __a using signed saturation,
   2622 ///    and returns each difference in the matching element of the result.
   2623 ///
   2624 /// \code{.operation}
   2625 /// FOR i := 0 TO 15
   2626 ///   j := i*16
   2627 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
   2628 /// ENDFOR
   2629 /// \endcode
   2630 ///
   2631 /// \headerfile <immintrin.h>
   2632 ///
   2633 /// This intrinsic corresponds to the \c VPSUBSW instruction.
   2634 ///
   2635 /// \param __a
   2636 ///    A 256-bit vector of [16 x i16] containing the minuends.
   2637 /// \param __b
   2638 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
   2639 /// \returns A 256-bit vector of [16 x i16] containing the differences.
   2640 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2641 _mm256_subs_epi16(__m256i __a, __m256i __b)
   2642 {
   2643   return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
   2644 }
   2645 
   2646 /// Subtracts each byte of the 256-bit integer vector \a __b from the
   2647 ///    corresponding byte of \a __a using unsigned saturation, and returns
   2648 ///    each difference in the corresponding byte of the 256-bit integer
   2649 ///    vector result.
   2650 ///
   2651 /// \code{.operation}
   2652 /// FOR i := 0 TO 31
   2653 ///   j := i*8
   2654 ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
   2655 /// ENDFOR
   2656 /// \endcode
   2657 ///
   2658 /// \headerfile <immintrin.h>
   2659 ///
   2660 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
   2661 ///
   2662 /// \param __a
   2663 ///    A 256-bit integer vector containing the minuends.
   2664 /// \param __b
   2665 ///    A 256-bit integer vector containing the subtrahends.
   2666 /// \returns A 256-bit integer vector containing the differences.
   2667 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2668 _mm256_subs_epu8(__m256i __a, __m256i __b)
   2669 {
   2670   return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
   2671 }
   2672 
   2673 /// Subtracts each 16-bit element of the 256-bit vector of [16 x i16] in
   2674 ///    \a __b from the corresponding element of \a __a using unsigned
   2675 ///    saturation, and returns each difference in the matching result element.
   2676 ///
   2677 /// \code{.operation}
   2678 /// FOR i := 0 TO 15
   2679 ///   j := i*16
   2680 ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
   2681 /// ENDFOR
   2682 /// \endcode
   2683 ///
   2684 /// \headerfile <immintrin.h>
   2685 ///
   2686 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
   2687 ///
   2688 /// \param __a
   2689 ///    A 256-bit vector of [16 x i16] containing the minuends.
   2690 /// \param __b
   2691 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
   2692 /// \returns A 256-bit vector of [16 x i16] containing the differences.
   2693 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2694 _mm256_subs_epu16(__m256i __a, __m256i __b)
   2695 {
   2696   return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
   2697 }
   2698 
   2699 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
   2700 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
   2701 ///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
   2702 ///    input; other bits in these parameters are ignored.
   2703 ///
   2704 /// \code{.operation}
   2705 /// result[7:0] := __a[71:64]
   2706 /// result[15:8] := __b[71:64]
   2707 /// result[23:16] := __a[79:72]
   2708 /// result[31:24] := __b[79:72]
   2709 /// . . .
   2710 /// result[127:120] := __b[127:120]
   2711 /// result[135:128] := __a[199:192]
   2712 /// . . .
   2713 /// result[255:248] := __b[255:248]
   2714 /// \endcode
   2715 ///
   2716 /// \headerfile <immintrin.h>
   2717 ///
   2718 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
   2719 ///
   2720 /// \param __a
   2721 ///    A 256-bit integer vector used as the source for the even-numbered bytes
   2722 ///    of the result.
   2723 /// \param __b
   2724 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
   2725 ///    of the result.
   2726 /// \returns A 256-bit integer vector containing the interleaved bytes.
   2727 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2728 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
   2729 {
   2730   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
   2731 }
   2732 
   2733 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
   2734 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
   2735 ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
   2736 ///    128-bit half of \a __a and \a __b as input; other bits in these
   2737 ///    parameters are ignored.
   2738 ///
   2739 /// \code{.operation}
   2740 /// result[15:0] := __a[79:64]
   2741 /// result[31:16] := __b[79:64]
   2742 /// result[47:32] := __a[95:80]
   2743 /// result[63:48] := __b[95:80]
   2744 /// . . .
   2745 /// result[127:112] := __b[127:112]
   2746 /// result[143:128] := __a[207:192]
   2747 /// . . .
   2748 /// result[255:240] := __b[255:240]
   2749 /// \endcode
   2750 ///
   2751 /// \headerfile <immintrin.h>
   2752 ///
   2753 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
   2754 ///
   2755 /// \param __a
   2756 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
   2757 ///    elements of the result.
   2758 /// \param __b
   2759 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
   2760 ///    elements of the result.
   2761 /// \returns A 256-bit vector of [16 x i16] containing the interleaved elements.
   2762 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2763 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
   2764 {
   2765   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
   2766 }
   2767 
   2768 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
   2769 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
   2770 ///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
   2771 ///    of \a __a and \a __b as input; other bits in these parameters are
   2772 ///    ignored.
   2773 ///
   2774 /// \code{.operation}
   2775 /// result[31:0] := __a[95:64]
   2776 /// result[63:32] := __b[95:64]
   2777 /// result[95:64] := __a[127:96]
   2778 /// result[127:96] := __b[127:96]
   2779 /// result[159:128] := __a[223:192]
   2780 /// result[191:160] := __b[223:192]
   2781 /// result[223:192] := __a[255:224]
   2782 /// result[255:224] := __b[255:224]
   2783 /// \endcode
   2784 ///
   2785 /// \headerfile <immintrin.h>
   2786 ///
   2787 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
   2788 ///
   2789 /// \param __a
   2790 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
   2791 ///    elements of the result.
   2792 /// \param __b
   2793 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
   2794 ///    elements of the result.
   2795 /// \returns A 256-bit vector of [8 x i32] containing the interleaved elements.
   2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2797 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
   2798 {
   2799   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
   2800 }
   2801 
   2802 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
   2803 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
   2804 ///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
   2805 ///    of \a __a and \a __b as input; other bits in these parameters are
   2806 ///    ignored.
   2807 ///
   2808 /// \code{.operation}
   2809 /// result[63:0] := __a[127:64]
   2810 /// result[127:64] := __b[127:64]
   2811 /// result[191:128] := __a[255:192]
   2812 /// result[255:192] := __b[255:192]
   2813 /// \endcode
   2814 ///
   2815 /// \headerfile <immintrin.h>
   2816 ///
   2817 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
   2818 ///
   2819 /// \param __a
   2820 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
   2821 ///    elements of the result.
   2822 /// \param __b
   2823 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
   2824 ///    elements of the result.
   2825 /// \returns A 256-bit vector of [4 x i64] containing the interleaved elements.
   2826 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2827 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
   2828 {
   2829   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
   2830 }
   2831 
   2832 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
   2833 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
   2834 ///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
   2835 ///    input; other bits in these parameters are ignored.
   2836 ///
   2837 /// \code{.operation}
   2838 /// result[7:0] := __a[7:0]
   2839 /// result[15:8] := __b[7:0]
   2840 /// result[23:16] := __a[15:8]
   2841 /// result[31:24] := __b[15:8]
   2842 /// . . .
   2843 /// result[127:120] := __b[63:56]
   2844 /// result[135:128] := __a[135:128]
   2845 /// . . .
   2846 /// result[255:248] := __b[191:184]
   2847 /// \endcode
   2848 ///
   2849 /// \headerfile <immintrin.h>
   2850 ///
   2851 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
   2852 ///
   2853 /// \param __a
   2854 ///    A 256-bit integer vector used as the source for the even-numbered bytes
   2855 ///    of the result.
   2856 /// \param __b
   2857 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
   2858 ///    of the result.
   2859 /// \returns A 256-bit integer vector containing the interleaved bytes.
   2860 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2861 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
   2862 {
   2863   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
   2864 }
   2865 
   2866 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
   2867 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
   2868 ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
   2869 ///    128-bit half of \a __a and \a __b as input; other bits in these
   2870 ///    parameters are ignored.
   2871 ///
   2872 /// \code{.operation}
   2873 /// result[15:0] := __a[15:0]
   2874 /// result[31:16] := __b[15:0]
   2875 /// result[47:32] := __a[31:16]
   2876 /// result[63:48] := __b[31:16]
   2877 /// . . .
   2878 /// result[127:112] := __b[63:48]
   2879 /// result[143:128] := __a[143:128]
   2880 /// . . .
   2881 /// result[255:240] := __b[191:176]
   2882 /// \endcode
   2883 ///
   2884 /// \headerfile <immintrin.h>
   2885 ///
   2886 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
   2887 ///
   2888 /// \param __a
   2889 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
   2890 ///    elements of the result.
   2891 /// \param __b
   2892 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
   2893 ///    elements of the result.
   2894 /// \returns A 256-bit vector of [16 x i16] containing the interleaved elements.
   2895 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2896 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
   2897 {
   2898   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
   2899 }
   2900 
   2901 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
   2902 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
   2903 ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
   2904 ///    of \a __a and \a __b as input; other bits in these parameters are
   2905 ///    ignored.
   2906 ///
   2907 /// \code{.operation}
   2908 /// result[31:0] := __a[31:0]
   2909 /// result[63:32] := __b[31:0]
   2910 /// result[95:64] := __a[63:32]
   2911 /// result[127:96] := __b[63:32]
   2912 /// result[159:128] := __a[159:128]
   2913 /// result[191:160] := __b[159:128]
   2914 /// result[223:192] := __a[191:160]
   2915 /// result[255:224] := __b[191:160]
   2916 /// \endcode
   2917 ///
   2918 /// \headerfile <immintrin.h>
   2919 ///
   2920 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
   2921 ///
   2922 /// \param __a
   2923 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
   2924 ///    elements of the result.
   2925 /// \param __b
   2926 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
   2927 ///    elements of the result.
   2928 /// \returns A 256-bit vector of [8 x i32] containing the result.
   2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2930 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
   2931 {
   2932   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
   2933 }
   2934 
   2935 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
   2936 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
   2937 ///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
   2938 ///    of \a __a and \a __b as input; other bits in these parameters are
   2939 ///    ignored.
   2940 ///
   2941 /// \code{.operation}
   2942 /// result[63:0] := __a[63:0]
   2943 /// result[127:64] := __b[63:0]
   2944 /// result[191:128] := __a[191:128]
   2945 /// result[255:192] := __b[191:128]
   2946 /// \endcode
   2947 ///
   2948 /// \headerfile <immintrin.h>
   2949 ///
   2950 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
   2951 ///
   2952 /// \param __a
   2953 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
   2954 ///    elements of the result.
   2955 /// \param __b
   2956 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
   2957 ///    elements of the result.
   2958 /// \returns A 256-bit vector of [4 x i64] containing the result.
   2959 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2960 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
   2961 {
   2962   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
   2963 }
   2964 
   2965 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
   2966 ///    \a __b.
   2967 ///
   2968 /// \headerfile <immintrin.h>
   2969 ///
   2970 /// This intrinsic corresponds to the \c VPXOR instruction.
   2971 ///
   2972 /// \param __a
   2973 ///    A 256-bit integer vector.
   2974 /// \param __b
   2975 ///    A 256-bit integer vector.
   2976 /// \returns A 256-bit integer vector containing the result.
   2977 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2978 _mm256_xor_si256(__m256i __a, __m256i __b)
   2979 {
   2980   return (__m256i)((__v4du)__a ^ (__v4du)__b);
   2981 }
   2982 
   2983 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
   2984 ///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
   2985 ///   boundary.
   2986 ///
   2987 /// \headerfile <immintrin.h>
   2988 ///
   2989 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
   2990 ///
   2991 /// \param __V
   2992 ///    A pointer to the 32-byte aligned memory containing the vector to load.
   2993 /// \returns A 256-bit integer vector loaded from memory.
   2994 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   2995 _mm256_stream_load_si256(const void *__V)
   2996 {
   2997   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   2998   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
   2999 }
   3000 
   3001 /// Broadcasts the 32-bit floating-point value from the low element of the
   3002 ///    128-bit vector of [4 x float] in \a __X to all elements of the result's
   3003 ///    128-bit vector of [4 x float].
   3004 ///
   3005 /// \headerfile <immintrin.h>
   3006 ///
   3007 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
   3008 ///
   3009 /// \param __X
   3010 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
   3011 /// \returns A 128-bit vector of [4 x float] containing the result.
   3012 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   3013 _mm_broadcastss_ps(__m128 __X)
   3014 {
   3015   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
   3016 }
   3017 
   3018 /// Broadcasts the 64-bit floating-point value from the low element of the
   3019 ///    128-bit vector of [2 x double] in \a __a to both elements of the
   3020 ///    result's 128-bit vector of [2 x double].
   3021 ///
   3022 /// \headerfile <immintrin.h>
   3023 ///
   3024 /// This intrinsic corresponds to the \c MOVDDUP instruction.
   3025 ///
   3026 /// \param __a
   3027 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
   3028 /// \returns A 128-bit vector of [2 x double] containing the result.
   3029 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   3030 _mm_broadcastsd_pd(__m128d __a)
   3031 {
   3032   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
   3033 }
   3034 
   3035 /// Broadcasts the 32-bit floating-point value from the low element of the
   3036 ///    128-bit vector of [4 x float] in \a __X to all elements of the
   3037 ///    result's 256-bit vector of [8 x float].
   3038 ///
   3039 /// \headerfile <immintrin.h>
   3040 ///
   3041 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
   3042 ///
   3043 /// \param __X
   3044 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
   3045 /// \returns A 256-bit vector of [8 x float] containing the result.
   3046 static __inline__ __m256 __DEFAULT_FN_ATTRS256
   3047 _mm256_broadcastss_ps(__m128 __X)
   3048 {
   3049   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
   3050 }
   3051 
   3052 /// Broadcasts the 64-bit floating-point value from the low element of the
   3053 ///    128-bit vector of [2 x double] in \a __X to all elements of the
   3054 ///    result's 256-bit vector of [4 x double].
   3055 ///
   3056 /// \headerfile <immintrin.h>
   3057 ///
   3058 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
   3059 ///
   3060 /// \param __X
   3061 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
   3062 /// \returns A 256-bit vector of [4 x double] containing the result.
   3063 static __inline__ __m256d __DEFAULT_FN_ATTRS256
   3064 _mm256_broadcastsd_pd(__m128d __X)
   3065 {
   3066   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
   3067 }
   3068 
   3069 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
   3070 ///    upper halves of the 256-bit result.
   3071 ///
   3072 /// \headerfile <immintrin.h>
   3073 ///
   3074 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
   3075 ///
   3076 /// \param __X
   3077 ///    A 128-bit integer vector to be broadcast.
   3078 /// \returns A 256-bit integer vector containing the result.
   3079 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3080 _mm256_broadcastsi128_si256(__m128i __X)
   3081 {
   3082   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
   3083 }
   3084 
   3085 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
   3086 
   3087 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
   3088 ///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
   3089 ///    as specified by the immediate integer operand \a M.
   3090 ///
   3091 /// \code{.operation}
   3092 /// FOR i := 0 TO 3
   3093 ///   j := i*32
   3094 ///   IF M[i] == 0
   3095 ///     result[31+j:j] := V1[31+j:j]
   3096 ///   ELSE
   3097 ///     result[31+j:j] := V2[31+j:j]
   3098 ///   FI
   3099 /// ENDFOR
   3100 /// \endcode
   3101 ///
   3102 /// \headerfile <immintrin.h>
   3103 ///
   3104 /// \code
   3105 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
   3106 /// \endcode
   3107 ///
   3108 /// This intrinsic corresponds to the \c VPBLENDD instruction.
   3109 ///
   3110 /// \param V1
   3111 ///    A 128-bit vector of [4 x i32] containing source values.
   3112 /// \param V2
   3113 ///    A 128-bit vector of [4 x i32] containing source values.
   3114 /// \param M
   3115 ///    An immediate 8-bit integer operand, with bits [3:0] specifying the
   3116 ///    source for each element of the result. The position of the mask bit
   3117 ///    corresponds to the index of a copied value. When a mask bit is 0, the
   3118 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
   3119 /// \returns A 128-bit vector of [4 x i32] containing the result.
   3120 #define _mm_blend_epi32(V1, V2, M) \
   3121   ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
   3122                                       (__v4si)(__m128i)(V2), (int)(M)))
   3123 
   3124 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
   3125 ///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
   3126 ///    as specified by the immediate integer operand \a M.
   3127 ///
   3128 /// \code{.operation}
   3129 /// FOR i := 0 TO 7
   3130 ///   j := i*32
   3131 ///   IF M[i] == 0
   3132 ///     result[31+j:j] := V1[31+j:j]
   3133 ///   ELSE
   3134 ///     result[31+j:j] := V2[31+j:j]
   3135 ///   FI
   3136 /// ENDFOR
   3137 /// \endcode
   3138 ///
   3139 /// \headerfile <immintrin.h>
   3140 ///
   3141 /// \code
   3142 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
   3143 /// \endcode
   3144 ///
   3145 /// This intrinsic corresponds to the \c VPBLENDD instruction.
   3146 ///
   3147 /// \param V1
   3148 ///    A 256-bit vector of [8 x i32] containing source values.
   3149 /// \param V2
   3150 ///    A 256-bit vector of [8 x i32] containing source values.
   3151 /// \param M
   3152 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
   3153 ///    source for each element of the result. The position of the mask bit
   3154 ///    corresponds to the index of a copied value. When a mask bit is 0, the
   3155 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
   3156 /// \returns A 256-bit vector of [8 x i32] containing the result.
   3157 #define _mm256_blend_epi32(V1, V2, M) \
   3158   ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
   3159                                       (__v8si)(__m256i)(V2), (int)(M)))
   3160 
   3161 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
   3162 ///    bytes of the 256-bit result.
   3163 ///
   3164 /// \headerfile <immintrin.h>
   3165 ///
   3166 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
   3167 ///
   3168 /// \param __X
   3169 ///    A 128-bit integer vector whose low byte will be broadcast.
   3170 /// \returns A 256-bit integer vector containing the result.
   3171 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3172 _mm256_broadcastb_epi8(__m128i __X)
   3173 {
   3174   return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
   3175 }
   3176 
   3177 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
   3178 ///    to all elements of the result's 256-bit vector of [16 x i16].
   3179 ///
   3180 /// \headerfile <immintrin.h>
   3181 ///
   3182 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
   3183 ///
   3184 /// \param __X
   3185 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
   3186 /// \returns A 256-bit vector of [16 x i16] containing the result.
   3187 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3188 _mm256_broadcastw_epi16(__m128i __X)
   3189 {
   3190   return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
   3191 }
   3192 
   3193 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
   3194 ///    to all elements of the result's 256-bit vector of [8 x i32].
   3195 ///
   3196 /// \headerfile <immintrin.h>
   3197 ///
   3198 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
   3199 ///
   3200 /// \param __X
   3201 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
   3202 /// \returns A 256-bit vector of [8 x i32] containing the result.
   3203 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3204 _mm256_broadcastd_epi32(__m128i __X)
   3205 {
   3206   return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
   3207 }
   3208 
   3209 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
   3210 ///    to all elements of the result's 256-bit vector of [4 x i64].
   3211 ///
   3212 /// \headerfile <immintrin.h>
   3213 ///
   3214 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
   3215 ///
   3216 /// \param __X
   3217 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
   3218 /// \returns A 256-bit vector of [4 x i64] containing the result.
   3219 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3220 _mm256_broadcastq_epi64(__m128i __X)
   3221 {
   3222   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
   3223 }
   3224 
   3225 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
   3226 ///    bytes of the 128-bit result.
   3227 ///
   3228 /// \headerfile <immintrin.h>
   3229 ///
   3230 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
   3231 ///
   3232 /// \param __X
   3233 ///    A 128-bit integer vector whose low byte will be broadcast.
   3234 /// \returns A 128-bit integer vector containing the result.
   3235 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3236 _mm_broadcastb_epi8(__m128i __X)
   3237 {
   3238   return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
   3239 }
   3240 
   3241 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
   3242 ///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
   3243 ///
   3244 /// \headerfile <immintrin.h>
   3245 ///
   3246 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
   3247 ///
   3248 /// \param __X
   3249 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
   3250 /// \returns A 128-bit vector of [8 x i16] containing the result.
   3251 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3252 _mm_broadcastw_epi16(__m128i __X)
   3253 {
   3254   return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
   3255 }
   3256 
   3257 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
   3258 ///    to all elements of the result's vector of [4 x i32].
   3259 ///
   3260 /// \headerfile <immintrin.h>
   3261 ///
   3262 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
   3263 ///
   3264 /// \param __X
   3265 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
   3266 /// \returns A 128-bit vector of [4 x i32] containing the result.
   3267 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3268 _mm_broadcastd_epi32(__m128i __X)
   3269 {
   3270   return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
   3271 }
   3272 
   3273 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
   3274 ///    to both elements of the result's 128-bit vector of [2 x i64].
   3275 ///
   3276 /// \headerfile <immintrin.h>
   3277 ///
   3278 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
   3279 ///
   3280 /// \param __X
   3281 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
   3282 /// \returns A 128-bit vector of [2 x i64] containing the result.
   3283 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3284 _mm_broadcastq_epi64(__m128i __X)
   3285 {
   3286   return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
   3287 }
   3288 
   3289 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
   3290 ///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
   3291 ///    elements of the 256-bit vector of [8 x i32] in \a __b.
   3292 ///
   3293 /// \code{.operation}
   3294 /// FOR i := 0 TO 7
   3295 ///   j := i*32
   3296 ///   k := __b[j+2:j] * 32
   3297 ///   result[j+31:j] := __a[k+31:k]
   3298 /// ENDFOR
   3299 /// \endcode
   3300 ///
   3301 /// \headerfile <immintrin.h>
   3302 ///
   3303 /// This intrinsic corresponds to the \c VPERMD instruction.
   3304 ///
   3305 /// \param __a
   3306 ///    A 256-bit vector of [8 x i32] containing the source values.
   3307 /// \param __b
   3308 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
   3309 ///    \a __a.
   3310 /// \returns A 256-bit vector of [8 x i32] containing the result.
   3311 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3312 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
   3313 {
   3314   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
   3315 }
   3316 
   3317 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
   3318 ///    the 256-bit vector of [4 x double] in \a V as specified by the
   3319 ///    immediate value \a M.
   3320 ///
   3321 /// \code{.operation}
   3322 /// FOR i := 0 TO 3
   3323 ///   j := i*64
   3324 ///   k := (M >> i*2)[1:0] * 64
   3325 ///   result[j+63:j] := V[k+63:k]
   3326 /// ENDFOR
   3327 /// \endcode
   3328 ///
   3329 /// \headerfile <immintrin.h>
   3330 ///
   3331 /// \code
   3332 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
   3333 /// \endcode
   3334 ///
   3335 /// This intrinsic corresponds to the \c VPERMPD instruction.
   3336 ///
   3337 /// \param V
   3338 ///    A 256-bit vector of [4 x double] containing the source values.
   3339 /// \param M
   3340 ///    An immediate 8-bit value specifying which elements to copy from \a V.
   3341 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
   3342 ///    \a M[3:2] specifies the index for element 1, and so forth.
   3343 /// \returns A 256-bit vector of [4 x double] containing the result.
   3344 #define _mm256_permute4x64_pd(V, M) \
   3345   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
   3346 
   3347 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
   3348 ///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
   3349 ///    the elements of the 256-bit vector of [8 x i32] in \a __b.
   3350 ///
   3351 /// \code{.operation}
   3352 /// FOR i := 0 TO 7
   3353 ///   j := i*32
   3354 ///   k := __b[j+2:j] * 32
   3355 ///   result[j+31:j] := __a[k+31:k]
   3356 /// ENDFOR
   3357 /// \endcode
   3358 ///
   3359 /// \headerfile <immintrin.h>
   3360 ///
   3361 /// This intrinsic corresponds to the \c VPERMPS instruction.
   3362 ///
   3363 /// \param __a
   3364 ///    A 256-bit vector of [8 x float] containing the source values.
   3365 /// \param __b
   3366 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
   3367 ///    \a __a.
   3368 /// \returns A 256-bit vector of [8 x float] containing the result.
   3369 static __inline__ __m256 __DEFAULT_FN_ATTRS256
   3370 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
   3371 {
   3372   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
   3373 }
   3374 
   3375 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
   3376 ///    of the 256-bit vector of [4 x i64] in \a V as specified by the
   3377 ///    immediate value \a M.
   3378 ///
   3379 /// \code{.operation}
   3380 /// FOR i := 0 TO 3
   3381 ///   j := i*64
   3382 ///   k := (M >> i*2)[1:0] * 64
   3383 ///   result[j+63:j] := V[k+63:k]
   3384 /// ENDFOR
   3385 /// \endcode
   3386 ///
   3387 /// \headerfile <immintrin.h>
   3388 ///
   3389 /// \code
   3390 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
   3391 /// \endcode
   3392 ///
   3393 /// This intrinsic corresponds to the \c VPERMQ instruction.
   3394 ///
   3395 /// \param V
   3396 ///    A 256-bit vector of [4 x i64] containing the source values.
   3397 /// \param M
   3398 ///    An immediate 8-bit value specifying which elements to copy from \a V.
   3399 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
   3400 ///    \a M[3:2] specifies the index for element 1, and so forth.
   3401 /// \returns A 256-bit vector of [4 x i64] containing the result.
   3402 #define _mm256_permute4x64_epi64(V, M) \
   3403   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
   3404 
   3405 /// Sets each half of the 256-bit result either to zero or to one of the
   3406 ///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
   3407 ///    as specified by the immediate value \a M.
   3408 ///
   3409 /// \code{.operation}
   3410 /// FOR i := 0 TO 1
   3411 ///   j := i*128
   3412 ///   k := M >> (i*4)
   3413 ///   IF k[3] == 0
   3414 ///     CASE (k[1:0]) OF
   3415 ///     0: result[127+j:j] := V1[127:0]
   3416 ///     1: result[127+j:j] := V1[255:128]
   3417 ///     2: result[127+j:j] := V2[127:0]
   3418 ///     3: result[127+j:j] := V2[255:128]
   3419 ///     ESAC
   3420 ///   ELSE
   3421 ///     result[127+j:j] := 0
   3422 ///   FI
   3423 /// ENDFOR
   3424 /// \endcode
   3425 ///
   3426 /// \headerfile <immintrin.h>
   3427 ///
   3428 /// \code
   3429 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
   3430 /// \endcode
   3431 ///
   3432 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
   3433 ///
   3434 /// \param V1
   3435 ///    A 256-bit integer vector containing source values.
   3436 /// \param V2
   3437 ///    A 256-bit integer vector containing source values.
   3438 /// \param M
   3439 ///    An immediate value specifying how to form the result. Bits [3:0]
   3440 ///    control the lower half of the result, bits [7:4] control the upper half.
   3441 ///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
   3442 ///    otherwise bits [1:0] determine the source as follows. \n
   3443 ///    0: the lower half of \a V1 \n
   3444 ///    1: the upper half of \a V1 \n
   3445 ///    2: the lower half of \a V2 \n
   3446 ///    3: the upper half of \a V2
   3447 /// \returns A 256-bit integer vector containing the result.
   3448 #define _mm256_permute2x128_si256(V1, V2, M) \
   3449   ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
   3450 
   3451 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
   3452 ///     of the immediate \a M is zero, extracts the lower half of \a V;
   3453 ///     otherwise, extracts the upper half.
   3454 ///
   3455 /// \headerfile <immintrin.h>
   3456 ///
   3457 /// \code
   3458 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
   3459 /// \endcode
   3460 ///
   3461 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
   3462 ///
   3463 /// \param V
   3464 ///    A 256-bit integer vector containing the source values.
   3465 /// \param M
   3466 ///    An immediate value specifying which half of \a V to extract.
   3467 /// \returns A 128-bit integer vector containing the result.
   3468 #define _mm256_extracti128_si256(V, M) \
   3469   ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
   3470 
   3471 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
   3472 ///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
   3473 ///     is zero, overwrites the lower half of the result; otherwise,
   3474 ///     overwrites the upper half.
   3475 ///
   3476 /// \headerfile <immintrin.h>
   3477 ///
   3478 /// \code
   3479 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
   3480 /// \endcode
   3481 ///
   3482 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
   3483 ///
   3484 /// \param V1
   3485 ///    A 256-bit integer vector containing a source value.
   3486 /// \param V2
   3487 ///    A 128-bit integer vector containing a source value.
   3488 /// \param M
   3489 ///    An immediate value specifying where to put \a V2 in the result.
   3490 /// \returns A 256-bit integer vector containing the result.
   3491 #define _mm256_inserti128_si256(V1, V2, M) \
   3492   ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
   3493                                          (__v2di)(__m128i)(V2), (int)(M)))
   3494 
   3495 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
   3496 ///    the most significant bit of the corresponding element in the mask
   3497 ///    \a __M is set; otherwise, sets that element of the result to zero.
   3498 ///    Returns the 256-bit [8 x i32] result.
   3499 ///
   3500 /// \code{.operation}
   3501 /// FOR i := 0 TO 7
   3502 ///   j := i*32
   3503 ///   IF __M[j+31] == 1
   3504 ///     result[j+31:j] := Load32(__X+(i*4))
   3505 ///   ELSE
   3506 ///     result[j+31:j] := 0
   3507 ///   FI
   3508 /// ENDFOR
   3509 /// \endcode
   3510 ///
   3511 /// \headerfile <immintrin.h>
   3512 ///
   3513 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
   3514 ///
   3515 /// \param __X
   3516 ///    A pointer to the memory used for loading values.
   3517 /// \param __M
   3518 ///    A 256-bit vector of [8 x i32] containing the mask bits.
   3519 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
   3520 ///    elements.
   3521 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3522 _mm256_maskload_epi32(int const *__X, __m256i __M)
   3523 {
   3524   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
   3525 }
   3526 
   3527 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
   3528 ///    the most significant bit of the corresponding element in the mask
   3529 ///    \a __M is set; otherwise, sets that element of the result to zero.
   3530 ///    Returns the 256-bit [4 x i64] result.
   3531 ///
   3532 /// \code{.operation}
   3533 /// FOR i := 0 TO 3
   3534 ///   j := i*64
   3535 ///   IF __M[j+63] == 1
   3536 ///     result[j+63:j] := Load64(__X+(i*8))
   3537 ///   ELSE
   3538 ///     result[j+63:j] := 0
   3539 ///   FI
   3540 /// ENDFOR
   3541 /// \endcode
   3542 ///
   3543 /// \headerfile <immintrin.h>
   3544 ///
   3545 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
   3546 ///
   3547 /// \param __X
   3548 ///    A pointer to the memory used for loading values.
   3549 /// \param __M
   3550 ///    A 256-bit vector of [4 x i64] containing the mask bits.
   3551 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
   3552 ///    elements.
   3553 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3554 _mm256_maskload_epi64(long long const *__X, __m256i __M)
   3555 {
   3556   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
   3557 }
   3558 
   3559 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
   3560 ///    the most significant bit of the corresponding element in the mask
   3561 ///    \a __M is set; otherwise, sets that element of the result to zero.
   3562 ///    Returns the 128-bit [4 x i32] result.
   3563 ///
   3564 /// \code{.operation}
   3565 /// FOR i := 0 TO 3
   3566 ///   j := i*32
   3567 ///   IF __M[j+31] == 1
   3568 ///     result[j+31:j] := Load32(__X+(i*4))
   3569 ///   ELSE
   3570 ///     result[j+31:j] := 0
   3571 ///   FI
   3572 /// ENDFOR
   3573 /// \endcode
   3574 ///
   3575 /// \headerfile <immintrin.h>
   3576 ///
   3577 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
   3578 ///
   3579 /// \param __X
   3580 ///    A pointer to the memory used for loading values.
   3581 /// \param __M
   3582 ///    A 128-bit vector of [4 x i32] containing the mask bits.
   3583 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
   3584 ///    elements.
   3585 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3586 _mm_maskload_epi32(int const *__X, __m128i __M)
   3587 {
   3588   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
   3589 }
   3590 
   3591 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
   3592 ///    the most significant bit of the corresponding element in the mask
   3593 ///    \a __M is set; otherwise, sets that element of the result to zero.
   3594 ///    Returns the 128-bit [2 x i64] result.
   3595 ///
   3596 /// \code{.operation}
   3597 /// FOR i := 0 TO 1
   3598 ///   j := i*64
   3599 ///   IF __M[j+63] == 1
   3600 ///     result[j+63:j] := Load64(__X+(i*8))
   3601 ///   ELSE
   3602 ///     result[j+63:j] := 0
   3603 ///   FI
   3604 /// ENDFOR
   3605 /// \endcode
   3606 ///
   3607 /// \headerfile <immintrin.h>
   3608 ///
   3609 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
   3610 ///
   3611 /// \param __X
   3612 ///    A pointer to the memory used for loading values.
   3613 /// \param __M
   3614 ///    A 128-bit vector of [2 x i64] containing the mask bits.
   3615 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
   3616 ///    elements.
   3617 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3618 _mm_maskload_epi64(long long const *__X, __m128i __M)
   3619 {
   3620   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
   3621 }
   3622 
   3623 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
   3624 ///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
   3625 ///    the corresponding element in the mask \a __M is set; otherwise, the
   3626 ///    memory element is unchanged.
   3627 ///
   3628 /// \code{.operation}
   3629 /// FOR i := 0 TO 7
   3630 ///   j := i*32
   3631 ///   IF __M[j+31] == 1
   3632 ///     Store32(__X+(i*4), __Y[j+31:j])
   3633 ///   FI
   3634 /// ENDFOR
   3635 /// \endcode
   3636 ///
   3637 /// \headerfile <immintrin.h>
   3638 ///
   3639 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
   3640 ///
   3641 /// \param __X
   3642 ///    A pointer to the memory used for storing values.
   3643 /// \param __M
   3644 ///    A 256-bit vector of [8 x i32] containing the mask bits.
   3645 /// \param __Y
   3646 ///    A 256-bit vector of [8 x i32] containing the values to store.
   3647 static __inline__ void __DEFAULT_FN_ATTRS256
   3648 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
   3649 {
   3650   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
   3651 }
   3652 
   3653 /// Conditionally stores the four 64-bit integer elements of the 256-bit
   3654 ///    vector of [4 x i64] in \a __Y to memory at \a __X. Each element is
   3655 ///    stored only if the most significant bit of the corresponding element
   3656 ///    of the mask \a __M is set; otherwise that memory element is unchanged.
   3657 ///
   3658 /// \code{.operation}
   3659 /// FOR i := 0 TO 3
   3660 ///   j := i*64
   3661 ///   IF __M[j+63] == 1
   3662 ///     Store64(__X+(i*8), __Y[j+63:j])
   3663 ///   FI
   3664 /// ENDFOR
   3665 /// \endcode
   3666 ///
   3667 /// \headerfile <immintrin.h>
   3668 ///
   3669 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
   3670 ///
   3671 /// \param __X
   3672 ///    A pointer to the memory used for storing values.
   3673 /// \param __M
   3674 ///    A 256-bit vector of [4 x i64] containing the mask bits.
   3675 /// \param __Y
   3676 ///    A 256-bit vector of [4 x i64] containing the values to store.
   3677 static __inline__ void __DEFAULT_FN_ATTRS256
   3678 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
   3679 {
   3680   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
   3681 }
   3682 
   3683 /// Conditionally stores the four 32-bit integer elements of the 128-bit
   3684 ///    vector of [4 x i32] in \a __Y to memory at \a __X. Each element is
   3685 ///    stored only if the most significant bit of the corresponding element
   3686 ///    of the mask \a __M is set; otherwise that memory element is unchanged.
   3687 ///
   3688 /// \code{.operation}
   3689 /// FOR i := 0 TO 3
   3690 ///   j := i*32
   3691 ///   IF __M[j+31] == 1
   3692 ///     Store32(__X+(i*4), __Y[j+31:j])
   3693 ///   FI
   3694 /// ENDFOR
   3695 /// \endcode
   3696 ///
   3697 /// \headerfile <immintrin.h>
   3698 ///
   3699 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
   3700 ///
   3701 /// \param __X
   3702 ///    A pointer to the memory used for storing values.
   3703 /// \param __M
   3704 ///    A 128-bit vector of [4 x i32] containing the mask bits.
   3705 /// \param __Y
   3706 ///    A 128-bit vector of [4 x i32] containing the values to store.
   3707 static __inline__ void __DEFAULT_FN_ATTRS128
   3708 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
   3709 {
   3710   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
   3711 }
   3712 
   3713 /// Conditionally stores the two 64-bit integer elements of the 128-bit
   3714 ///    vector of [2 x i64] in \a __Y to memory at \a __X. Each element is
   3715 ///    stored only if the most significant bit of the corresponding element
   3716 ///    of the mask \a __M is set; otherwise that memory element is unchanged.
   3717 ///
   3718 /// \code{.operation}
   3719 /// FOR i := 0 TO 1
   3720 ///   j := i*64
   3721 ///   IF __M[j+63] == 1
   3722 ///     Store64(__X+(i*8), __Y[j+63:j])
   3723 ///   FI
   3724 /// ENDFOR
   3725 /// \endcode
   3726 ///
   3727 /// \headerfile <immintrin.h>
   3728 ///
   3729 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
   3730 ///
   3731 /// \param __X
   3732 ///    A pointer to the memory used for storing values.
   3733 /// \param __M
   3734 ///    A 128-bit vector of [2 x i64] containing the mask bits.
   3735 /// \param __Y
   3736 ///    A 128-bit vector of [2 x i64] containing the values to store.
   3737 static __inline__ void __DEFAULT_FN_ATTRS128
   3738 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
   3739 {
   3740   __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
   3741 }
   3742 
   3743 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
   3744 ///    left by the number of bits given in the corresponding element of the
   3745 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
   3746 ///    returns the result. If the shift count for any element is greater than
   3747 ///    31, the result for that element is zero.
   3748 ///
   3749 /// \headerfile <immintrin.h>
   3750 ///
   3751 /// This intrinsic corresponds to the \c VPSLLVD instruction.
   3752 ///
   3753 /// \param __X
   3754 ///    A 256-bit vector of [8 x i32] to be shifted.
   3755 /// \param __Y
   3756 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
   3757 ///    bits).
   3758 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   3759 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3760 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
   3761 {
   3762   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
   3763 }
   3764 
   3765 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
   3766 ///    left by the number of bits given in the corresponding element of the
   3767 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
   3768 ///    returns the result. If the shift count for any element is greater than
   3769 ///    31, the result for that element is zero.
   3770 ///
   3771 /// \headerfile <immintrin.h>
   3772 ///
   3773 /// This intrinsic corresponds to the \c VPSLLVD instruction.
   3774 ///
   3775 /// \param __X
   3776 ///    A 128-bit vector of [4 x i32] to be shifted.
   3777 /// \param __Y
   3778 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
   3779 ///    bits).
   3780 /// \returns A 128-bit vector of [4 x i32] containing the shifted elements.
   3781 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3782 _mm_sllv_epi32(__m128i __X, __m128i __Y)
   3783 {
   3784   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
   3785 }
   3786 
   3787 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
   3788 ///    left by the number of bits given in the corresponding element of the
   3789 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
   3790 ///    returns the result. If the shift count for any element is greater than
   3791 ///    63, the result for that element is zero.
   3792 ///
   3793 /// \headerfile <immintrin.h>
   3794 ///
   3795 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
   3796 ///
   3797 /// \param __X
   3798 ///    A 256-bit vector of [4 x i64] to be shifted.
   3799 /// \param __Y
   3800 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
   3801 ///    bits).
   3802 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
   3803 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3804 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
   3805 {
   3806   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
   3807 }
   3808 
   3809 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
   3810 ///    left by the number of bits given in the corresponding element of the
   3811 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
   3812 ///    returns the result. If the shift count for any element is greater than
   3813 ///    63, the result for that element is zero.
   3814 ///
   3815 /// \headerfile <immintrin.h>
   3816 ///
   3817 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
   3818 ///
   3819 /// \param __X
   3820 ///    A 128-bit vector of [2 x i64] to be shifted.
   3821 /// \param __Y
   3822 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
   3823 ///    bits).
   3824 /// \returns A 128-bit vector of [2 x i64] containing the shifted elements.
   3825 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3826 _mm_sllv_epi64(__m128i __X, __m128i __Y)
   3827 {
   3828   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
   3829 }
   3830 
   3831 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
   3832 ///    right by the number of bits given in the corresponding element of the
   3833 ///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
   3834 ///    returns the result. If the shift count for any element is greater than
   3835 ///    31, the result for that element is 0 or -1 according to the sign bit
   3836 ///    for that element.
   3837 ///
   3838 /// \headerfile <immintrin.h>
   3839 ///
   3840 /// This intrinsic corresponds to the \c VPSRAVD instruction.
   3841 ///
   3842 /// \param __X
   3843 ///    A 256-bit vector of [8 x i32] to be shifted.
   3844 /// \param __Y
   3845 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
   3846 ///    bits).
   3847 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   3848 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3849 _mm256_srav_epi32(__m256i __X, __m256i __Y)
   3850 {
   3851   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
   3852 }
   3853 
   3854 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
   3855 ///    right by the number of bits given in the corresponding element of the
   3856 ///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
   3857 ///    returns the result. If the shift count for any element is greater than
   3858 ///    31, the result for that element is 0 or -1 according to the sign bit
   3859 ///    for that element.
   3860 ///
   3861 /// \headerfile <immintrin.h>
   3862 ///
   3863 /// This intrinsic corresponds to the \c VPSRAVD instruction.
   3864 ///
   3865 /// \param __X
   3866 ///    A 128-bit vector of [4 x i32] to be shifted.
   3867 /// \param __Y
   3868 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
   3869 ///    bits).
   3870 /// \returns A 128-bit vector of [4 x i32] containing the shifted elements.
   3871 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3872 _mm_srav_epi32(__m128i __X, __m128i __Y)
   3873 {
   3874   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
   3875 }
   3876 
   3877 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
   3878 ///    right by the number of bits given in the corresponding element of the
   3879 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
   3880 ///    returns the result. If the shift count for any element is greater than
   3881 ///    31, the result for that element is zero.
   3882 ///
   3883 /// \headerfile <immintrin.h>
   3884 ///
   3885 /// This intrinsic corresponds to the \c VPSRLVD instruction.
   3886 ///
   3887 /// \param __X
   3888 ///    A 256-bit vector of [8 x i32] to be shifted.
   3889 /// \param __Y
   3890 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
   3891 ///    bits).
   3892 /// \returns A 256-bit vector of [8 x i32] containing the shifted elements.
   3893 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3894 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
   3895 {
   3896   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
   3897 }
   3898 
   3899 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
   3900 ///    right by the number of bits given in the corresponding element of the
   3901 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
   3902 ///    returns the result. If the shift count for any element is greater than
   3903 ///    31, the result for that element is zero.
   3904 ///
   3905 /// \headerfile <immintrin.h>
   3906 ///
   3907 /// This intrinsic corresponds to the \c VPSRLVD instruction.
   3908 ///
   3909 /// \param __X
   3910 ///    A 128-bit vector of [4 x i32] to be shifted.
   3911 /// \param __Y
   3912 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
   3913 ///    bits).
   3914 /// \returns A 128-bit vector of [4 x i32] containing the shifted elements.
   3915 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3916 _mm_srlv_epi32(__m128i __X, __m128i __Y)
   3917 {
   3918   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
   3919 }
   3920 
   3921 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
   3922 ///    right by the number of bits given in the corresponding element of the
   3923 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
   3924 ///    returns the result. If the shift count for any element is greater than
   3925 ///    63, the result for that element is zero.
   3926 ///
   3927 /// \headerfile <immintrin.h>
   3928 ///
   3929 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
   3930 ///
   3931 /// \param __X
   3932 ///    A 256-bit vector of [4 x i64] to be shifted.
   3933 /// \param __Y
   3934 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
   3935 ///    bits).
   3936 /// \returns A 256-bit vector of [4 x i64] containing the shifted elements.
   3937 static __inline__ __m256i __DEFAULT_FN_ATTRS256
   3938 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
   3939 {
   3940   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
   3941 }
   3942 
   3943 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
   3944 ///    right by the number of bits given in the corresponding element of the
   3945 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
   3946 ///    returns the result. If the shift count for any element is greater than
   3947 ///    63, the result for that element is zero.
   3948 ///
   3949 /// \headerfile <immintrin.h>
   3950 ///
   3951 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
   3952 ///
   3953 /// \param __X
   3954 ///    A 128-bit vector of [2 x i64] to be shifted.
   3955 /// \param __Y
   3956 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
   3957 ///    bits).
   3958 /// \returns A 128-bit vector of [2 x i64] containing the shifted elements.
   3959 static __inline__ __m128i __DEFAULT_FN_ATTRS128
   3960 _mm_srlv_epi64(__m128i __X, __m128i __Y)
   3961 {
   3962   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
   3963 }
   3964 
   3965 /// Conditionally gathers two 64-bit floating-point values, either from the
   3966 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
   3967 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
   3968 ///    of [2 x double] in \a mask determines the source for each element.
   3969 ///
   3970 /// \code{.operation}
   3971 /// FOR element := 0 TO 1
   3972 ///   j := element*64
   3973 ///   k := element*32
   3974 ///   IF mask[j+63] == 0
   3975 ///     result[j+63:j] := a[j+63:j]
   3976 ///   ELSE
   3977 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   3978 ///   FI
   3979 /// ENDFOR
   3980 /// \endcode
   3981 ///
   3982 /// \headerfile <immintrin.h>
   3983 ///
   3984 /// \code
   3985 /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
   3986 ///                               __m128d mask, const int s);
   3987 /// \endcode
   3988 ///
   3989 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
   3990 ///
   3991 /// \param a
   3992 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
   3993 ///    zero.
   3994 /// \param m
   3995 ///    A pointer to the memory used for loading values.
   3996 /// \param i
   3997 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
   3998 ///    the first two elements are used.
   3999 /// \param mask
   4000 ///    A 128-bit vector of [2 x double] containing the mask. The most
   4001 ///    significant bit of each element in the mask vector is the mask bit for
   4002 ///    that element. If it is zero, the corresponding value from vector \a a
   4003 ///    is copied to the result; otherwise the value is loaded from memory.
   4004 /// \param s
   4005 ///    A literal constant scale factor for the indexes in \a i. Must be
   4006 ///    1, 2, 4, or 8.
   4007 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
   4008 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
   4009   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
   4010                                       (double const *)(m), \
   4011                                       (__v4si)(__m128i)(i), \
   4012                                       (__v2df)(__m128d)(mask), (s)))
   4013 
   4014 /// Conditionally gathers four 64-bit floating-point values, either from the
   4015 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
   4016 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
   4017 ///    of [4 x double] in \a mask determines the source for each element.
   4018 ///
   4019 /// \code{.operation}
   4020 /// FOR element := 0 TO 3
   4021 ///   j := element*64
   4022 ///   k := element*32
   4023 ///   IF mask[j+63] == 0
   4024 ///     result[j+63:j] := a[j+63:j]
   4025 ///   ELSE
   4026 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   4027 ///   FI
   4028 /// ENDFOR
   4029 /// \endcode
   4030 ///
   4031 /// \headerfile <immintrin.h>
   4032 ///
   4033 /// \code
   4034 /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
   4035 ///                                  __m256d mask, const int s);
   4036 /// \endcode
   4037 ///
   4038 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
   4039 ///
   4040 /// \param a
   4041 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
   4042 ///    zero.
   4043 /// \param m
   4044 ///    A pointer to the memory used for loading values.
   4045 /// \param i
   4046 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   4047 /// \param mask
   4048 ///    A 256-bit vector of [4 x double] containing the mask. The most
   4049 ///    significant bit of each element in the mask vector is the mask bit for
   4050 ///    that element. If it is zero, the corresponding value from vector \a a
   4051 ///    is copied to the result; otherwise the value is loaded from memory.
   4052 /// \param s
   4053 ///    A literal constant scale factor for the indexes in \a i. Must be
   4054 ///    1, 2, 4, or 8.
   4055 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
   4056 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
   4057   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
   4058                                          (double const *)(m), \
   4059                                          (__v4si)(__m128i)(i), \
   4060                                          (__v4df)(__m256d)(mask), (s)))
   4061 
   4062 /// Conditionally gathers two 64-bit floating-point values, either from the
   4063 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
   4064 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
   4065 ///    of [2 x double] in \a mask determines the source for each element.
   4066 ///
   4067 /// \code{.operation}
   4068 /// FOR element := 0 TO 1
   4069 ///   j := element*64
   4070 ///   k := element*64
   4071 ///   IF mask[j+63] == 0
   4072 ///     result[j+63:j] := a[j+63:j]
   4073 ///   ELSE
   4074 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   4075 ///   FI
   4076 /// ENDFOR
   4077 /// \endcode
   4078 ///
   4079 /// \headerfile <immintrin.h>
   4080 ///
   4081 /// \code
   4082 /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
   4083 ///                               __m128d mask, const int s);
   4084 /// \endcode
   4085 ///
   4086 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
   4087 ///
   4088 /// \param a
   4089 ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
   4090 ///    zero.
   4091 /// \param m
   4092 ///    A pointer to the memory used for loading values.
   4093 /// \param i
   4094 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   4095 /// \param mask
   4096 ///    A 128-bit vector of [2 x double] containing the mask. The most
   4097 ///    significant bit of each element in the mask vector is the mask bit for
   4098 ///    that element. If it is zero, the corresponding value from vector \a a
   4099 ///    is copied to the result; otherwise the value is loaded from memory.
   4100 /// \param s
   4101 ///    A literal constant scale factor for the indexes in \a i. Must be
   4102 ///    1, 2, 4, or 8.
   4103 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
   4104 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
   4105   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
   4106                                       (double const *)(m), \
   4107                                       (__v2di)(__m128i)(i), \
   4108                                       (__v2df)(__m128d)(mask), (s)))
   4109 
   4110 /// Conditionally gathers four 64-bit floating-point values, either from the
   4111 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
   4112 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
   4113 ///    of [4 x double] in \a mask determines the source for each element.
   4114 ///
   4115 /// \code{.operation}
   4116 /// FOR element := 0 TO 3
   4117 ///   j := element*64
   4118 ///   k := element*64
   4119 ///   IF mask[j+63] == 0
   4120 ///     result[j+63:j] := a[j+63:j]
   4121 ///   ELSE
   4122 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   4123 ///   FI
   4124 /// ENDFOR
   4125 /// \endcode
   4126 ///
   4127 /// \headerfile <immintrin.h>
   4128 ///
   4129 /// \code
   4130 /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
   4131 ///                                  __m256d mask, const int s);
   4132 /// \endcode
   4133 ///
   4134 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
   4135 ///
   4136 /// \param a
   4137 ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
   4138 ///    zero.
   4139 /// \param m
   4140 ///    A pointer to the memory used for loading values.
   4141 /// \param i
   4142 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   4143 /// \param mask
   4144 ///    A 256-bit vector of [4 x double] containing the mask. The most
   4145 ///    significant bit of each element in the mask vector is the mask bit for
   4146 ///    that element. If it is zero, the corresponding value from vector \a a
   4147 ///    is copied to the result; otherwise the value is loaded from memory.
   4148 /// \param s
   4149 ///    A literal constant scale factor for the indexes in \a i. Must be
   4150 ///    1, 2, 4, or 8.
   4151 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
   4152 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
   4153   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
   4154                                          (double const *)(m), \
   4155                                          (__v4di)(__m256i)(i), \
   4156                                          (__v4df)(__m256d)(mask), (s)))
   4157 
   4158 /// Conditionally gathers four 32-bit floating-point values, either from the
   4159 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
   4160 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
   4161 ///    of [4 x float] in \a mask determines the source for each element.
   4162 ///
   4163 /// \code{.operation}
   4164 /// FOR element := 0 TO 3
   4165 ///   j := element*32
   4166 ///   k := element*32
   4167 ///   IF mask[j+31] == 0
   4168 ///     result[j+31:j] := a[j+31:j]
   4169 ///   ELSE
   4170 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   4171 ///   FI
   4172 /// ENDFOR
   4173 /// \endcode
   4174 ///
   4175 /// \headerfile <immintrin.h>
   4176 ///
   4177 /// \code
   4178 /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
   4179 ///                              __m128 mask, const int s);
   4180 /// \endcode
   4181 ///
   4182 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
   4183 ///
   4184 /// \param a
   4185 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
   4186 ///    zero.
   4187 /// \param m
   4188 ///    A pointer to the memory used for loading values.
   4189 /// \param i
   4190 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   4191 /// \param mask
   4192 ///    A 128-bit vector of [4 x float] containing the mask. The most
   4193 ///    significant bit of each element in the mask vector is the mask bit for
   4194 ///    that element. If it is zero, the corresponding value from vector \a a
   4195 ///    is copied to the result; otherwise the value is loaded from memory.
   4196 /// \param s
   4197 ///    A literal constant scale factor for the indexes in \a i. Must be
   4198 ///    1, 2, 4, or 8.
   4199 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
   4200 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
   4201   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
   4202                                      (float const *)(m), \
   4203                                      (__v4si)(__m128i)(i), \
   4204                                      (__v4sf)(__m128)(mask), (s)))
   4205 
   4206 /// Conditionally gathers eight 32-bit floating-point values, either from the
   4207 ///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
   4208 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
   4209 ///    of [8 x float] in \a mask determines the source for each element.
   4210 ///
   4211 /// \code{.operation}
   4212 /// FOR element := 0 TO 7
   4213 ///   j := element*32
   4214 ///   k := element*32
   4215 ///   IF mask[j+31] == 0
   4216 ///     result[j+31:j] := a[j+31:j]
   4217 ///   ELSE
   4218 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   4219 ///   FI
   4220 /// ENDFOR
   4221 /// \endcode
   4222 ///
   4223 /// \headerfile <immintrin.h>
   4224 ///
   4225 /// \code
   4226 /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
   4227 ///                                 __m256 mask, const int s);
   4228 /// \endcode
   4229 ///
   4230 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
   4231 ///
   4232 /// \param a
   4233 ///    A 256-bit vector of [8 x float] used as the source when a mask bit is
   4234 ///    zero.
   4235 /// \param m
   4236 ///    A pointer to the memory used for loading values.
   4237 /// \param i
   4238 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
   4239 /// \param mask
   4240 ///    A 256-bit vector of [8 x float] containing the mask. The most
   4241 ///    significant bit of each element in the mask vector is the mask bit for
   4242 ///    that element. If it is zero, the corresponding value from vector \a a
   4243 ///    is copied to the result; otherwise the value is loaded from memory.
   4244 /// \param s
   4245 ///    A literal constant scale factor for the indexes in \a i. Must be
   4246 ///    1, 2, 4, or 8.
   4247 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
   4248 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
   4249   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
   4250                                         (float const *)(m), \
   4251                                         (__v8si)(__m256i)(i), \
   4252                                         (__v8sf)(__m256)(mask), (s)))
   4253 
   4254 /// Conditionally gathers two 32-bit floating-point values, either from the
   4255 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
   4256 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
   4257 ///    of [4 x float] in \a mask determines the source for the lower two
   4258 ///    elements. The upper two elements of the result are zeroed.
   4259 ///
   4260 /// \code{.operation}
   4261 /// FOR element := 0 TO 1
   4262 ///   j := element*32
   4263 ///   k := element*64
   4264 ///   IF mask[j+31] == 0
   4265 ///     result[j+31:j] := a[j+31:j]
   4266 ///   ELSE
   4267 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   4268 ///   FI
   4269 /// ENDFOR
   4270 /// result[127:64] := 0
   4271 /// \endcode
   4272 ///
   4273 /// \headerfile <immintrin.h>
   4274 ///
   4275 /// \code
   4276 /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
   4277 ///                              __m128 mask, const int s);
   4278 /// \endcode
   4279 ///
   4280 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
   4281 ///
   4282 /// \param a
   4283 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
   4284 ///    zero. Only the first two elements are used.
   4285 /// \param m
   4286 ///    A pointer to the memory used for loading values.
   4287 /// \param i
   4288 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   4289 /// \param mask
   4290 ///    A 128-bit vector of [4 x float] containing the mask. The most
   4291 ///    significant bit of each element in the mask vector is the mask bit for
   4292 ///    that element. If it is zero, the corresponding value from vector \a a
   4293 ///    is copied to the result; otherwise the value is loaded from memory.
   4294 ///    Only the first two elements are used.
   4295 /// \param s
   4296 ///    A literal constant scale factor for the indexes in \a i. Must be
   4297 ///    1, 2, 4, or 8.
   4298 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
   4299 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
   4300   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
   4301                                      (float const *)(m), \
   4302                                      (__v2di)(__m128i)(i), \
   4303                                      (__v4sf)(__m128)(mask), (s)))
   4304 
   4305 /// Conditionally gathers four 32-bit floating-point values, either from the
   4306 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
   4307 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
   4308 ///    of [4 x float] in \a mask determines the source for each element.
   4309 ///
   4310 /// \code{.operation}
   4311 /// FOR element := 0 TO 3
   4312 ///   j := element*32
   4313 ///   k := element*64
   4314 ///   IF mask[j+31] == 0
   4315 ///     result[j+31:j] := a[j+31:j]
   4316 ///   ELSE
   4317 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   4318 ///   FI
   4319 /// ENDFOR
   4320 /// \endcode
   4321 ///
   4322 /// \headerfile <immintrin.h>
   4323 ///
   4324 /// \code
   4325 /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
   4326 ///                                 __m128 mask, const int s);
   4327 /// \endcode
   4328 ///
   4329 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
   4330 ///
   4331 /// \param a
   4332 ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
   4333 ///    zero.
   4334 /// \param m
   4335 ///    A pointer to the memory used for loading values.
   4336 /// \param i
   4337 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   4338 /// \param mask
   4339 ///    A 128-bit vector of [4 x float] containing the mask. The most
   4340 ///    significant bit of each element in the mask vector is the mask bit for
   4341 ///    that element. If it is zero, the corresponding value from vector \a a
   4342 ///    is copied to the result; otherwise the value is loaded from memory.
   4343 /// \param s
   4344 ///    A literal constant scale factor for the indexes in \a i. Must be
   4345 ///    1, 2, 4, or 8.
   4346 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
   4347 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
   4348   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
   4349                                         (float const *)(m), \
   4350                                         (__v4di)(__m256i)(i), \
   4351                                         (__v4sf)(__m128)(mask), (s)))
   4352 
   4353 /// Conditionally gathers four 32-bit integer values, either from the
   4354 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
   4355 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
   4356 ///    of [4 x i32] in \a mask determines the source for each element.
   4357 ///
   4358 /// \code{.operation}
   4359 /// FOR element := 0 to 3
   4360 ///   j := element*32
   4361 ///   k := element*32
   4362 ///   IF mask[j+31] == 0
   4363 ///     result[j+31:j] := a[j+31:j]
   4364 ///   ELSE
   4365 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   4366 ///   FI
   4367 /// ENDFOR
   4368 /// \endcode
   4369 ///
   4370 /// \headerfile <immintrin.h>
   4371 ///
   4372 /// \code
   4373 /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
   4374 ///                                  __m128i mask, const int s);
   4375 /// \endcode
   4376 ///
   4377 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
   4378 ///
   4379 /// \param a
   4380 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
   4381 ///    zero.
   4382 /// \param m
   4383 ///    A pointer to the memory used for loading values.
   4384 /// \param i
   4385 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   4386 /// \param mask
   4387 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
   4388 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4389 ///    the corresponding element of \a a is copied to the result unchanged;
   4390 ///    otherwise the value is loaded from memory.
   4391 /// \param s
   4392 ///    A literal constant scale factor for the indexes in \a i. Must be
   4393 ///    1, 2, 4, or 8.
   4394 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
   4395 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
   4396   ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
   4397                                      (int const *)(m), \
   4398                                      (__v4si)(__m128i)(i), \
   4399                                      (__v4si)(__m128i)(mask), (s)))
   4400 
   4401 /// Conditionally gathers eight 32-bit integer values, either from the
   4402 ///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
   4403 ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
   4404 ///    of [8 x i32] in \a mask determines the source for each element.
   4405 ///
   4406 /// \code{.operation}
   4407 /// FOR element := 0 to 7
   4408 ///   j := element*32
   4409 ///   k := element*32
   4410 ///   IF mask[j+31] == 0
   4411 ///     result[j+31:j] := a[j+31:j]
   4412 ///   ELSE
   4413 ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   4414 ///   FI
   4415 /// ENDFOR
   4416 /// \endcode
   4417 ///
   4418 /// \headerfile <immintrin.h>
   4419 ///
   4420 /// \code
   4421 /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
   4422 ///                                     __m256i mask, const int s);
   4423 /// \endcode
   4424 ///
   4425 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
   4426 ///
   4427 /// \param a
   4428 ///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
   4429 ///    zero.
   4430 /// \param m
   4431 ///    A pointer to the memory used for loading values.
   4432 /// \param i
   4433 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
   4434 /// \param mask
   4435 ///    A 256-bit vector of [8 x i32] containing the mask. The most significant
   4436 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4437 ///    the corresponding element of \a a is copied to the result unchanged;
   4438 ///    otherwise the value is loaded from memory.
   4439 /// \param s
   4440 ///    A literal constant scale factor for the indexes in \a i. Must be
   4441 ///    1, 2, 4, or 8.
   4442 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
   4443 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
   4444   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
   4445                                         (int const *)(m), \
   4446                                         (__v8si)(__m256i)(i), \
   4447                                         (__v8si)(__m256i)(mask), (s)))
   4448 
   4449 /// Conditionally gathers two 32-bit integer values, either from the
   4450 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
   4451 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
   4452 ///    of [4 x i32] in \a mask determines the source for the lower two
   4453 ///    elements. The upper two elements of the result are zeroed.
   4454 ///
   4455 /// \code{.operation}
   4456 /// FOR element := 0 to 1
   4457 ///   j := element*32
   4458 ///   k := element*64
   4459 ///   IF mask[j+31] == 0
   4460 ///     result[j+31:j] := a[j+31:j]
   4461 ///   ELSE
   4462 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   4463 ///   FI
   4464 /// ENDFOR
   4465 /// result[127:64] := 0
   4466 /// \endcode
   4467 ///
   4468 /// \headerfile <immintrin.h>
   4469 ///
   4470 /// \code
   4471 /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
   4472 ///                                  __m128i mask, const int s);
   4473 /// \endcode
   4474 ///
   4475 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
   4476 ///
   4477 /// \param a
   4478 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
   4479 ///    zero. Only the first two elements are used.
   4480 /// \param m
   4481 ///    A pointer to the memory used for loading values.
   4482 /// \param i
   4483 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   4484 /// \param mask
   4485 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
   4486 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4487 ///    the corresponding element of \a a is copied to the result unchanged;
   4488 ///    otherwise the value is loaded from memory. Only the first two elements
   4489 ///    are used.
   4490 /// \param s
   4491 ///    A literal constant scale factor for the indexes in \a i. Must be
   4492 ///    1, 2, 4, or 8.
   4493 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
   4494 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
   4495   ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
   4496                                      (int const *)(m), \
   4497                                      (__v2di)(__m128i)(i), \
   4498                                      (__v4si)(__m128i)(mask), (s)))
   4499 
   4500 /// Conditionally gathers four 32-bit integer values, either from the
   4501 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
   4502 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
   4503 ///    of [4 x i32] in \a mask determines the source for each element.
   4504 ///
   4505 /// \code{.operation}
   4506 /// FOR element := 0 to 3
   4507 ///   j := element*32
   4508 ///   k := element*64
   4509 ///   IF mask[j+31] == 0
   4510 ///     result[j+31:j] := a[j+31:j]
   4511 ///   ELSE
   4512 ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   4513 ///   FI
   4514 /// ENDFOR
   4515 /// \endcode
   4516 ///
   4517 /// \headerfile <immintrin.h>
   4518 ///
   4519 /// \code
   4520 /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
   4521 ///                                     __m128i mask, const int s);
   4522 /// \endcode
   4523 ///
   4524 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
   4525 ///
   4526 /// \param a
   4527 ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
   4528 ///    zero.
   4529 /// \param m
   4530 ///    A pointer to the memory used for loading values.
   4531 /// \param i
   4532 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   4533 /// \param mask
   4534 ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
   4535 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4536 ///    the corresponding element of \a a is copied to the result unchanged;
   4537 ///    otherwise the value is loaded from memory.
   4538 /// \param s
   4539 ///    A literal constant scale factor for the indexes in \a i. Must be
   4540 ///    1, 2, 4, or 8.
   4541 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
   4542 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
   4543   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
   4544                                         (int const *)(m), \
   4545                                         (__v4di)(__m256i)(i), \
   4546                                         (__v4si)(__m128i)(mask), (s)))
   4547 
   4548 /// Conditionally gathers two 64-bit integer values, either from the
   4549 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
   4550 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
   4551 ///    of [2 x i64] in \a mask determines the source for each element.
   4552 ///
   4553 /// \code{.operation}
   4554 /// FOR element := 0 to 1
   4555 ///   j := element*64
   4556 ///   k := element*32
   4557 ///   IF mask[j+63] == 0
   4558 ///     result[j+63:j] := a[j+63:j]
   4559 ///   ELSE
   4560 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   4561 ///   FI
   4562 /// ENDFOR
   4563 /// \endcode
   4564 ///
   4565 /// \headerfile <immintrin.h>
   4566 ///
   4567 /// \code
   4568 /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
   4569 ///                                  __m128i mask, const int s);
   4570 /// \endcode
   4571 ///
   4572 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
   4573 ///
   4574 /// \param a
   4575 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
   4576 ///    zero.
   4577 /// \param m
   4578 ///    A pointer to the memory used for loading values.
   4579 /// \param i
   4580 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
   4581 ///    the first two elements are used.
   4582 /// \param mask
   4583 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
   4584 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4585 ///    the corresponding element of \a a is copied to the result unchanged;
   4586 ///    otherwise the value is loaded from memory.
   4587 /// \param s
   4588 ///    A literal constant scale factor for the indexes in \a i. Must be
   4589 ///    1, 2, 4, or 8.
   4590 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
   4591 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
   4592   ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
   4593                                      (long long const *)(m), \
   4594                                      (__v4si)(__m128i)(i), \
   4595                                      (__v2di)(__m128i)(mask), (s)))
   4596 
   4597 /// Conditionally gathers four 64-bit integer values, either from the
   4598 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
   4599 ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
   4600 ///    of [4 x i64] in \a mask determines the source for each element.
   4601 ///
   4602 /// \code{.operation}
   4603 /// FOR element := 0 to 3
   4604 ///   j := element*64
   4605 ///   k := element*32
   4606 ///   IF mask[j+63] == 0
   4607 ///     result[j+63:j] := a[j+63:j]
   4608 ///   ELSE
   4609 ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   4610 ///   FI
   4611 /// ENDFOR
   4612 /// \endcode
   4613 ///
   4614 /// \headerfile <immintrin.h>
   4615 ///
   4616 /// \code
   4617 /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
   4618 ///                                     __m128i i, __m256i mask, const int s);
   4619 /// \endcode
   4620 ///
   4621 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
   4622 ///
   4623 /// \param a
   4624 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
   4625 ///    zero.
   4626 /// \param m
   4627 ///    A pointer to the memory used for loading values.
   4628 /// \param i
   4629 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   4630 /// \param mask
   4631 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
   4632 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4633 ///    the corresponding element of \a a is copied to the result unchanged;
   4634 ///    otherwise the value is loaded from memory.
   4635 /// \param s
   4636 ///    A literal constant scale factor for the indexes in \a i. Must be
   4637 ///    1, 2, 4, or 8.
   4638 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
   4639 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
   4640   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
   4641                                         (long long const *)(m), \
   4642                                         (__v4si)(__m128i)(i), \
   4643                                         (__v4di)(__m256i)(mask), (s)))
   4644 
   4645 /// Conditionally gathers two 64-bit integer values, either from the
   4646 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
   4647 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
   4648 ///    of [2 x i64] in \a mask determines the source for each element.
   4649 ///
   4650 /// \code{.operation}
   4651 /// FOR element := 0 to 1
   4652 ///   j := element*64
   4653 ///   k := element*64
   4654 ///   IF mask[j+63] == 0
   4655 ///     result[j+63:j] := a[j+63:j]
   4656 ///   ELSE
   4657 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   4658 ///   FI
   4659 /// ENDFOR
   4660 /// \endcode
   4661 ///
   4662 /// \headerfile <immintrin.h>
   4663 ///
   4664 /// \code
   4665 /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
   4666 ///                                  __m128i mask, const int s);
   4667 /// \endcode
   4668 ///
   4669 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
   4670 ///
   4671 /// \param a
   4672 ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
   4673 ///    zero.
   4674 /// \param m
   4675 ///    A pointer to the memory used for loading values.
   4676 /// \param i
   4677 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   4678 /// \param mask
   4679 ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
   4680 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4681 ///    the corresponding element of \a a is copied to the result unchanged;
   4682 ///    otherwise the value is loaded from memory.
   4683 /// \param s
   4684 ///    A literal constant scale factor for the indexes in \a i. Must be
   4685 ///    1, 2, 4, or 8.
   4686 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
   4687 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
   4688   ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
   4689                                      (long long const *)(m), \
   4690                                      (__v2di)(__m128i)(i), \
   4691                                      (__v2di)(__m128i)(mask), (s)))
   4692 
   4693 /// Conditionally gathers four 64-bit integer values, either from the
   4694 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
   4695 ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
   4696 ///    of [4 x i64] in \a mask determines the source for each element.
   4697 ///
   4698 /// \code{.operation}
   4699 /// FOR element := 0 to 3
   4700 ///   j := element*64
   4701 ///   k := element*64
   4702 ///   IF mask[j+63] == 0
   4703 ///     result[j+63:j] := a[j+63:j]
   4704 ///   ELSE
   4705 ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   4706 ///   FI
   4707 /// ENDFOR
   4708 /// \endcode
   4709 ///
   4710 /// \headerfile <immintrin.h>
   4711 ///
   4712 /// \code
   4713 /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
   4714 ///                                     __m256i i, __m256i mask, const int s);
   4715 /// \endcode
   4716 ///
   4717 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
   4718 ///
   4719 /// \param a
   4720 ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
   4721 ///    zero.
   4722 /// \param m
   4723 ///    A pointer to the memory used for loading values.
   4724 /// \param i
   4725 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   4726 /// \param mask
   4727 ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
   4728 ///    bit of each element is that element's mask bit. If a mask bit is zero,
   4729 ///    the corresponding element of \a a is copied to the result unchanged;
   4730 ///    otherwise the value is loaded from memory.
   4731 /// \param s
   4732 ///    A literal constant scale factor for the indexes in \a i. Must be
   4733 ///    1, 2, 4, or 8.
   4734 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
   4735 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
   4736   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
   4737                                         (long long const *)(m), \
   4738                                         (__v4di)(__m256i)(i), \
   4739                                         (__v4di)(__m256i)(mask), (s)))
   4740 
   4741 /// Gathers two 64-bit floating-point values from memory \a m using scaled
   4742 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
   4743 ///
   4744 /// \code{.operation}
   4745 /// FOR element := 0 to 1
   4746 ///   j := element*64
   4747 ///   k := element*32
   4748 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   4749 /// ENDFOR
   4750 /// \endcode
   4751 ///
   4752 /// \headerfile <immintrin.h>
   4753 ///
   4754 /// \code
   4755 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
   4756 /// \endcode
   4757 ///
   4758 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
   4759 ///
   4760 /// \param m
   4761 ///    A pointer to the memory used for loading values.
   4762 /// \param i
   4763 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
   4764 ///    the first two elements are used.
   4765 /// \param s
   4766 ///    A literal constant scale factor for the indexes in \a i. Must be
   4767 ///    1, 2, 4, or 8.
   4768 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
   4769 #define _mm_i32gather_pd(m, i, s) /* all-ones mask: unconditional gather */ \
   4770   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
   4771                                       (double const *)(m), \
   4772                                       (__v4si)(__m128i)(i), \
   4773                                       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
   4774                                                            _mm_setzero_pd()), \
   4775                                       (s)))
   4776 
   4777 /// Gathers four 64-bit floating-point values from memory \a m using scaled
   4778 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
   4779 ///
   4780 /// \code{.operation}
   4781 /// FOR element := 0 to 3
   4782 ///   j := element*64
   4783 ///   k := element*32
   4784 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   4785 /// ENDFOR
   4786 /// \endcode
   4787 ///
   4788 /// \headerfile <immintrin.h>
   4789 ///
   4790 /// \code
   4791 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
   4792 /// \endcode
   4793 ///
   4794 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
   4795 ///
   4796 /// \param m
   4797 ///    A pointer to the memory used for loading values.
   4798 /// \param i
   4799 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   4800 /// \param s
   4801 ///    A literal constant scale factor for the indexes in \a i. Must be
   4802 ///    1, 2, 4, or 8.
   4803 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
   4804 #define _mm256_i32gather_pd(m, i, s) /* all-ones mask: unconditional gather */ \
   4805   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
   4806                                          (double const *)(m), \
   4807                                          (__v4si)(__m128i)(i), \
   4808                                          (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
   4809                                                                _mm256_setzero_pd(), \
   4810                                                                _CMP_EQ_OQ), \
   4811                                          (s)))
   4812 
   4813 /// Gathers two 64-bit floating-point values from memory \a m using scaled
   4814 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
   4815 ///
   4816 /// \code{.operation}
   4817 /// FOR element := 0 to 1
   4818 ///   j := element*64
   4819 ///   k := element*64
   4820 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   4821 /// ENDFOR
   4822 /// \endcode
   4823 ///
   4824 /// \headerfile <immintrin.h>
   4825 ///
   4826 /// \code
   4827 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
   4828 /// \endcode
   4829 ///
   4830 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
   4831 ///
   4832 /// \param m
   4833 ///    A pointer to the memory used for loading values.
   4834 /// \param i
   4835 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   4836 /// \param s
   4837 ///    A literal constant scale factor for the indexes in \a i. Must be
   4838 ///    1, 2, 4, or 8.
   4839 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
   4840 #define _mm_i64gather_pd(m, i, s) /* all-ones mask: unconditional gather */ \
   4841   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
   4842                                       (double const *)(m), \
   4843                                       (__v2di)(__m128i)(i), \
   4844                                       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
   4845                                                            _mm_setzero_pd()), \
   4846                                       (s)))
   4847 
   4848 /// Gathers four 64-bit floating-point values from memory \a m using scaled
   4849 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
   4850 ///
   4851 /// \code{.operation}
   4852 /// FOR element := 0 to 3
   4853 ///   j := element*64
   4854 ///   k := element*64
   4855 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   4856 /// ENDFOR
   4857 /// \endcode
   4858 ///
   4859 /// \headerfile <immintrin.h>
   4860 ///
   4861 /// \code
   4862 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
   4863 /// \endcode
   4864 ///
   4865 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
   4866 ///
   4867 /// \param m
   4868 ///    A pointer to the memory used for loading values.
   4869 /// \param i
   4870 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   4871 /// \param s
   4872 ///    A literal constant scale factor for the indexes in \a i. Must be
   4873 ///    1, 2, 4, or 8.
   4874 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
   4875 #define _mm256_i64gather_pd(m, i, s) /* all-ones mask: unconditional gather */ \
   4876   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
   4877                                          (double const *)(m), \
   4878                                          (__v4di)(__m256i)(i), \
   4879                                          (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
   4880                                                                _mm256_setzero_pd(), \
   4881                                                                _CMP_EQ_OQ), \
   4882                                          (s)))
   4883 
   4884 /// Gathers four 32-bit floating-point values from memory \a m using scaled
   4885 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
   4886 ///
   4887 /// \code{.operation}
   4888 /// FOR element := 0 to 3
   4889 ///   j := element*32
   4890 ///   k := element*32
   4891 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   4892 /// ENDFOR
   4893 /// \endcode
   4894 ///
   4895 /// \headerfile <immintrin.h>
   4896 ///
   4897 /// \code
   4898 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
   4899 /// \endcode
   4900 ///
   4901 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
   4902 ///
   4903 /// \param m
   4904 ///    A pointer to the memory used for loading values.
   4905 /// \param i
   4906 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   4907 /// \param s
   4908 ///    A literal constant scale factor for the indexes in \a i. Must be
   4909 ///    1, 2, 4, or 8.
   4910 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
   4911 #define _mm_i32gather_ps(m, i, s) /* all-ones mask: unconditional gather */ \
   4912   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
   4913                                      (float const *)(m), \
   4914                                      (__v4si)(__m128i)(i), \
   4915                                      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
   4916                                                           _mm_setzero_ps()), \
   4917                                      (s)))
   4918 
   4919 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
   4920 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
   4921 ///
   4922 /// \code{.operation}
   4923 /// FOR element := 0 to 7
   4924 ///   j := element*32
   4925 ///   k := element*32
   4926 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   4927 /// ENDFOR
   4928 /// \endcode
   4929 ///
   4930 /// \headerfile <immintrin.h>
   4931 ///
   4932 /// \code
   4933 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
   4934 /// \endcode
   4935 ///
   4936 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
   4937 ///
   4938 /// \param m
   4939 ///    A pointer to the memory used for loading values.
   4940 /// \param i
   4941 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
   4942 /// \param s
   4943 ///    A literal constant scale factor for the indexes in \a i. Must be
   4944 ///    1, 2, 4, or 8.
   4945 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
   4946 #define _mm256_i32gather_ps(m, i, s) /* all-ones mask: unconditional gather */ \
   4947   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
   4948                                         (float const *)(m), \
   4949                                         (__v8si)(__m256i)(i), \
   4950                                         (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
   4951                                                               _mm256_setzero_ps(), \
   4952                                                               _CMP_EQ_OQ), \
   4953                                         (s)))
   4954 
   4955 /// Gathers two 32-bit floating-point values from memory \a m using scaled
   4956 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
   4957 ///    elements of the result are zeroed.
   4958 ///
   4959 /// \code{.operation}
   4960 /// FOR element := 0 to 1
   4961 ///   j := element*32
   4962 ///   k := element*64
   4963 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   4964 /// ENDFOR
   4965 /// result[127:64] := 0
   4966 /// \endcode
   4967 ///
   4968 /// \headerfile <immintrin.h>
   4969 ///
   4970 /// \code
   4971 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
   4972 /// \endcode
   4973 ///
   4974 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
   4975 ///
   4976 /// \param m
   4977 ///    A pointer to the memory used for loading values.
   4978 /// \param i
   4979 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   4980 /// \param s
   4981 ///    A literal constant scale factor for the indexes in \a i. Must be
   4982 ///    1, 2, 4, or 8.
   4983 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
   4984 #define _mm_i64gather_ps(m, i, s) /* all-ones mask: unconditional gather */ \
   4985   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
   4986                                      (float const *)(m), \
   4987                                      (__v2di)(__m128i)(i), \
   4988                                      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
   4989                                                           _mm_setzero_ps()), \
   4990                                      (s)))
   4991 
   4992 /// Gathers four 32-bit floating-point values from memory \a m using scaled
   4993 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
   4994 ///
   4995 /// \code{.operation}
   4996 /// FOR element := 0 to 3
   4997 ///   j := element*32
   4998 ///   k := element*64
   4999 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   5000 /// ENDFOR
   5001 /// \endcode
   5002 ///
   5003 /// \headerfile <immintrin.h>
   5004 ///
   5005 /// \code
   5006 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
   5007 /// \endcode
   5008 ///
   5009 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
   5010 ///
   5011 /// \param m
   5012 ///    A pointer to the memory used for loading values.
   5013 /// \param i
   5014 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   5015 /// \param s
   5016 ///    A literal constant scale factor for the indexes in \a i. Must be
   5017 ///    1, 2, 4, or 8.
   5018 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
   5019 #define _mm256_i64gather_ps(m, i, s) /* all-ones mask: unconditional gather */ \
   5020   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
   5021                                         (float const *)(m), \
   5022                                         (__v4di)(__m256i)(i), \
   5023                                         (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
   5024                                                              _mm_setzero_ps()), \
   5025                                         (s)))
   5026 
   5027 /// Gathers four 32-bit integer values from memory \a m using scaled
   5028 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
   5029 ///
   5030 /// \code{.operation}
   5031 /// FOR element := 0 to 3
   5032 ///   j := element*32
   5033 ///   k := element*32
   5034 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   5035 /// ENDFOR
   5036 /// \endcode
   5037 ///
   5038 /// \headerfile <immintrin.h>
   5039 ///
   5040 /// \code
   5041 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
   5042 /// \endcode
   5043 ///
   5044 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
   5045 ///
   5046 /// \param m
   5047 ///    A pointer to the memory used for loading values.
   5048 /// \param i
   5049 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   5050 /// \param s
   5051 ///    A literal constant scale factor for the indexes in \a i. Must be
   5052 ///    1, 2, 4, or 8.
   5053 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
   5054 #define _mm_i32gather_epi32(m, i, s) /* all-ones mask: unconditional gather */ \
   5055   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
   5056                                      (int const *)(m), (__v4si)(__m128i)(i), \
   5057                                      (__v4si)_mm_set1_epi32(-1), (s)))
   5058 
   5059 /// Gathers eight 32-bit integer values from memory \a m using scaled
   5060 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
   5061 ///
   5062 /// \code{.operation}
   5063 /// FOR element := 0 to 7
   5064 ///   j := element*32
   5065 ///   k := element*32
   5066 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
   5067 /// ENDFOR
   5068 /// \endcode
   5069 ///
   5070 /// \headerfile <immintrin.h>
   5071 ///
   5072 /// \code
   5073 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
   5074 /// \endcode
   5075 ///
   5076 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
   5077 ///
   5078 /// \param m
   5079 ///    A pointer to the memory used for loading values.
   5080 /// \param i
   5081 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
   5082 /// \param s
   5083 ///    A literal constant scale factor for the indexes in \a i. Must be
   5084 ///    1, 2, 4, or 8.
   5085 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
   5086 #define _mm256_i32gather_epi32(m, i, s) \
   5087   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
   5088                                         (int const *)(m), (__v8si)(__m256i)(i), \
   5089                                         (__v8si)_mm256_set1_epi32(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5090 
   5091 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
   5092 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
   5093 ///    of the result are zeroed.
   5094 ///
   5095 /// \code{.operation}
   5096 /// FOR element := 0 to 1
   5097 ///   j := element*32
   5098 ///   k := element*64
   5099 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   5100 /// ENDFOR
   5101 /// result[127:64] := 0
   5102 /// \endcode
   5103 ///
   5104 /// \headerfile <immintrin.h>
   5105 ///
   5106 /// \code
   5107 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
   5108 /// \endcode
   5109 ///
   5110 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
   5111 ///
   5112 /// \param m
   5113 ///    A pointer to the memory used for loading values.
   5114 /// \param i
   5115 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   5116 /// \param s
   5117 ///    A literal constant scale factor for the indexes in \a i. Must be
   5118 ///    1, 2, 4, or 8.
   5119 /// \returns A 128-bit vector of [4 x i32]; the lower two elements hold the gathered values and the upper two are zero.
   5120 #define _mm_i64gather_epi32(m, i, s) \
   5121   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
   5122                                      (int const *)(m), (__v2di)(__m128i)(i), \
   5123                                      (__v4si)_mm_set1_epi32(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5124 
   5125 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
   5126 ///    from the 256-bit vector of [4 x i64] in \a i.
   5127 ///
   5128 /// \code{.operation}
   5129 /// FOR element := 0 to 3
   5130 ///   j := element*32
   5131 ///   k := element*64
   5132 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
   5133 /// ENDFOR
   5134 /// \endcode
   5135 ///
   5136 /// \headerfile <immintrin.h>
   5137 ///
   5138 /// \code
   5139 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
   5140 /// \endcode
   5141 ///
   5142 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
   5143 ///
   5144 /// \param m
   5145 ///    A pointer to the memory used for loading values.
   5146 /// \param i
   5147 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   5148 /// \param s
   5149 ///    A literal constant scale factor for the indexes in \a i. Must be
   5150 ///    1, 2, 4, or 8.
   5151 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
   5152 #define _mm256_i64gather_epi32(m, i, s) \
   5153   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
   5154                                         (int const *)(m), (__v4di)(__m256i)(i), \
   5155                                         (__v4si)_mm_set1_epi32(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5156 
   5157 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
   5158 ///    from the 128-bit vector of [4 x i32] in \a i.
   5159 ///
   5160 /// \code{.operation}
   5161 /// FOR element := 0 to 1
   5162 ///   j := element*64
   5163 ///   k := element*32
   5164 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   5165 /// ENDFOR
   5166 /// \endcode
   5167 ///
   5168 /// \headerfile <immintrin.h>
   5169 ///
   5170 /// \code
   5171 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
   5172 /// \endcode
   5173 ///
   5174 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
   5175 ///
   5176 /// \param m
   5177 ///    A pointer to the memory used for loading values.
   5178 /// \param i
   5179 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
   5180 ///    the first two elements are used.
   5181 /// \param s
   5182 ///    A literal constant scale factor for the indexes in \a i. Must be
   5183 ///    1, 2, 4, or 8.
   5184 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
   5185 #define _mm_i32gather_epi64(m, i, s) \
   5186   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
   5187                                      (long long const *)(m), \
   5188                                      (__v4si)(__m128i)(i), \
   5189                                      (__v2di)_mm_set1_epi64x(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5190 
   5191 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
   5192 ///    from the 128-bit vector of [4 x i32] in \a i.
   5193 ///
   5194 /// \code{.operation}
   5195 /// FOR element := 0 to 3
   5196 ///   j := element*64
   5197 ///   k := element*32
   5198 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
   5199 /// ENDFOR
   5200 /// \endcode
   5201 ///
   5202 /// \headerfile <immintrin.h>
   5203 ///
   5204 /// \code
   5205 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
   5206 /// \endcode
   5207 ///
   5208 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
   5209 ///
   5210 /// \param m
   5211 ///    A pointer to the memory used for loading values.
   5212 /// \param i
   5213 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
   5214 /// \param s
   5215 ///    A literal constant scale factor for the indexes in \a i. Must be
   5216 ///    1, 2, 4, or 8.
   5217 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
   5218 #define _mm256_i32gather_epi64(m, i, s) \
   5219   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
   5220                                         (long long const *)(m), \
   5221                                         (__v4si)(__m128i)(i), \
   5222                                         (__v4di)_mm256_set1_epi64x(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5223 
   5224 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
   5225 ///    from the 128-bit vector of [2 x i64] in \a i.
   5226 ///
   5227 /// \code{.operation}
   5228 /// FOR element := 0 to 1
   5229 ///   j := element*64
   5230 ///   k := element*64
   5231 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   5232 /// ENDFOR
   5233 /// \endcode
   5234 ///
   5235 /// \headerfile <immintrin.h>
   5236 ///
   5237 /// \code
   5238 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
   5239 /// \endcode
   5240 ///
   5241 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
   5242 ///
   5243 /// \param m
   5244 ///    A pointer to the memory used for loading values.
   5245 /// \param i
   5246 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
   5247 /// \param s
   5248 ///    A literal constant scale factor for the indexes in \a i. Must be
   5249 ///    1, 2, 4, or 8.
   5250 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
   5251 #define _mm_i64gather_epi64(m, i, s) \
   5252   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
   5253                                      (long long const *)(m), \
   5254                                      (__v2di)(__m128i)(i), \
   5255                                      (__v2di)_mm_set1_epi64x(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5256 
   5257 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
   5258 ///    from the 256-bit vector of [4 x i64] in \a i.
   5259 ///
   5260 /// \code{.operation}
   5261 /// FOR element := 0 to 3
   5262 ///   j := element*64
   5263 ///   k := element*64
   5264 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
   5265 /// ENDFOR
   5266 /// \endcode
   5267 ///
   5268 /// \headerfile <immintrin.h>
   5269 ///
   5270 /// \code
   5271 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
   5272 /// \endcode
   5273 ///
   5274 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
   5275 ///
   5276 /// \param m
   5277 ///    A pointer to the memory used for loading values.
   5278 /// \param i
   5279 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
   5280 /// \param s
   5281 ///    A literal constant scale factor for the indexes in \a i. Must be
   5282 ///    1, 2, 4, or 8.
   5283 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
   5284 #define _mm256_i64gather_epi64(m, i, s) \
   5285   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
   5286                                         (long long const *)(m), \
   5287                                         (__v4di)(__m256i)(i), \
   5288                                         (__v4di)_mm256_set1_epi64x(-1), (s))) /* all bits set; presumably the mask operand, i.e. no element is skipped */
   5289 
   5290 #undef __DEFAULT_FN_ATTRS256 /* attribute helper defined at the top of this header */
   5291 #undef __DEFAULT_FN_ATTRS128 /* likewise; removed so it is not visible after this header */
   5292 
   5293 #endif /* __AVX2INTRIN_H */