zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avxintrin.h (201849B) - Raw


      1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
     12 #endif
     13 
     14 #ifndef __AVXINTRIN_H
     15 #define __AVXINTRIN_H
     16 
     17 typedef double __v4df __attribute__ ((__vector_size__ (32)));
     18 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
     19 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
     20 typedef int __v8si __attribute__ ((__vector_size__ (32)));
     21 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
     22 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
     23 
     24 /* Unsigned types */
     25 typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
     26 typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
     27 typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
     28 typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
     29 
     30 /* We need an explicitly signed variant for char. Note that this shouldn't
     31  * appear in the interface though. */
     32 typedef signed char __v32qs __attribute__((__vector_size__(32)));
     33 
     34 typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
     35 typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
     36 typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
     37 
     38 typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
     39 typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
     40 typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
     41 
     42 #ifdef __SSE2__
     43 /* Both _Float16 and __bf16 require SSE2 being enabled. */
     44 typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
     45 typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
     46 typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
     47 
     48 typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
     49 typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
     50 #endif
     51 
     52 /* Define the default attributes for the functions in this file. */
     53 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
     54 #define __DEFAULT_FN_ATTRS                                                     \
     55   __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
     56                  __min_vector_width__(256)))
     57 #define __DEFAULT_FN_ATTRS128                                                  \
     58   __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
     59                  __min_vector_width__(128)))
     60 #else
     61 #define __DEFAULT_FN_ATTRS                                                     \
     62   __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
     63                  __min_vector_width__(256)))
     64 #define __DEFAULT_FN_ATTRS128                                                  \
     65   __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
     66                  __min_vector_width__(128)))
     67 #endif
     68 
     69 #if defined(__cplusplus) && (__cplusplus >= 201103L)
     70 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
     71 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
     72 #else
     73 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128
     74 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS
     75 #endif
     76 
     77 /* Arithmetic */
     78 /// Adds two 256-bit vectors of [4 x double].
     79 ///
     80 /// \headerfile <x86intrin.h>
     81 ///
     82 /// This intrinsic corresponds to the <c> VADDPD </c> instruction.
     83 ///
     84 /// \param __a
     85 ///    A 256-bit vector of [4 x double] containing one of the source operands.
     86 /// \param __b
     87 ///    A 256-bit vector of [4 x double] containing one of the source operands.
     88 /// \returns A 256-bit vector of [4 x double] containing the sums of both
     89 ///    operands.
     90 static __inline __m256d __DEFAULT_FN_ATTRS
     91 _mm256_add_pd(__m256d __a, __m256d __b)
     92 {
     93   return (__m256d)((__v4df)__a+(__v4df)__b);
     94 }
     95 
     96 /// Adds two 256-bit vectors of [8 x float].
     97 ///
     98 /// \headerfile <x86intrin.h>
     99 ///
    100 /// This intrinsic corresponds to the <c> VADDPS </c> instruction.
    101 ///
    102 /// \param __a
    103 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    104 /// \param __b
    105 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    106 /// \returns A 256-bit vector of [8 x float] containing the sums of both
    107 ///    operands.
    108 static __inline __m256 __DEFAULT_FN_ATTRS
    109 _mm256_add_ps(__m256 __a, __m256 __b)
    110 {
    111   return (__m256)((__v8sf)__a+(__v8sf)__b);
    112 }
    113 
    114 /// Subtracts two 256-bit vectors of [4 x double].
    115 ///
    116 /// \headerfile <x86intrin.h>
    117 ///
    118 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
    119 ///
    120 /// \param __a
    121 ///    A 256-bit vector of [4 x double] containing the minuend.
    122 /// \param __b
    123 ///    A 256-bit vector of [4 x double] containing the subtrahend.
    124 /// \returns A 256-bit vector of [4 x double] containing the differences between
    125 ///    both operands.
    126 static __inline __m256d __DEFAULT_FN_ATTRS
    127 _mm256_sub_pd(__m256d __a, __m256d __b)
    128 {
    129   return (__m256d)((__v4df)__a-(__v4df)__b);
    130 }
    131 
    132 /// Subtracts two 256-bit vectors of [8 x float].
    133 ///
    134 /// \headerfile <x86intrin.h>
    135 ///
    136 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
    137 ///
    138 /// \param __a
    139 ///    A 256-bit vector of [8 x float] containing the minuend.
    140 /// \param __b
    141 ///    A 256-bit vector of [8 x float] containing the subtrahend.
    142 /// \returns A 256-bit vector of [8 x float] containing the differences between
    143 ///    both operands.
    144 static __inline __m256 __DEFAULT_FN_ATTRS
    145 _mm256_sub_ps(__m256 __a, __m256 __b)
    146 {
    147   return (__m256)((__v8sf)__a-(__v8sf)__b);
    148 }
    149 
    150 /// Subtracts the even-indexed values and adds the odd-indexed values of
    151 ///    two 256-bit vectors of [4 x double].
    152 ///
    153 /// \headerfile <x86intrin.h>
    154 ///
    155 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
    156 ///
    157 /// \param __a
    158 ///    A 256-bit vector of [4 x double] containing the left source operand.
    159 /// \param __b
    160 ///    A 256-bit vector of [4 x double] containing the right source operand.
    161 /// \returns A 256-bit vector of [4 x double] containing the alternating sums
    162 ///    and differences between both operands.
    163 static __inline __m256d __DEFAULT_FN_ATTRS
    164 _mm256_addsub_pd(__m256d __a, __m256d __b)
    165 {
    166   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
    167 }
    168 
    169 /// Subtracts the even-indexed values and adds the odd-indexed values of
    170 ///    two 256-bit vectors of [8 x float].
    171 ///
    172 /// \headerfile <x86intrin.h>
    173 ///
    174 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
    175 ///
    176 /// \param __a
    177 ///    A 256-bit vector of [8 x float] containing the left source operand.
    178 /// \param __b
    179 ///    A 256-bit vector of [8 x float] containing the right source operand.
    180 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and
    181 ///    differences between both operands.
    182 static __inline __m256 __DEFAULT_FN_ATTRS
    183 _mm256_addsub_ps(__m256 __a, __m256 __b)
    184 {
    185   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
    186 }
    187 
    188 /// Divides two 256-bit vectors of [4 x double].
    189 ///
    190 /// \headerfile <x86intrin.h>
    191 ///
    192 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
    193 ///
    194 /// \param __a
    195 ///    A 256-bit vector of [4 x double] containing the dividend.
    196 /// \param __b
    197 ///    A 256-bit vector of [4 x double] containing the divisor.
    198 /// \returns A 256-bit vector of [4 x double] containing the quotients of both
    199 ///    operands.
    200 static __inline __m256d __DEFAULT_FN_ATTRS
    201 _mm256_div_pd(__m256d __a, __m256d __b)
    202 {
    203   return (__m256d)((__v4df)__a/(__v4df)__b);
    204 }
    205 
    206 /// Divides two 256-bit vectors of [8 x float].
    207 ///
    208 /// \headerfile <x86intrin.h>
    209 ///
    210 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
    211 ///
    212 /// \param __a
    213 ///    A 256-bit vector of [8 x float] containing the dividend.
    214 /// \param __b
    215 ///    A 256-bit vector of [8 x float] containing the divisor.
    216 /// \returns A 256-bit vector of [8 x float] containing the quotients of both
    217 ///    operands.
    218 static __inline __m256 __DEFAULT_FN_ATTRS
    219 _mm256_div_ps(__m256 __a, __m256 __b)
    220 {
    221   return (__m256)((__v8sf)__a/(__v8sf)__b);
    222 }
    223 
    224 /// Compares two 256-bit vectors of [4 x double] and returns the greater
    225 ///    of each pair of values.
    226 ///
    227 ///    If either value in a comparison is NaN, returns the value from \a __b.
    228 ///
    229 /// \headerfile <x86intrin.h>
    230 ///
    231 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
    232 ///
    233 /// \param __a
    234 ///    A 256-bit vector of [4 x double] containing one of the operands.
    235 /// \param __b
    236 ///    A 256-bit vector of [4 x double] containing one of the operands.
    237 /// \returns A 256-bit vector of [4 x double] containing the maximum values
    238 ///    between both operands.
    239 static __inline __m256d __DEFAULT_FN_ATTRS
    240 _mm256_max_pd(__m256d __a, __m256d __b)
    241 {
    242   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
    243 }
    244 
    245 /// Compares two 256-bit vectors of [8 x float] and returns the greater
    246 ///    of each pair of values.
    247 ///
    248 ///    If either value in a comparison is NaN, returns the value from \a __b.
    249 ///
    250 /// \headerfile <x86intrin.h>
    251 ///
    252 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
    253 ///
    254 /// \param __a
    255 ///    A 256-bit vector of [8 x float] containing one of the operands.
    256 /// \param __b
    257 ///    A 256-bit vector of [8 x float] containing one of the operands.
    258 /// \returns A 256-bit vector of [8 x float] containing the maximum values
    259 ///    between both operands.
    260 static __inline __m256 __DEFAULT_FN_ATTRS
    261 _mm256_max_ps(__m256 __a, __m256 __b)
    262 {
    263   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
    264 }
    265 
    266 /// Compares two 256-bit vectors of [4 x double] and returns the lesser
    267 ///    of each pair of values.
    268 ///
    269 ///    If either value in a comparison is NaN, returns the value from \a __b.
    270 ///
    271 /// \headerfile <x86intrin.h>
    272 ///
    273 /// This intrinsic corresponds to the <c> VMINPD </c> instruction.
    274 ///
    275 /// \param __a
    276 ///    A 256-bit vector of [4 x double] containing one of the operands.
    277 /// \param __b
    278 ///    A 256-bit vector of [4 x double] containing one of the operands.
    279 /// \returns A 256-bit vector of [4 x double] containing the minimum values
    280 ///    between both operands.
    281 static __inline __m256d __DEFAULT_FN_ATTRS
    282 _mm256_min_pd(__m256d __a, __m256d __b)
    283 {
    284   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
    285 }
    286 
    287 /// Compares two 256-bit vectors of [8 x float] and returns the lesser
    288 ///    of each pair of values.
    289 ///
    290 ///    If either value in a comparison is NaN, returns the value from \a __b.
    291 ///
    292 /// \headerfile <x86intrin.h>
    293 ///
    294 /// This intrinsic corresponds to the <c> VMINPS </c> instruction.
    295 ///
    296 /// \param __a
    297 ///    A 256-bit vector of [8 x float] containing one of the operands.
    298 /// \param __b
    299 ///    A 256-bit vector of [8 x float] containing one of the operands.
    300 /// \returns A 256-bit vector of [8 x float] containing the minimum values
    301 ///    between both operands.
    302 static __inline __m256 __DEFAULT_FN_ATTRS
    303 _mm256_min_ps(__m256 __a, __m256 __b)
    304 {
    305   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
    306 }
    307 
    308 /// Multiplies two 256-bit vectors of [4 x double].
    309 ///
    310 /// \headerfile <x86intrin.h>
    311 ///
    312 /// This intrinsic corresponds to the <c> VMULPD </c> instruction.
    313 ///
    314 /// \param __a
    315 ///    A 256-bit vector of [4 x double] containing one of the operands.
    316 /// \param __b
    317 ///    A 256-bit vector of [4 x double] containing one of the operands.
    318 /// \returns A 256-bit vector of [4 x double] containing the products of both
    319 ///    operands.
    320 static __inline __m256d __DEFAULT_FN_ATTRS
    321 _mm256_mul_pd(__m256d __a, __m256d __b)
    322 {
    323   return (__m256d)((__v4df)__a * (__v4df)__b);
    324 }
    325 
    326 /// Multiplies two 256-bit vectors of [8 x float].
    327 ///
    328 /// \headerfile <x86intrin.h>
    329 ///
    330 /// This intrinsic corresponds to the <c> VMULPS </c> instruction.
    331 ///
    332 /// \param __a
    333 ///    A 256-bit vector of [8 x float] containing one of the operands.
    334 /// \param __b
    335 ///    A 256-bit vector of [8 x float] containing one of the operands.
    336 /// \returns A 256-bit vector of [8 x float] containing the products of both
    337 ///    operands.
    338 static __inline __m256 __DEFAULT_FN_ATTRS
    339 _mm256_mul_ps(__m256 __a, __m256 __b)
    340 {
    341   return (__m256)((__v8sf)__a * (__v8sf)__b);
    342 }
    343 
    344 /// Calculates the square roots of the values in a 256-bit vector of
    345 ///    [4 x double].
    346 ///
    347 /// \headerfile <x86intrin.h>
    348 ///
    349 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
    350 ///
    351 /// \param __a
    352 ///    A 256-bit vector of [4 x double].
    353 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
    354 ///    values in the operand.
    355 static __inline __m256d __DEFAULT_FN_ATTRS
    356 _mm256_sqrt_pd(__m256d __a)
    357 {
    358   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
    359 }
    360 
    361 /// Calculates the square roots of the values in a 256-bit vector of
    362 ///    [8 x float].
    363 ///
    364 /// \headerfile <x86intrin.h>
    365 ///
    366 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
    367 ///
    368 /// \param __a
    369 ///    A 256-bit vector of [8 x float].
    370 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
    371 ///    values in the operand.
    372 static __inline __m256 __DEFAULT_FN_ATTRS
    373 _mm256_sqrt_ps(__m256 __a)
    374 {
    375   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
    376 }
    377 
    378 /// Calculates the reciprocal square roots of the values in a 256-bit
    379 ///    vector of [8 x float].
    380 ///
    381 /// \headerfile <x86intrin.h>
    382 ///
    383 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
    384 ///
    385 /// \param __a
    386 ///    A 256-bit vector of [8 x float].
    387 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square
    388 ///    roots of the values in the operand.
    389 static __inline __m256 __DEFAULT_FN_ATTRS
    390 _mm256_rsqrt_ps(__m256 __a)
    391 {
    392   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
    393 }
    394 
    395 /// Calculates the reciprocals of the values in a 256-bit vector of
    396 ///    [8 x float].
    397 ///
    398 /// \headerfile <x86intrin.h>
    399 ///
    400 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
    401 ///
    402 /// \param __a
    403 ///    A 256-bit vector of [8 x float].
    404 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
    405 ///    values in the operand.
    406 static __inline __m256 __DEFAULT_FN_ATTRS
    407 _mm256_rcp_ps(__m256 __a)
    408 {
    409   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
    410 }
    411 
    412 /// Rounds the values in a 256-bit vector of [4 x double] as specified
    413 ///    by the byte operand. The source values are rounded to integer values and
    414 ///    returned as 64-bit double-precision floating-point values.
    415 ///
    416 /// \headerfile <x86intrin.h>
    417 ///
    418 /// \code
    419 /// __m256d _mm256_round_pd(__m256d V, const int M);
    420 /// \endcode
    421 ///
    422 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    423 ///
    424 /// \param V
    425 ///    A 256-bit vector of [4 x double].
    426 /// \param M
    427 ///    An integer value that specifies the rounding operation. \n
    428 ///    Bits [7:4] are reserved. \n
    429 ///    Bit [3] is a precision exception value: \n
    430 ///      0: A normal PE exception is used. \n
    431 ///      1: The PE field is not updated. \n
    432 ///    Bit [2] is the rounding control source: \n
    433 ///      0: Use bits [1:0] of \a M. \n
    434 ///      1: Use the current MXCSR setting. \n
    435 ///    Bits [1:0] contain the rounding control definition: \n
    436 ///      00: Nearest. \n
    437 ///      01: Downward (toward negative infinity). \n
    438 ///      10: Upward (toward positive infinity). \n
    439 ///      11: Truncated.
    440 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
    441 #define _mm256_round_pd(V, M) \
    442   ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
    443 
    444 /// Rounds the values stored in a 256-bit vector of [8 x float] as
    445 ///    specified by the byte operand. The source values are rounded to integer
    446 ///    values and returned as floating-point values.
    447 ///
    448 /// \headerfile <x86intrin.h>
    449 ///
    450 /// \code
    451 /// __m256 _mm256_round_ps(__m256 V, const int M);
    452 /// \endcode
    453 ///
    454 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    455 ///
    456 /// \param V
    457 ///    A 256-bit vector of [8 x float].
    458 /// \param M
    459 ///    An integer value that specifies the rounding operation. \n
    460 ///    Bits [7:4] are reserved. \n
    461 ///    Bit [3] is a precision exception value: \n
    462 ///      0: A normal PE exception is used. \n
    463 ///      1: The PE field is not updated. \n
    464 ///    Bit [2] is the rounding control source: \n
    465 ///      0: Use bits [1:0] of \a M. \n
    466 ///      1: Use the current MXCSR setting. \n
    467 ///    Bits [1:0] contain the rounding control definition: \n
    468 ///      00: Nearest. \n
    469 ///      01: Downward (toward negative infinity). \n
    470 ///      10: Upward (toward positive infinity). \n
    471 ///      11: Truncated.
    472 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
    473 #define _mm256_round_ps(V, M) \
    474   ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
    475 
    476 /// Rounds up the values stored in a 256-bit vector of [4 x double]. The
    477 ///    source values are rounded up to integer values and returned as 64-bit
    478 ///    double-precision floating-point values.
    479 ///
    480 /// \headerfile <x86intrin.h>
    481 ///
    482 /// \code
    483 /// __m256d _mm256_ceil_pd(__m256d V);
    484 /// \endcode
    485 ///
    486 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    487 ///
    488 /// \param V
    489 ///    A 256-bit vector of [4 x double].
    490 /// \returns A 256-bit vector of [4 x double] containing the rounded up values.
    491 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
    492 
    493 /// Rounds down the values stored in a 256-bit vector of [4 x double].
    494 ///    The source values are rounded down to integer values and returned as
    495 ///    64-bit double-precision floating-point values.
    496 ///
    497 /// \headerfile <x86intrin.h>
    498 ///
    499 /// \code
    500 /// __m256d _mm256_floor_pd(__m256d V);
    501 /// \endcode
    502 ///
    503 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    504 ///
    505 /// \param V
    506 ///    A 256-bit vector of [4 x double].
    507 /// \returns A 256-bit vector of [4 x double] containing the rounded down
    508 ///    values.
    509 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
    510 
    511 /// Rounds up the values stored in a 256-bit vector of [8 x float]. The
    512 ///    source values are rounded up to integer values and returned as
    513 ///    floating-point values.
    514 ///
    515 /// \headerfile <x86intrin.h>
    516 ///
    517 /// \code
    518 /// __m256 _mm256_ceil_ps(__m256 V);
    519 /// \endcode
    520 ///
    521 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    522 ///
    523 /// \param V
    524 ///    A 256-bit vector of [8 x float].
    525 /// \returns A 256-bit vector of [8 x float] containing the rounded up values.
    526 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
    527 
    528 /// Rounds down the values stored in a 256-bit vector of [8 x float]. The
    529 ///    source values are rounded down to integer values and returned as
    530 ///    floating-point values.
    531 ///
    532 /// \headerfile <x86intrin.h>
    533 ///
    534 /// \code
    535 /// __m256 _mm256_floor_ps(__m256 V);
    536 /// \endcode
    537 ///
    538 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    539 ///
    540 /// \param V
    541 ///    A 256-bit vector of [8 x float].
    542 /// \returns A 256-bit vector of [8 x float] containing the rounded down values.
    543 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
    544 
    545 /* Logical */
    546 /// Performs a bitwise AND of two 256-bit vectors of [4 x double].
    547 ///
    548 /// \headerfile <x86intrin.h>
    549 ///
    550 /// This intrinsic corresponds to the <c> VANDPD </c> instruction.
    551 ///
    552 /// \param __a
    553 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    554 /// \param __b
    555 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    556 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
    557 ///    values between both operands.
    558 static __inline __m256d __DEFAULT_FN_ATTRS
    559 _mm256_and_pd(__m256d __a, __m256d __b)
    560 {
    561   return (__m256d)((__v4du)__a & (__v4du)__b);
    562 }
    563 
    564 /// Performs a bitwise AND of two 256-bit vectors of [8 x float].
    565 ///
    566 /// \headerfile <x86intrin.h>
    567 ///
    568 /// This intrinsic corresponds to the <c> VANDPS </c> instruction.
    569 ///
    570 /// \param __a
    571 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    572 /// \param __b
    573 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    574 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
    575 ///    values between both operands.
    576 static __inline __m256 __DEFAULT_FN_ATTRS
    577 _mm256_and_ps(__m256 __a, __m256 __b)
    578 {
    579   return (__m256)((__v8su)__a & (__v8su)__b);
    580 }
    581 
    582 /// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
    583 ///    the one's complement of the values contained in the first source operand.
    584 ///
    585 /// \headerfile <x86intrin.h>
    586 ///
    587 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
    588 ///
    589 /// \param __a
    590 ///    A 256-bit vector of [4 x double] containing the left source operand. The
    591 ///    one's complement of this value is used in the bitwise AND.
    592 /// \param __b
    593 ///    A 256-bit vector of [4 x double] containing the right source operand.
    594 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
    595 ///    values of the second operand and the one's complement of the first
    596 ///    operand.
    597 static __inline __m256d __DEFAULT_FN_ATTRS
    598 _mm256_andnot_pd(__m256d __a, __m256d __b)
    599 {
    600   return (__m256d)(~(__v4du)__a & (__v4du)__b);
    601 }
    602 
    603 /// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
    604 ///    the one's complement of the values contained in the first source operand.
    605 ///
    606 /// \headerfile <x86intrin.h>
    607 ///
    608 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
    609 ///
    610 /// \param __a
    611 ///    A 256-bit vector of [8 x float] containing the left source operand. The
    612 ///    one's complement of this value is used in the bitwise AND.
    613 /// \param __b
    614 ///    A 256-bit vector of [8 x float] containing the right source operand.
    615 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
    616 ///    values of the second operand and the one's complement of the first
    617 ///    operand.
    618 static __inline __m256 __DEFAULT_FN_ATTRS
    619 _mm256_andnot_ps(__m256 __a, __m256 __b)
    620 {
    621   return (__m256)(~(__v8su)__a & (__v8su)__b);
    622 }
    623 
    624 /// Performs a bitwise OR of two 256-bit vectors of [4 x double].
    625 ///
    626 /// \headerfile <x86intrin.h>
    627 ///
    628 /// This intrinsic corresponds to the <c> VORPD </c> instruction.
    629 ///
    630 /// \param __a
    631 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    632 /// \param __b
    633 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    634 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
    635 ///    values between both operands.
    636 static __inline __m256d __DEFAULT_FN_ATTRS
    637 _mm256_or_pd(__m256d __a, __m256d __b)
    638 {
    639   return (__m256d)((__v4du)__a | (__v4du)__b);
    640 }
    641 
    642 /// Performs a bitwise OR of two 256-bit vectors of [8 x float].
    643 ///
    644 /// \headerfile <x86intrin.h>
    645 ///
    646 /// This intrinsic corresponds to the <c> VORPS </c> instruction.
    647 ///
    648 /// \param __a
    649 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    650 /// \param __b
    651 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    652 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
    653 ///    values between both operands.
    654 static __inline __m256 __DEFAULT_FN_ATTRS
    655 _mm256_or_ps(__m256 __a, __m256 __b)
    656 {
    657   return (__m256)((__v8su)__a | (__v8su)__b);
    658 }
    659 
    660 /// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
    661 ///
    662 /// \headerfile <x86intrin.h>
    663 ///
    664 /// This intrinsic corresponds to the <c> VXORPD </c> instruction.
    665 ///
    666 /// \param __a
    667 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    668 /// \param __b
    669 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    670 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
    671 ///    values between both operands.
    672 static __inline __m256d __DEFAULT_FN_ATTRS
    673 _mm256_xor_pd(__m256d __a, __m256d __b)
    674 {
    675   return (__m256d)((__v4du)__a ^ (__v4du)__b);
    676 }
    677 
    678 /// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
    679 ///
    680 /// \headerfile <x86intrin.h>
    681 ///
    682 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    683 ///
    684 /// \param __a
    685 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    686 /// \param __b
    687 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    688 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
    689 ///    values between both operands.
    690 static __inline __m256 __DEFAULT_FN_ATTRS
    691 _mm256_xor_ps(__m256 __a, __m256 __b)
    692 {
    693   return (__m256)((__v8su)__a ^ (__v8su)__b);
    694 }
    695 
    696 /* Horizontal arithmetic */
    697 /// Horizontally adds the adjacent pairs of values contained in two
    698 ///    256-bit vectors of [4 x double].
    699 ///
    700 /// \headerfile <x86intrin.h>
    701 ///
    702 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
    703 ///
    704 /// \param __a
    705 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    706 ///    The horizontal sums of the values are returned in the even-indexed
    707 ///    elements of a vector of [4 x double].
    708 /// \param __b
    709 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    710 ///    The horizontal sums of the values are returned in the odd-indexed
    711 ///    elements of a vector of [4 x double].
    712 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
    713 ///    both operands.
    714 static __inline __m256d __DEFAULT_FN_ATTRS
    715 _mm256_hadd_pd(__m256d __a, __m256d __b)
    716 {
    717   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
    718 }
    719 
    720 /// Horizontally adds the adjacent pairs of values contained in two
    721 ///    256-bit vectors of [8 x float].
    722 ///
    723 /// \headerfile <x86intrin.h>
    724 ///
    725 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
    726 ///
    727 /// \param __a
    728 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    729 ///    The sums of its adjacent pairs of values are returned in the elements
    730 ///    with index 0, 1, 4, 5 of the result.
    731 /// \param __b
    732 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    733 ///    The sums of its adjacent pairs of values are returned in the elements
    734 ///    with index 2, 3, 6, 7 of the result.
    735 /// \returns A 256-bit vector of [8 x float] holding the pair sums from \a __a
    736 ///    in elements 0, 1, 4, 5 and from \a __b in elements 2, 3, 6, 7.
    737 static __inline __m256 __DEFAULT_FN_ATTRS
    738 _mm256_hadd_ps(__m256 __a, __m256 __b)
    739 {
    740   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
    741 }
    742 
    743 /// Horizontally subtracts the adjacent pairs of values contained in two
    744 ///    256-bit vectors of [4 x double].
    745 ///
    746 /// \headerfile <x86intrin.h>
    747 ///
    748 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
    749 ///
    750 /// \param __a
    751 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    752 ///    The differences of its adjacent pairs of values are returned in the
    753 ///    even-indexed elements of the result.
    754 /// \param __b
    755 ///    A 256-bit vector of [4 x double] containing one of the source operands.
    756 ///    The differences of its adjacent pairs of values are returned in the
    757 ///    odd-indexed elements of the result.
    758 /// \returns A 256-bit vector of [4 x double] holding the pair differences
    759 ///    from \a __a in even-indexed and from \a __b in odd-indexed elements.
    760 static __inline __m256d __DEFAULT_FN_ATTRS
    761 _mm256_hsub_pd(__m256d __a, __m256d __b)
    762 {
    763   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
    764 }
    765 
    766 /// Horizontally subtracts the adjacent pairs of values contained in two
    767 ///    256-bit vectors of [8 x float].
    768 ///
    769 /// \headerfile <x86intrin.h>
    770 ///
    771 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
    772 ///
    773 /// \param __a
    774 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    775 ///    The differences of its adjacent pairs of values are returned in the
    776 ///    elements with index 0, 1, 4, 5 of the result.
    777 /// \param __b
    778 ///    A 256-bit vector of [8 x float] containing one of the source operands.
    779 ///    The differences of its adjacent pairs of values are returned in the
    780 ///    elements with index 2, 3, 6, 7 of the result.
    781 /// \returns A 256-bit vector of [8 x float] holding the pair differences from
    782 ///    \a __a in elements 0, 1, 4, 5 and from \a __b in elements 2, 3, 6, 7.
    783 static __inline __m256 __DEFAULT_FN_ATTRS
    784 _mm256_hsub_ps(__m256 __a, __m256 __b)
    785 {
    786   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
    787 }
    788 
    789 /* Vector permutations */
    790 /// Copies the values in a 128-bit vector of [2 x double] as specified
    791 ///    by the 128-bit integer vector operand.
    792 ///
    793 /// \headerfile <x86intrin.h>
    794 ///
    795 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    796 ///
    797 /// \param __a
    798 ///    A 128-bit vector of [2 x double].
    799 /// \param __c
    800 ///    A 128-bit integer vector operand. Bit 1 of each 64-bit element picks
    801 ///    the source value copied to the matching element of the result. \n
    802 ///    Bit [1]: \n
    803 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    804 ///         vector. \n
    805 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    806 ///         returned vector. \n
    807 ///    Bit [65]: \n
    808 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    809 ///         returned vector. \n
    810 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    811 ///         returned vector.
    812 /// \returns A 128-bit vector of [2 x double] containing the copied values.
    813 static __inline __m128d __DEFAULT_FN_ATTRS128
    814 _mm_permutevar_pd(__m128d __a, __m128i __c)
    815 {
    816   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
    817 }
    818 
    819 /// Copies the values in a 256-bit vector of [4 x double] as specified
    820 ///    by the 256-bit integer vector operand.
    821 ///
    822 /// \headerfile <x86intrin.h>
    823 ///
    824 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    825 ///
    826 /// \param __a
    827 ///    A 256-bit vector of [4 x double].
    828 /// \param __c
    829 ///    A 256-bit integer vector operand. Bit 1 of each 64-bit element picks,
    830 ///    within the same 128-bit half, the value for that position. \n
    831 ///    Bit [1]: \n
    832 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    833 ///         vector. \n
    834 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    835 ///         returned vector. \n
    836 ///    Bit [65]: \n
    837 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    838 ///         returned vector. \n
    839 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    840 ///         returned vector. \n
    841 ///    Bit [129]: \n
    842 ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
    843 ///         returned vector. \n
    844 ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
    845 ///         returned vector. \n
    846 ///    Bit [193]: \n
    847 ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
    848 ///         returned vector. \n
    849 ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
    850 ///         returned vector.
    851 /// \returns A 256-bit vector of [4 x double] containing the copied values.
    852 static __inline __m256d __DEFAULT_FN_ATTRS
    853 _mm256_permutevar_pd(__m256d __a, __m256i __c)
    854 {
    855   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
    856 }
    857 
    858 /// Copies the values stored in a 128-bit vector of [4 x float] as
    859 ///    specified by the 128-bit integer vector operand.
    860 ///
    861 /// \headerfile <x86intrin.h>
    862 ///
    863 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    864 ///
    865 /// \param __a
    866 ///    A 128-bit vector of [4 x float].
    867 /// \param __c
    868 ///    A 128-bit integer vector operand. Bits [1:0] of each 32-bit element
    869 ///    pick the source value copied to the matching element of the result. \n
    870 ///    Bits [1:0]: \n
    871 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    872 ///          returned vector. \n
    873 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    874 ///          returned vector. \n
    875 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    876 ///          returned vector. \n
    877 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    878 ///          returned vector. \n
    879 ///    Bits [33:32]: \n
    880 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    881 ///          returned vector. \n
    882 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    883 ///          returned vector. \n
    884 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    885 ///          returned vector. \n
    886 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    887 ///          returned vector. \n
    888 ///    Bits [65:64]: \n
    889 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    890 ///          returned vector. \n
    891 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    892 ///          returned vector. \n
    893 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    894 ///          returned vector. \n
    895 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    896 ///          returned vector. \n
    897 ///    Bits [97:96]: \n
    898 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    899 ///          returned vector. \n
    900 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    901 ///          returned vector. \n
    902 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    903 ///          returned vector. \n
    904 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    905 ///          returned vector.
    906 /// \returns A 128-bit vector of [4 x float] containing the copied values.
    907 static __inline __m128 __DEFAULT_FN_ATTRS128
    908 _mm_permutevar_ps(__m128 __a, __m128i __c)
    909 {
    910   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
    911 }
    912 
    913 /// Copies the values stored in a 256-bit vector of [8 x float] as
    914 ///    specified by the 256-bit integer vector operand.
    915 ///
    916 /// \headerfile <x86intrin.h>
    917 ///
    918 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    919 ///
    920 /// \param __a
    921 ///    A 256-bit vector of [8 x float].
    922 /// \param __c
    923 ///    A 256-bit integer vector operand. Bits [1:0] of each 32-bit element
    924 ///    pick, within the same 128-bit half, the value for that position. \n
    925 ///    Bits [1:0]: \n
    926 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    927 ///          returned vector. \n
    928 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    929 ///          returned vector. \n
    930 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    931 ///          returned vector. \n
    932 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    933 ///          returned vector. \n
    934 ///    Bits [33:32]: \n
    935 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    936 ///          returned vector. \n
    937 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    938 ///          returned vector. \n
    939 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    940 ///          returned vector. \n
    941 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    942 ///          returned vector. \n
    943 ///    Bits [65:64]: \n
    944 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    945 ///          returned vector. \n
    946 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    947 ///          returned vector. \n
    948 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    949 ///          returned vector. \n
    950 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    951 ///          returned vector. \n
    952 ///    Bits [97:96]: \n
    953 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    954 ///          returned vector. \n
    955 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    956 ///          returned vector. \n
    957 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    958 ///          returned vector. \n
    959 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    960 ///          returned vector. \n
    961 ///    Bits [129:128]: \n
    962 ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
    963 ///          returned vector. \n
    964 ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
    965 ///          returned vector. \n
    966 ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
    967 ///          returned vector. \n
    968 ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
    969 ///          returned vector. \n
    970 ///    Bits [161:160]: \n
    971 ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
    972 ///          returned vector. \n
    973 ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
    974 ///          returned vector. \n
    975 ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
    976 ///          returned vector. \n
    977 ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
    978 ///          returned vector. \n
    979 ///    Bits [193:192]: \n
    980 ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
    981 ///          returned vector. \n
    982 ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
    983 ///          returned vector. \n
    984 ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
    985 ///          returned vector. \n
    986 ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
    987 ///          returned vector. \n
    988 ///    Bits [225:224]: \n
    989 ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
    990 ///          returned vector. \n
    991 ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
    992 ///          returned vector. \n
    993 ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
    994 ///          returned vector. \n
    995 ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
    996 ///          returned vector.
    997 /// \returns A 256-bit vector of [8 x float] containing the copied values.
    998 static __inline __m256 __DEFAULT_FN_ATTRS
    999 _mm256_permutevar_ps(__m256 __a, __m256i __c)
   1000 {
   1001   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
   1002 }
   1003 
   1004 /// Copies the values in a 128-bit vector of [2 x double] as specified by
   1005 ///    the immediate integer operand. Implemented as a macro, not a function.
   1006 ///
   1007 /// \headerfile <x86intrin.h>
   1008 ///
   1009 /// \code
   1010 /// __m128d _mm_permute_pd(__m128d A, const int C);
   1011 /// \endcode
   1012 ///
   1013 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
   1014 ///
   1015 /// \param A
   1016 ///    A 128-bit vector of [2 x double].
   1017 /// \param C
   1018 ///    An immediate integer operand specifying how the values are to be
   1019 ///    copied. \n
   1020 ///    Bit [0]: \n
   1021 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
   1022 ///         vector. \n
   1023 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
   1024 ///         returned vector. \n
   1025 ///    Bit [1]: \n
   1026 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
   1027 ///         returned vector. \n
   1028 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
   1029 ///         returned vector.
   1030 /// \returns A 128-bit vector of [2 x double] containing the copied values.
   1031 #define _mm_permute_pd(A, C) \
   1032   ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
   1033 
   1034 /// Copies the values in a 256-bit vector of [4 x double] as specified by
   1035 ///    the immediate integer operand. Implemented as a macro, not a function.
   1036 ///
   1037 /// \headerfile <x86intrin.h>
   1038 ///
   1039 /// \code
   1040 /// __m256d _mm256_permute_pd(__m256d A, const int C);
   1041 /// \endcode
   1042 ///
   1043 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
   1044 ///
   1045 /// \param A
   1046 ///    A 256-bit vector of [4 x double].
   1047 /// \param C
   1048 ///    An immediate integer operand specifying how the values are to be
   1049 ///    copied. \n
   1050 ///    Bit [0]: \n
   1051 ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
   1052 ///         vector. \n
   1053 ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
   1054 ///         returned vector. \n
   1055 ///    Bit [1]: \n
   1056 ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
   1057 ///         returned vector. \n
   1058 ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
   1059 ///         returned vector. \n
   1060 ///    Bit [2]: \n
   1061 ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
   1062 ///         returned vector. \n
   1063 ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
   1064 ///         returned vector. \n
   1065 ///    Bit [3]: \n
   1066 ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
   1067 ///         returned vector. \n
   1068 ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
   1069 ///         returned vector.
   1070 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1071 #define _mm256_permute_pd(A, C) \
   1072   ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
   1073 
   1074 /// Copies the values in a 128-bit vector of [4 x float] as specified by
   1075 ///    the immediate integer operand. Implemented as a macro, not a function.
   1076 ///
   1077 /// \headerfile <x86intrin.h>
   1078 ///
   1079 /// \code
   1080 /// __m128 _mm_permute_ps(__m128 A, const int C);
   1081 /// \endcode
   1082 ///
   1083 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
   1084 ///
   1085 /// \param A
   1086 ///    A 128-bit vector of [4 x float].
   1087 /// \param C
   1088 ///    An immediate integer operand specifying how the values are to be
   1089 ///    copied. \n
   1090 ///    Bits [1:0]: \n
   1091 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
   1092 ///          returned vector. \n
   1093 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
   1094 ///          returned vector. \n
   1095 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
   1096 ///          returned vector. \n
   1097 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
   1098 ///          returned vector. \n
   1099 ///    Bits [3:2]: \n
   1100 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
   1101 ///          returned vector. \n
   1102 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
   1103 ///          returned vector. \n
   1104 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
   1105 ///          returned vector. \n
   1106 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
   1107 ///          returned vector. \n
   1108 ///    Bits [5:4]: \n
   1109 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
   1110 ///          returned vector. \n
   1111 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
   1112 ///          returned vector. \n
   1113 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
   1114 ///          returned vector. \n
   1115 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
   1116 ///          returned vector. \n
   1117 ///    Bits [7:6]: \n
   1118 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
   1119 ///          returned vector. \n
   1120 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
   1121 ///          returned vector. \n
   1122 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
   1123 ///          returned vector. \n
   1124 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
   1125 ///          returned vector.
   1126 /// \returns A 128-bit vector of [4 x float] containing the copied values.
   1127 #define _mm_permute_ps(A, C) \
   1128   ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
   1129 
   1130 /// Copies the values in a 256-bit vector of [8 x float] as specified by
   1131 ///    the immediate integer operand.
   1132 ///
   1133 /// \headerfile <x86intrin.h>
   1134 ///
   1135 /// \code
   1136 /// __m256 _mm256_permute_ps(__m256 A, const int C);
   1137 /// \endcode
   1138 ///
   1139 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
   1140 ///
   1141 /// \param A
   1142 ///    A 256-bit vector of [8 x float].
   1143 /// \param C
   1144 ///    An immediate integer operand specifying how the values are to be
   1145 ///    copied; the same bit fields control both 128-bit halves. \n
   1146 ///    Bits [1:0] (lower 128-bit half): \n
   1147 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
   1148 ///          returned vector. \n
   1149 ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
   1150 ///          returned vector. \n
   1151 ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
   1152 ///          returned vector. \n
   1153 ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
   1154 ///          returned vector. \n
   1155 ///    Bits [3:2] (lower 128-bit half): \n
   1156 ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
   1157 ///          returned vector. \n
   1158 ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
   1159 ///          returned vector. \n
   1160 ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
   1161 ///          returned vector. \n
   1162 ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
   1163 ///          returned vector. \n
   1164 ///    Bits [5:4] (lower 128-bit half): \n
   1165 ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
   1166 ///          returned vector. \n
   1167 ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
   1168 ///          returned vector. \n
   1169 ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
   1170 ///          returned vector. \n
   1171 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
   1172 ///          returned vector. \n
   1173 ///    Bits [7:6] (lower 128-bit half): \n
   1174 ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
   1175 ///          returned vector. \n
   1176 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
   1177 ///          returned vector. \n
   1178 ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
   1179 ///          returned vector. \n
   1180 ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
   1181 ///          returned vector. \n
   1182 ///    Bits [1:0] (upper 128-bit half): \n
   1183 ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
   1184 ///          returned vector. \n
   1185 ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
   1186 ///          returned vector. \n
   1187 ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
   1188 ///          returned vector. \n
   1189 ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
   1190 ///          returned vector. \n
   1191 ///    Bits [3:2] (upper 128-bit half): \n
   1192 ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
   1193 ///          returned vector. \n
   1194 ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
   1195 ///          returned vector. \n
   1196 ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
   1197 ///          returned vector. \n
   1198 ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
   1199 ///          returned vector. \n
   1200 ///    Bits [5:4] (upper 128-bit half): \n
   1201 ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
   1202 ///          returned vector. \n
   1203 ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
   1204 ///          returned vector. \n
   1205 ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
   1206 ///          returned vector. \n
   1207 ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
   1208 ///          returned vector. \n
   1209 ///    Bits [7:6] (upper 128-bit half): \n
   1210 ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
   1211 ///          returned vector. \n
   1212 ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
   1213 ///          returned vector. \n
   1214 ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
   1215 ///          returned vector. \n
   1216 ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
   1217 ///          returned vector.
   1218 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1219 #define _mm256_permute_ps(A, C) \
   1220   ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
   1221 
   1222 /// Permutes 128-bit data values stored in two 256-bit vectors of
   1223 ///    [4 x double], as specified by the immediate integer operand.
   1224 ///
   1225 /// \headerfile <x86intrin.h>
   1226 ///
   1227 /// \code
   1228 /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
   1229 /// \endcode
   1230 ///
   1231 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1232 ///
   1233 /// \param V1
   1234 ///    A 256-bit vector of [4 x double].
   1235 /// \param V2
   1236 ///    A 256-bit vector of [4 x double].
   1237 /// \param M
   1238 ///    An immediate integer operand specifying how the values are to be
   1239 ///    permuted. \n
   1240 ///    Bits [1:0]: \n
   1241 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1242 ///          destination. \n
   1243 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1244 ///          destination. \n
   1245 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1246 ///          destination. \n
   1247 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1248 ///          destination. \n
   1249 ///    Bits [5:4]: \n
   1250 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1251 ///          destination. \n
   1252 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1253 ///          destination. \n
   1254 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1255 ///          destination. \n
   1256 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1257 ///          destination.
   1258 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1259 #define _mm256_permute2f128_pd(V1, V2, M) \
   1260   ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
   1261                                             (__v4df)(__m256d)(V2), (int)(M)))
   1262 
   1263 /// Permutes 128-bit data values stored in two 256-bit vectors of
   1264 ///    [8 x float], as specified by the immediate integer operand.
   1265 ///
   1266 /// \headerfile <x86intrin.h>
   1267 ///
   1268 /// \code
   1269 /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
   1270 /// \endcode
   1271 ///
   1272 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1273 ///
   1274 /// \param V1
   1275 ///    A 256-bit vector of [8 x float].
   1276 /// \param V2
   1277 ///    A 256-bit vector of [8 x float].
   1278 /// \param M
   1279 ///    An immediate integer operand specifying how the values are to be
   1280 ///    permuted. \n
   1281 ///    Bits [1:0]: \n
   1282 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1283 ///          destination. \n
   1284 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1285 ///          destination. \n
   1286 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1287 ///          destination. \n
   1288 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1289 ///          destination. \n
   1290 ///    Bits [5:4]: \n
   1291 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1292 ///          destination. \n
   1293 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1294 ///          destination. \n
   1295 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1296 ///          destination. \n
   1297 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1298 ///          destination.
   1299 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1300 #define _mm256_permute2f128_ps(V1, V2, M) \
   1301   ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
   1302                                            (__v8sf)(__m256)(V2), (int)(M)))
   1303 
   1304 /// Permutes 128-bit data values stored in two 256-bit integer vectors,
   1305 ///    as specified by the immediate integer operand.
   1306 ///
   1307 /// \headerfile <x86intrin.h>
   1308 ///
   1309 /// \code
   1310 /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
   1311 /// \endcode
   1312 ///
   1313 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
   1314 ///
   1315 /// \param V1
   1316 ///    A 256-bit integer vector.
   1317 /// \param V2
   1318 ///    A 256-bit integer vector.
   1319 /// \param M
   1320 ///    An immediate integer operand specifying how the values are copied. \n
   1321 ///    Bits [1:0]: \n
   1322 ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
   1323 ///          destination. \n
   1324 ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
   1325 ///          destination. \n
   1326 ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
   1327 ///          destination. \n
   1328 ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
   1329 ///          destination. \n
   1330 ///    Bits [5:4]: \n
   1331 ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
   1332 ///          destination. \n
   1333 ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
   1334 ///          destination. \n
   1335 ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
   1336 ///          destination. \n
   1337 ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
   1338 ///          destination.
   1339 /// \returns A 256-bit integer vector containing the copied values.
   1340 #define _mm256_permute2f128_si256(V1, V2, M) \
   1341   ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
   1342                                             (__v8si)(__m256i)(V2), (int)(M)))
   1343 
   1344 /* Vector Blend */
   1345 /// Merges 64-bit double-precision data values stored in either of the
   1346 ///    two 256-bit vectors of [4 x double], as specified by the immediate
   1347 ///    integer operand. Implemented as a macro, not a function.
   1348 ///
   1349 /// \headerfile <x86intrin.h>
   1350 ///
   1351 /// \code
   1352 /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
   1353 /// \endcode
   1354 ///
   1355 /// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
   1356 ///
   1357 /// \param V1
   1358 ///    A 256-bit vector of [4 x double].
   1359 /// \param V2
   1360 ///    A 256-bit vector of [4 x double].
   1361 /// \param M
   1362 ///    An immediate integer operand, with mask bits [3:0] specifying how the
   1363 ///    values are to be copied. The position of the mask bit corresponds to the
   1364 ///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
   1365 ///    element in operand \a V1 is copied to the same position in the
   1366 ///    destination. When a mask bit is 1, the corresponding 64-bit element in
   1367 ///    operand \a V2 is copied to the same position in the destination.
   1368 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1369 #define _mm256_blend_pd(V1, V2, M) \
   1370   ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
   1371                                       (__v4df)(__m256d)(V2), (int)(M)))
   1372 
   1373 /// Builds a 256-bit vector of [8 x float] by taking each 32-bit
   1374 ///    single-precision element from one of two 256-bit vectors of
   1375 ///    [8 x float], as specified by the immediate integer operand.
   1376 ///
   1377 /// \headerfile <x86intrin.h>
   1378 ///
   1379 /// \code
   1380 /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
   1381 /// \endcode
   1382 ///
   1383 /// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
   1384 ///
   1385 /// \param V1
   1386 ///    A 256-bit vector of [8 x float]; supplies elements whose mask bit is 0.
   1387 /// \param V2
   1388 ///    A 256-bit vector of [8 x float]; supplies elements whose mask bit is 1.
   1389 /// \param M
   1390 ///    An immediate integer operand (cast to int by this macro), with mask
   1391 ///    bits [7:0] specifying how the values are to be copied. Mask bit N
   1392 ///    controls the 32-bit element at index N of the destination: \n
   1393 ///    0: The element is copied from the same position in operand \a V1. \n
   1394 ///    1: The element is copied from the same position in operand \a V2. \n
   1395 ///    Example: M = 0x0F takes elements 0-3 from \a V2 and 4-7 from \a V1.
   1396 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1397 #define _mm256_blend_ps(V1, V2, M) \
   1398   ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
   1399                                      (__v8sf)(__m256)(V2), (int)(M)))
   1400 
   1401 /// Builds a 256-bit vector of [4 x double] by taking each 64-bit
   1402 ///    double-precision element from one of two 256-bit vectors of
   1403 ///    [4 x double], as specified by a third 256-bit vector operand.
   1404 ///
   1405 /// \headerfile <x86intrin.h>
   1406 ///
   1407 /// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
   1408 ///
   1409 /// \param __a
   1410 ///    A 256-bit vector of [4 x double]; supplies elements whose mask bit is 0.
   1411 /// \param __b
   1412 ///    A 256-bit vector of [4 x double]; supplies elements whose mask bit is 1.
   1413 /// \param __c
   1414 ///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
   1415 ///    how the values are to be copied. Each mask bit is the most significant
   1416 ///    bit of the 64-bit element that it controls: \n
   1417 ///    0: The element is copied from the same position in operand \a __a. \n
   1418 ///    1: The element is copied from the same position in operand \a __b. \n
   1419 ///    For example, mask bit 63 selects the source of bits [63:0] of the
   1420 ///    destination.
   1421 /// \returns A 256-bit vector of [4 x double] containing the copied values.
   1422 static __inline __m256d __DEFAULT_FN_ATTRS
   1423 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
   1424 {
   1425   __v4df __mask = (__v4df)__c; /* selector vector, reinterpreted for the builtin */
   1426   return (__m256d)__builtin_ia32_blendvpd256((__v4df)__a, (__v4df)__b, __mask);
   1427 }
   1428 
   1429 /// Builds a 256-bit vector of [8 x float] by taking each 32-bit
   1430 ///    single-precision element from one of two 256-bit vectors of
   1431 ///    [8 x float], as specified by a third 256-bit vector operand.
   1432 ///
   1433 /// \headerfile <x86intrin.h>
   1434 ///
   1435 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
   1436 ///
   1437 /// \param __a
   1438 ///    A 256-bit vector of [8 x float]; supplies elements whose mask bit is 0.
   1439 /// \param __b
   1440 ///    A 256-bit vector of [8 x float]; supplies elements whose mask bit is 1.
   1441 /// \param __c
   1442 ///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
   1443 ///    and 31 specifying how the values are to be copied. Each mask bit is the
   1444 ///    most significant bit of the 32-bit element that it controls: \n
   1445 ///    0: The element is copied from the same position in operand \a __a. \n
   1446 ///    1: The element is copied from the same position in operand \a __b. \n
   1447 ///    For example, mask bit 31 selects the source of bits [31:0] of the
   1448 ///    destination.
   1449 /// \returns A 256-bit vector of [8 x float] containing the copied values.
   1450 static __inline __m256 __DEFAULT_FN_ATTRS
   1451 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
   1452 {
   1453   __v8sf __mask = (__v8sf)__c; /* selector vector, reinterpreted for the builtin */
   1454   return (__m256)__builtin_ia32_blendvps256((__v8sf)__a, (__v8sf)__b, __mask);
   1455 }
   1456 
   1457 /* Vector Dot Product */
   1458 /// Computes two independent dot products, one over the lower and one over
   1459 ///    the upper [4 x float] half of two [8 x float] vectors, and returns
   1460 ///    each dot product in the matching lower or upper half of the
   1461 ///    [8 x float] result.
   1462 ///
   1463 ///    The immediate integer operand controls which input elements will
   1464 ///    contribute to the dot product, and where the final results are returned.
   1465 ///    In general, for each dot product, the four corresponding elements of the
   1466 ///    input vectors are multiplied; the first two and second two products are
   1467 ///    summed, then the two sums are added to form the final result.
   1468 ///
   1469 /// \headerfile <x86intrin.h>
   1470 ///
   1471 /// \code
   1472 /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
   1473 /// \endcode
   1474 ///
   1475 /// This intrinsic corresponds to the <c> VDPPS </c> instruction.
   1476 ///
   1477 /// \param V1
   1478 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
   1479 /// \param V2
   1480 ///    A vector of [8 x float] values, treated as two [4 x float] vectors.
   1481 /// \param M
   1482 ///    An immediate integer argument, passed to the builtin without a cast.
   1483 ///    The same bitmask applies to both parallel dot product computations: \n
   1484 ///    Bits [7:4] determine which elements of the input vectors are used, with
   1485 ///    bit [4] corresponding to the lowest element and bit [7] to the highest
   1486 ///    element of each [4 x float] subvector. If a bit is set, the
   1487 ///    corresponding elements from the two input vectors are used as an input
   1488 ///    for the dot product; otherwise that input is treated as zero. \n
   1489 ///    Bits [3:0] determine which elements of the result will receive a copy of
   1490 ///    the final dot product, with bit [0] corresponding to the lowest element
   1491 ///    and bit [3] to the highest element of each [4 x float] subvector. If a
   1492 ///    bit is set, the dot product is returned in the corresponding element;
   1493 ///    otherwise that element is set to zero.
   1494 /// \returns A 256-bit vector of [8 x float] containing the two dot products.
   1495 #define _mm256_dp_ps(V1, V2, M) \
   1496   ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
   1497                                   (__v8sf)(__m256)(V2), (M)))
   1498 
   1499 /* Vector shuffle */
   1500 /// Selects eight float values from two 256-bit operands of [8 x float], as
   1501 ///    specified by the immediate value operand.
   1502 ///
   1503 ///    The four selected elements in each operand are copied to the destination
   1504 ///    according to the bits specified in the immediate operand. The selected
   1505 ///    elements from the first 256-bit operand are copied to bits [63:0] and
   1506 ///    bits [191:128] of the destination, and the selected elements from the
   1507 ///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
   1508 ///    the destination. For example, if bits [7:0] of the immediate operand
   1509 ///    contain a value of 0xFF, the 256-bit destination vector would contain the
   1510 ///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
   1511 ///
   1512 /// \headerfile <x86intrin.h>
   1513 ///
   1514 /// \code
   1515 /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
   1516 /// \endcode
   1517 ///
   1518 /// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
   1519 ///
   1520 /// \param a
   1521 ///    A 256-bit vector of [8 x float]. The four selected elements in this
   1522 ///    operand are copied to bits [63:0] and bits [191:128] in the destination,
   1523 ///    according to the bits specified in the immediate operand.
   1524 /// \param b
   1525 ///    A 256-bit vector of [8 x float]. The four selected elements in this
   1526 ///    operand are copied to bits [127:64] and bits [255:192] in the
   1527 ///    destination, according to the bits specified in the immediate operand.
   1528 /// \param mask
   1529 ///    An immediate value (cast to int by this macro) containing an 8-bit value
   1530 ///    specifying which elements to copy from \a a and \a b. \n
   1531 ///    Bits [3:0] specify the values copied from operand \a a. \n
   1532 ///    Bits [7:4] specify the values copied from operand \a b. \n
   1533 ///    The destinations within the 256-bit destination are assigned values as
   1534 ///    follows, according to the bit value assignments described below: \n
   1535 ///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
   1536 ///    destination. \n
   1537 ///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
   1538 ///    destination. \n
   1539 ///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
   1540 ///    destination. \n
   1541 ///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
   1542 ///    the destination. \n
   1543 ///    Bit value assignments: \n
   1544 ///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
   1545 ///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
   1546 ///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
   1547 ///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
   1548 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
   1549 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
   1550 ///    <c>[b6, b4, b2, b0]</c>.
   1551 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
   1552 #define _mm256_shuffle_ps(a, b, mask) \
   1553   ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
   1554                                     (__v8sf)(__m256)(b), (int)(mask)))
   1555 
   1556 /// Selects four double-precision values from two 256-bit operands of
   1557 ///    [4 x double], as specified by the immediate value operand.
   1558 ///
   1559 ///    The selected elements from the first 256-bit operand are copied to bits
   1560 ///    [63:0] and bits [191:128] in the destination, and the selected elements
   1561 ///    from the second 256-bit operand are copied to bits [127:64] and bits
   1562 ///    [255:192] in the destination. For example, if bits [3:0] of the immediate
   1563 ///    operand contain a value of 0xF, the 256-bit destination vector would
   1564 ///    contain the following values: b[3], a[3], b[1], a[1].
   1565 ///
   1566 /// \headerfile <x86intrin.h>
   1567 ///
   1568 /// \code
   1569 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
   1570 /// \endcode
   1571 ///
   1572 /// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
   1573 ///
   1574 /// \param a
   1575 ///    A 256-bit vector of [4 x double]; source of destination elements 0 and 2.
   1576 /// \param b
   1577 ///    A 256-bit vector of [4 x double]; source of destination elements 1 and 3.
   1578 /// \param mask
   1579 ///    An immediate value (cast to int by this macro) whose bits [3:0] specify
   1580 ///    which elements to copy from \a a and \a b: \n
   1581 ///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
   1582 ///    destination. \n
   1583 ///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
   1584 ///    destination. \n
   1585 ///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
   1586 ///    destination. \n
   1587 ///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
   1588 ///    destination. \n
   1589 ///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
   1590 ///    destination. \n
   1591 ///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
   1592 ///    destination. \n
   1593 ///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
   1594 ///    destination. \n
   1595 ///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
   1596 ///    destination.
   1597 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
   1598 #define _mm256_shuffle_pd(a, b, mask) \
   1599   ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
   1600                                      (__v4df)(__m256d)(b), (int)(mask)))
   1601 
   1602 /* Compare: named predicate encodings 0x08-0x1f for the immediate operand of the cmp intrinsics */
   1603 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
   1604 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
   1605 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
   1606 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
   1607 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
   1608 #define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
   1609 #define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
   1610 #define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
   1611 #define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
   1612 #define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
   1613 #define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
   1614 #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
   1615 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
   1616 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
   1617 #define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
   1618 #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
   1619 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
   1620 #define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
   1621 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
   1622 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
   1623 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
   1624 #define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
   1625 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
   1626 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
   1627 
   1628 /* The intrinsic below is defined in emmintrin.h and can also be used with AVX. */
   1629 /// Compares each of the corresponding double-precision values of two
   1630 ///    128-bit vectors of [2 x double], using the operation specified by the
   1631 ///    immediate integer operand.
   1632 ///
   1633 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
   1634 ///    If either value in a comparison is NaN, comparisons that are ordered
   1635 ///    return false, and comparisons that are unordered return true.
   1636 ///
   1637 /// \headerfile <x86intrin.h>
   1638 ///
   1639 /// \code
   1640 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
   1641 /// \endcode
   1642 ///
   1643 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
   1644 ///
   1645 /// \param a
   1646 ///    A 128-bit vector of [2 x double].
   1647 /// \param b
   1648 ///    A 128-bit vector of [2 x double].
   1649 /// \param c
   1650 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1651 ///    operation to use: \n
   1652 ///    0x00: Equal (ordered, non-signaling) \n
   1653 ///    0x01: Less-than (ordered, signaling) \n
   1654 ///    0x02: Less-than-or-equal (ordered, signaling) \n
   1655 ///    0x03: Unordered (non-signaling) \n
   1656 ///    0x04: Not-equal (unordered, non-signaling) \n
   1657 ///    0x05: Not-less-than (unordered, signaling) \n
   1658 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
   1659 ///    0x07: Ordered (non-signaling) \n
   1660 ///    0x08: Equal (unordered, non-signaling) \n
   1661 ///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
   1662 ///    0x0A: Not-greater-than (unordered, signaling) \n
   1663 ///    0x0B: False (ordered, non-signaling) \n
   1664 ///    0x0C: Not-equal (ordered, non-signaling) \n
   1665 ///    0x0D: Greater-than-or-equal (ordered, signaling) \n
   1666 ///    0x0E: Greater-than (ordered, signaling) \n
   1667 ///    0x0F: True (unordered, non-signaling) \n
   1668 ///    0x10: Equal (ordered, signaling) \n
   1669 ///    0x11: Less-than (ordered, non-signaling) \n
   1670 ///    0x12: Less-than-or-equal (ordered, non-signaling) \n
   1671 ///    0x13: Unordered (signaling) \n
   1672 ///    0x14: Not-equal (unordered, signaling) \n
   1673 ///    0x15: Not-less-than (unordered, non-signaling) \n
   1674 ///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
   1675 ///    0x17: Ordered (signaling) \n
   1676 ///    0x18: Equal (unordered, signaling) \n
   1677 ///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
   1678 ///    0x1A: Not-greater-than (unordered, non-signaling) \n
   1679 ///    0x1B: False (ordered, signaling) \n
   1680 ///    0x1C: Not-equal (ordered, signaling) \n
   1681 ///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
   1682 ///    0x1E: Greater-than (ordered, non-signaling) \n
   1683 ///    0x1F: True (unordered, signaling)
   1684 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
   1685 /// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
   1686 
   1687 /* The intrinsic below is defined in xmmintrin.h and can also be used with AVX. */
   1688 /// Compares each of the corresponding values of two 128-bit vectors of
   1689 ///    [4 x float], using the operation specified by the immediate integer
   1690 ///    operand.
   1691 ///
   1692 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
   1693 ///    If either value in a comparison is NaN, comparisons that are ordered
   1694 ///    return false, and comparisons that are unordered return true.
   1695 ///
   1696 /// \headerfile <x86intrin.h>
   1697 ///
   1698 /// \code
   1699 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
   1700 /// \endcode
   1701 ///
   1702 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
   1703 ///
   1704 /// \param a
   1705 ///    A 128-bit vector of [4 x float].
   1706 /// \param b
   1707 ///    A 128-bit vector of [4 x float].
   1708 /// \param c
   1709 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1710 ///    operation to use: \n
   1711 ///    0x00: Equal (ordered, non-signaling) \n
   1712 ///    0x01: Less-than (ordered, signaling) \n
   1713 ///    0x02: Less-than-or-equal (ordered, signaling) \n
   1714 ///    0x03: Unordered (non-signaling) \n
   1715 ///    0x04: Not-equal (unordered, non-signaling) \n
   1716 ///    0x05: Not-less-than (unordered, signaling) \n
   1717 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
   1718 ///    0x07: Ordered (non-signaling) \n
   1719 ///    0x08: Equal (unordered, non-signaling) \n
   1720 ///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
   1721 ///    0x0A: Not-greater-than (unordered, signaling) \n
   1722 ///    0x0B: False (ordered, non-signaling) \n
   1723 ///    0x0C: Not-equal (ordered, non-signaling) \n
   1724 ///    0x0D: Greater-than-or-equal (ordered, signaling) \n
   1725 ///    0x0E: Greater-than (ordered, signaling) \n
   1726 ///    0x0F: True (unordered, non-signaling) \n
   1727 ///    0x10: Equal (ordered, signaling) \n
   1728 ///    0x11: Less-than (ordered, non-signaling) \n
   1729 ///    0x12: Less-than-or-equal (ordered, non-signaling) \n
   1730 ///    0x13: Unordered (signaling) \n
   1731 ///    0x14: Not-equal (unordered, signaling) \n
   1732 ///    0x15: Not-less-than (unordered, non-signaling) \n
   1733 ///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
   1734 ///    0x17: Ordered (signaling) \n
   1735 ///    0x18: Equal (unordered, signaling) \n
   1736 ///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
   1737 ///    0x1A: Not-greater-than (unordered, non-signaling) \n
   1738 ///    0x1B: False (ordered, signaling) \n
   1739 ///    0x1C: Not-equal (ordered, signaling) \n
   1740 ///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
   1741 ///    0x1E: Greater-than (ordered, non-signaling) \n
   1742 ///    0x1F: True (unordered, signaling)
   1743 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1744 /// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
   1745 
   1746 /// Compares corresponding double-precision elements of two 256-bit
   1747 ///    vectors of [4 x double], using the predicate selected by the immediate
   1748 ///    integer operand (for 0x08-0x1F, see the _CMP_* macros in this header).
   1749 ///
   1750 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
   1751 ///    If either value in a comparison is NaN, comparisons that are ordered
   1752 ///    return false, and comparisons that are unordered return true.
   1753 ///
   1754 /// \headerfile <x86intrin.h>
   1755 ///
   1756 /// \code
   1757 /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
   1758 /// \endcode
   1759 ///
   1760 /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
   1761 ///
   1762 /// \param a
   1763 ///    A 256-bit vector of [4 x double].
   1764 /// \param b
   1765 ///    A 256-bit vector of [4 x double].
   1766 /// \param c
   1767 ///    An immediate integer operand (passed to the builtin without a cast),
   1768 ///    with bits [4:0] specifying which comparison operation to use: \n
   1769 ///    0x00: Equal (ordered, non-signaling) \n
   1770 ///    0x01: Less-than (ordered, signaling) \n
   1771 ///    0x02: Less-than-or-equal (ordered, signaling) \n
   1772 ///    0x03: Unordered (non-signaling) \n
   1773 ///    0x04: Not-equal (unordered, non-signaling) \n
   1774 ///    0x05: Not-less-than (unordered, signaling) \n
   1775 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
   1776 ///    0x07: Ordered (non-signaling) \n
   1777 ///    0x08: Equal (unordered, non-signaling) \n
   1778 ///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
   1779 ///    0x0A: Not-greater-than (unordered, signaling) \n
   1780 ///    0x0B: False (ordered, non-signaling) \n
   1781 ///    0x0C: Not-equal (ordered, non-signaling) \n
   1782 ///    0x0D: Greater-than-or-equal (ordered, signaling) \n
   1783 ///    0x0E: Greater-than (ordered, signaling) \n
   1784 ///    0x0F: True (unordered, non-signaling) \n
   1785 ///    0x10: Equal (ordered, signaling) \n
   1786 ///    0x11: Less-than (ordered, non-signaling) \n
   1787 ///    0x12: Less-than-or-equal (ordered, non-signaling) \n
   1788 ///    0x13: Unordered (signaling) \n
   1789 ///    0x14: Not-equal (unordered, signaling) \n
   1790 ///    0x15: Not-less-than (unordered, non-signaling) \n
   1791 ///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
   1792 ///    0x17: Ordered (signaling) \n
   1793 ///    0x18: Equal (unordered, signaling) \n
   1794 ///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
   1795 ///    0x1A: Not-greater-than (unordered, non-signaling) \n
   1796 ///    0x1B: False (ordered, signaling) \n
   1797 ///    0x1C: Not-equal (ordered, signaling) \n
   1798 ///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
   1799 ///    0x1E: Greater-than (ordered, non-signaling) \n
   1800 ///    0x1F: True (unordered, signaling)
   1801 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
   1802 #define _mm256_cmp_pd(a, b, c) \
   1803   ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
   1804                                     (__v4df)(__m256d)(b), (c)))
   1805 
   1806 /// Compares corresponding single-precision elements of two 256-bit
   1807 ///    vectors of [8 x float], using the predicate selected by the immediate
   1808 ///    integer operand (for 0x08-0x1F, see the _CMP_* macros in this header).
   1809 ///
   1810 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
   1811 ///    If either value in a comparison is NaN, comparisons that are ordered
   1812 ///    return false, and comparisons that are unordered return true.
   1813 ///
   1814 /// \headerfile <x86intrin.h>
   1815 ///
   1816 /// \code
   1817 /// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
   1818 /// \endcode
   1819 ///
   1820 /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
   1821 ///
   1822 /// \param a
   1823 ///    A 256-bit vector of [8 x float].
   1824 /// \param b
   1825 ///    A 256-bit vector of [8 x float].
   1826 /// \param c
   1827 ///    An immediate integer operand (passed to the builtin without a cast),
   1828 ///    with bits [4:0] specifying which comparison operation to use: \n
   1829 ///    0x00: Equal (ordered, non-signaling) \n
   1830 ///    0x01: Less-than (ordered, signaling) \n
   1831 ///    0x02: Less-than-or-equal (ordered, signaling) \n
   1832 ///    0x03: Unordered (non-signaling) \n
   1833 ///    0x04: Not-equal (unordered, non-signaling) \n
   1834 ///    0x05: Not-less-than (unordered, signaling) \n
   1835 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
   1836 ///    0x07: Ordered (non-signaling) \n
   1837 ///    0x08: Equal (unordered, non-signaling) \n
   1838 ///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
   1839 ///    0x0A: Not-greater-than (unordered, signaling) \n
   1840 ///    0x0B: False (ordered, non-signaling) \n
   1841 ///    0x0C: Not-equal (ordered, non-signaling) \n
   1842 ///    0x0D: Greater-than-or-equal (ordered, signaling) \n
   1843 ///    0x0E: Greater-than (ordered, signaling) \n
   1844 ///    0x0F: True (unordered, non-signaling) \n
   1845 ///    0x10: Equal (ordered, signaling) \n
   1846 ///    0x11: Less-than (ordered, non-signaling) \n
   1847 ///    0x12: Less-than-or-equal (ordered, non-signaling) \n
   1848 ///    0x13: Unordered (signaling) \n
   1849 ///    0x14: Not-equal (unordered, signaling) \n
   1850 ///    0x15: Not-less-than (unordered, non-signaling) \n
   1851 ///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
   1852 ///    0x17: Ordered (signaling) \n
   1853 ///    0x18: Equal (unordered, signaling) \n
   1854 ///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
   1855 ///    0x1A: Not-greater-than (unordered, non-signaling) \n
   1856 ///    0x1B: False (ordered, signaling) \n
   1857 ///    0x1C: Not-equal (ordered, signaling) \n
   1858 ///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
   1859 ///    0x1E: Greater-than (ordered, non-signaling) \n
   1860 ///    0x1F: True (unordered, signaling)
   1861 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
   1862 #define _mm256_cmp_ps(a, b, c) \
   1863   ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
   1864                                    (__v8sf)(__m256)(b), (c)))
   1865 
   1866 /* The intrinsic below is defined in emmintrin.h and can also be used with AVX. */
   1867 /// Compares each of the corresponding scalar double-precision values of
   1868 ///    two 128-bit vectors of [2 x double], using the operation specified by the
   1869 ///    immediate integer operand.
   1870 ///
   1871 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
   1872 ///    If either value in a comparison is NaN, comparisons that are ordered
   1873 ///    return false, and comparisons that are unordered return true.
   1874 ///
   1875 /// \headerfile <x86intrin.h>
   1876 ///
   1877 /// \code
   1878 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
   1879 /// \endcode
   1880 ///
   1881 /// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
   1882 ///
   1883 /// \param a
   1884 ///    A 128-bit vector of [2 x double].
   1885 /// \param b
   1886 ///    A 128-bit vector of [2 x double].
   1887 /// \param c
   1888 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1889 ///    operation to use: \n
   1890 ///    0x00: Equal (ordered, non-signaling) \n
   1891 ///    0x01: Less-than (ordered, signaling) \n
   1892 ///    0x02: Less-than-or-equal (ordered, signaling) \n
   1893 ///    0x03: Unordered (non-signaling) \n
   1894 ///    0x04: Not-equal (unordered, non-signaling) \n
   1895 ///    0x05: Not-less-than (unordered, signaling) \n
   1896 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
   1897 ///    0x07: Ordered (non-signaling) \n
   1898 ///    0x08: Equal (unordered, non-signaling) \n
   1899 ///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
   1900 ///    0x0A: Not-greater-than (unordered, signaling) \n
   1901 ///    0x0B: False (ordered, non-signaling) \n
   1902 ///    0x0C: Not-equal (ordered, non-signaling) \n
   1903 ///    0x0D: Greater-than-or-equal (ordered, signaling) \n
   1904 ///    0x0E: Greater-than (ordered, signaling) \n
   1905 ///    0x0F: True (unordered, non-signaling) \n
   1906 ///    0x10: Equal (ordered, signaling) \n
   1907 ///    0x11: Less-than (ordered, non-signaling) \n
   1908 ///    0x12: Less-than-or-equal (ordered, non-signaling) \n
   1909 ///    0x13: Unordered (signaling) \n
   1910 ///    0x14: Not-equal (unordered, signaling) \n
   1911 ///    0x15: Not-less-than (unordered, non-signaling) \n
   1912 ///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
   1913 ///    0x17: Ordered (signaling) \n
   1914 ///    0x18: Equal (unordered, signaling) \n
   1915 ///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
   1916 ///    0x1A: Not-greater-than (unordered, non-signaling) \n
   1917 ///    0x1B: False (ordered, signaling) \n
   1918 ///    0x1C: Not-equal (ordered, signaling) \n
   1919 ///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
   1920 ///    0x1E: Greater-than (ordered, non-signaling) \n
   1921 ///    0x1F: True (unordered, signaling)
   1922 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
   1923 /// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
   1924 
   1925 /* The intrinsic below is defined in xmmintrin.h and can also be used with AVX. */
   1926 /// Compares each of the corresponding scalar values of two 128-bit
   1927 ///    vectors of [4 x float], using the operation specified by the immediate
   1928 ///    integer operand.
   1929 ///
   1930 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
   1931 ///    If either value in a comparison is NaN, comparisons that are ordered
   1932 ///    return false, and comparisons that are unordered return true.
   1933 ///
   1934 /// \headerfile <x86intrin.h>
   1935 ///
   1936 /// \code
   1937 /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
   1938 /// \endcode
   1939 ///
   1940 /// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
   1941 ///
   1942 /// \param a
   1943 ///    A 128-bit vector of [4 x float].
   1944 /// \param b
   1945 ///    A 128-bit vector of [4 x float].
   1946 /// \param c
   1947 ///    An immediate integer operand, with bits [4:0] specifying which comparison
   1948 ///    operation to use: \n
   1949 ///    0x00: Equal (ordered, non-signaling) \n
   1950 ///    0x01: Less-than (ordered, signaling) \n
   1951 ///    0x02: Less-than-or-equal (ordered, signaling) \n
   1952 ///    0x03: Unordered (non-signaling) \n
   1953 ///    0x04: Not-equal (unordered, non-signaling) \n
   1954 ///    0x05: Not-less-than (unordered, signaling) \n
   1955 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
   1956 ///    0x07: Ordered (non-signaling) \n
   1957 ///    0x08: Equal (unordered, non-signaling) \n
   1958 ///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
   1959 ///    0x0A: Not-greater-than (unordered, signaling) \n
   1960 ///    0x0B: False (ordered, non-signaling) \n
   1961 ///    0x0C: Not-equal (ordered, non-signaling) \n
   1962 ///    0x0D: Greater-than-or-equal (ordered, signaling) \n
   1963 ///    0x0E: Greater-than (ordered, signaling) \n
   1964 ///    0x0F: True (unordered, non-signaling) \n
   1965 ///    0x10: Equal (ordered, signaling) \n
   1966 ///    0x11: Less-than (ordered, non-signaling) \n
   1967 ///    0x12: Less-than-or-equal (ordered, non-signaling) \n
   1968 ///    0x13: Unordered (signaling) \n
   1969 ///    0x14: Not-equal (unordered, signaling) \n
   1970 ///    0x15: Not-less-than (unordered, non-signaling) \n
   1971 ///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
   1972 ///    0x17: Ordered (signaling) \n
   1973 ///    0x18: Equal (unordered, signaling) \n
   1974 ///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
   1975 ///    0x1A: Not-greater-than (unordered, non-signaling) \n
   1976 ///    0x1B: False (ordered, signaling) \n
   1977 ///    0x1C: Not-equal (ordered, signaling) \n
   1978 ///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
   1979 ///    0x1E: Greater-than (ordered, non-signaling) \n
   1980 ///    0x1F: True (unordered, signaling)
   1981 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
   1982 /// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
   1983 
   1984 /// Takes a 256-bit integer vector of [8 x i32] and returns the vector
   1985 ///    element value indexed by the immediate constant operand.
   1986 ///
   1987 /// \headerfile <x86intrin.h>
   1988 ///
   1989 /// \code
   1990 /// int _mm256_extract_epi32(__m256i X, const int N);
   1991 /// \endcode
   1992 ///
   1993 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   1994 ///   instruction.
   1995 ///
   1996 /// \param X
   1997 ///    A 256-bit integer vector of [8 x i32].
   1998 /// \param N
   1999 ///    An immediate integer operand with bits [2:0] determining which vector
   2000 ///    element is extracted and returned.
   2001 /// \returns A 32-bit integer containing the value of the selected 32-bit
   2002 ///    vector element.
   2003 #define _mm256_extract_epi32(X, N) \
   2004   ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
   2005 
   2006 /// Takes a 256-bit integer vector of [16 x i16] and returns the vector
   2007 ///    element value indexed by the immediate constant operand.
   2008 ///
   2009 /// \headerfile <x86intrin.h>
   2010 ///
   2011 /// \code
   2012 /// int _mm256_extract_epi16(__m256i X, const int N);
   2013 /// \endcode
   2014 ///
   2015 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2016 ///   instruction.
   2017 ///
   2018 /// \param X
   2019 ///    A 256-bit integer vector of [16 x i16].
   2020 /// \param N
   2021 ///    An immediate integer operand with bits [3:0] determining which vector
   2022 ///    element is extracted and returned.
   2023 /// \returns A 32-bit integer containing the selected 16-bit vector element,
   2024 ///    zero-extended to 32 bits.
   2025 #define _mm256_extract_epi16(X, N) \
   2026   ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
   2027                                                      (int)(N)))
   2028 
   2029 /// Takes a 256-bit integer vector of [32 x i8] and returns the vector
   2030 ///    element value indexed by the immediate constant operand.
   2031 ///
   2032 /// \headerfile <x86intrin.h>
   2033 ///
   2034 /// \code
   2035 /// int _mm256_extract_epi8(__m256i X, const int N);
   2036 /// \endcode
   2037 ///
   2038 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2039 ///   instruction.
   2040 ///
   2041 /// \param X
   2042 ///    A 256-bit integer vector of [32 x i8].
   2043 /// \param N
   2044 ///    An immediate integer operand with bits [4:0] determining which vector
   2045 ///    element is extracted and returned.
   2046 /// \returns A 32-bit integer containing the selected 8-bit vector element,
   2047 ///    zero-extended to 32 bits.
   2048 #define _mm256_extract_epi8(X, N) \
   2049   ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
   2050                                                     (int)(N)))
   2051 
   2052 #ifdef __x86_64__
   2053 /// Takes a 256-bit integer vector of [4 x i64] and returns the vector element
   2054 ///    value indexed by the immediate constant operand. Only defined when
   2055 ///    __x86_64__ is defined.
   2056 /// \headerfile <x86intrin.h>
   2057 ///
   2058 /// \code
   2059 /// long long _mm256_extract_epi64(__m256i X, const int N);
   2060 /// \endcode
   2061 ///
   2062 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
   2063 ///   instruction.
   2064 ///
   2065 /// \param X
   2066 ///    A 256-bit integer vector of [4 x i64].
   2067 /// \param N
   2068 ///    An immediate integer operand with bits [1:0] determining which vector
   2069 ///    element is extracted and returned.
   2070 /// \returns A 64-bit integer containing the value of the selected 64-bit
   2071 ///    vector element.
   2072 #define _mm256_extract_epi64(X, N) \
   2073   ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
   2074 #endif
   2075 
   2076 /// Takes a 256-bit integer vector of [8 x i32] and replaces the vector element
   2077 ///    value indexed by the immediate constant operand with a new value. Returns
   2078 ///    the modified vector.
   2079 ///
   2080 /// \headerfile <x86intrin.h>
   2081 ///
   2082 /// \code
   2083 /// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
   2084 /// \endcode
   2085 ///
   2086 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2087 ///   instruction.
   2088 ///
   2089 /// \param X
   2090 ///    A 256-bit integer vector of [8 x i32] to be used by the insert operation.
   2091 /// \param I
   2092 ///    An integer value. The replacement value for the insert operation.
   2093 /// \param N
   2094 ///    An immediate integer specifying the index of the vector element to be
   2095 ///    replaced.
   2096 /// \returns A copy of vector \a X, after replacing its element indexed by
   2097 ///    \a N with \a I.
   2098 #define _mm256_insert_epi32(X, I, N) \
   2099   ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
   2100                                         (int)(I), (int)(N)))
   2101 
   2102 
   2103 /// Takes a 256-bit integer vector of [16 x i16] and replaces the vector
   2104 ///    element value indexed by the immediate constant operand with a new value.
   2105 ///    Returns the modified vector.
   2106 ///
   2107 /// \headerfile <x86intrin.h>
   2108 ///
   2109 /// \code
   2110 /// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
   2111 /// \endcode
   2112 ///
   2113 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2114 ///   instruction.
   2115 ///
   2116 /// \param X
   2117 ///    A 256-bit integer vector of [16 x i16] to be used by the insert operation.
   2118 /// \param I
   2119 ///    An i16 integer value. The replacement value for the insert operation.
   2120 /// \param N
   2121 ///    An immediate integer specifying the index of the vector element to be
   2122 ///    replaced.
   2123 /// \returns A copy of vector \a X, after replacing its element indexed by
   2124 ///    \a N with \a I.
   2125 #define _mm256_insert_epi16(X, I, N) \
   2126   ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
   2127                                          (int)(I), (int)(N)))
   2128 
   2129 /// Takes a 256-bit integer vector of [32 x i8] and replaces the vector
   2130 ///    element value indexed by the immediate constant operand with a new value.
   2131 ///    Returns the modified vector.
   2132 ///
   2133 /// \headerfile <x86intrin.h>
   2134 ///
   2135 /// \code
   2136 /// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
   2137 /// \endcode
   2138 ///
   2139 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2140 ///   instruction.
   2141 ///
   2142 /// \param X
   2143 ///    A 256-bit integer vector of [32 x i8] to be used by the insert operation.
   2144 /// \param I
   2145 ///    An i8 integer value. The replacement value for the insert operation.
   2146 /// \param N
   2147 ///    An immediate integer specifying the index of the vector element to be
   2148 ///    replaced.
   2149 /// \returns A copy of vector \a X, after replacing its element indexed by
   2150 ///    \a N with \a I.
   2151 #define _mm256_insert_epi8(X, I, N) \
   2152   ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
   2153                                          (int)(I), (int)(N)))
   2154 
   2155 #ifdef __x86_64__
   2156 /// Takes a 256-bit integer vector of [4 x i64] and replaces the vector element
   2157 ///    value indexed by the immediate constant operand with a new value. Returns
   2158 ///    the modified vector. Only defined when __x86_64__ is defined.
   2159 ///
   2160 /// \headerfile <x86intrin.h>
   2161 ///
   2162 /// \code
   2163 /// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
   2164 /// \endcode
   2165 ///
   2166 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
   2167 ///   instruction.
   2168 ///
   2169 /// \param X
   2170 ///    A 256-bit integer vector of [4 x i64] to be used by the insert operation.
   2171 /// \param I
   2172 ///    A 64-bit integer value. The replacement value for the insert operation.
   2173 /// \param N
   2174 ///    An immediate integer specifying the index of the vector element to be
   2175 ///    replaced.
   2176 /// \returns A copy of vector \a X, after replacing its element indexed by
   2177 ///    \a N with \a I.
   2178 #define _mm256_insert_epi64(X, I, N) \
   2179   ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
   2180                                         (long long)(I), (int)(N)))
   2181 #endif
   2182 
   2183 /* Conversion */
   2184 /// Converts a 128-bit vector of [4 x i32] into a 256-bit vector of [4 x double].
   2185 ///
   2186 /// \headerfile <x86intrin.h>
   2187 ///
   2188 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
   2189 ///
   2190 /// \param __a
   2191 ///    A 128-bit integer vector of [4 x i32].
   2192 /// \returns A 256-bit vector of [4 x double] containing the converted values.
   2193 static __inline __m256d __DEFAULT_FN_ATTRS
   2194 _mm256_cvtepi32_pd(__m128i __a)
   2195 {
   2196   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
   2197 }
   2198 
   2199 /// Converts a 256-bit vector of [8 x i32] into a 256-bit vector of [8 x float].
   2200 ///
   2201 /// \headerfile <x86intrin.h>
   2202 ///
   2203 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
   2204 ///
   2205 /// \param __a
   2206 ///    A 256-bit integer vector of [8 x i32].
   2207 /// \returns A 256-bit vector of [8 x float] containing the converted values.
   2208 static __inline __m256 __DEFAULT_FN_ATTRS
   2209 _mm256_cvtepi32_ps(__m256i __a)
   2210 {
   2211   return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
   2212 }
   2213 
   2214 /// Converts the four elements of a 256-bit vector of [4 x double] into a
   2215 ///    128-bit vector of [4 x float].
   2216 ///
   2217 /// \headerfile <x86intrin.h>
   2218 ///
   2219 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
   2220 ///
   2221 /// \param __a
   2222 ///    A 256-bit vector of [4 x double].
   2223 /// \returns A 128-bit vector of [4 x float] containing the converted values.
   2224 static __inline __m128 __DEFAULT_FN_ATTRS
   2225 _mm256_cvtpd_ps(__m256d __a)
   2226 {
   2227   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
   2228 }
   2229 
   2230 /// Converts a 256-bit vector of [8 x float] into a 256-bit vector of [8 x i32].
   2231 ///
   2232 ///    If a converted value does not fit in a 32-bit integer, raises a
   2233 ///    floating-point invalid exception. If the exception is masked, returns
   2234 ///    the most negative integer.
   2235 ///
   2236 /// \headerfile <x86intrin.h>
   2237 ///
   2238 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
   2239 ///
   2240 /// \param __a
   2241 ///    A 256-bit vector of [8 x float].
   2242 /// \returns A 256-bit integer vector of [8 x i32] containing the converted values.
   2243 static __inline __m256i __DEFAULT_FN_ATTRS
   2244 _mm256_cvtps_epi32(__m256 __a)
   2245 {
   2246   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
   2247 }
   2248 
   2249 /// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
   2250 ///    [4 x double].
   2251 ///
   2252 /// \headerfile <x86intrin.h>
   2253 ///
   2254 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
   2255 ///
   2256 /// \param __a
   2257 ///    A 128-bit vector of [4 x float].
   2258 /// \returns A 256-bit vector of [4 x double] containing the converted values.
   2259 static __inline __m256d __DEFAULT_FN_ATTRS
   2260 _mm256_cvtps_pd(__m128 __a)
   2261 {
   2262   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
   2263 }
   2264 
   2265 /// Converts a 256-bit vector of [4 x double] into four signed truncated
   2266 ///    (rounded toward zero) 32-bit integers returned in a 128-bit vector of
   2267 ///    [4 x i32].
   2268 ///
   2269 ///    If a converted value does not fit in a 32-bit integer, raises a
   2270 ///    floating-point invalid exception. If the exception is masked, returns
   2271 ///    the most negative integer.
   2272 ///
   2273 /// \headerfile <x86intrin.h>
   2274 ///
   2275 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
   2276 ///
   2277 /// \param __a
   2278 ///    A 256-bit vector of [4 x double].
   2279 /// \returns A 128-bit integer vector of [4 x i32] containing the converted values.
   2280 static __inline __m128i __DEFAULT_FN_ATTRS
   2281 _mm256_cvttpd_epi32(__m256d __a)
   2282 {
   2283   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
   2284 }
   2285 
   2286 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
   2287 ///    [4 x i32].
   2288 ///
   2289 ///    If a converted value does not fit in a 32-bit integer, raises a
   2290 ///    floating-point invalid exception. If the exception is masked, returns
   2291 ///    the most negative integer.
   2292 ///
   2293 /// \headerfile <x86intrin.h>
   2294 ///
   2295 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
   2296 ///
   2297 /// \param __a
   2298 ///    A 256-bit vector of [4 x double].
   2299 /// \returns A 128-bit integer vector of [4 x i32] containing the converted values.
   2300 static __inline __m128i __DEFAULT_FN_ATTRS
   2301 _mm256_cvtpd_epi32(__m256d __a)
   2302 {
   2303   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
   2304 }
   2305 
   2306 /// Converts a 256-bit vector of [8 x float] into eight signed truncated (rounded
   2307 ///    toward zero) 32-bit integers returned in a 256-bit vector of [8 x i32].
   2308 ///
   2309 ///    If a converted value does not fit in a 32-bit integer, raises a
   2310 ///    floating-point invalid exception. If the exception is masked, returns
   2311 ///    the most negative integer.
   2312 ///
   2313 /// \headerfile <x86intrin.h>
   2314 ///
   2315 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
   2316 ///
   2317 /// \param __a
   2318 ///    A 256-bit vector of [8 x float].
   2319 /// \returns A 256-bit integer vector of [8 x i32] containing the converted values.
   2320 static __inline __m256i __DEFAULT_FN_ATTRS
   2321 _mm256_cvttps_epi32(__m256 __a)
   2322 {
   2323   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
   2324 }
   2325 
   2326 /// Returns the first element of the input vector of [4 x double].
   2327 ///
   2328 /// \headerfile <x86intrin.h>
   2329 ///
   2330 /// This intrinsic is a utility function and does not correspond to a specific
   2331 ///    instruction.
   2332 ///
   2333 /// \param __a
   2334 ///    A 256-bit vector of [4 x double].
   2335 /// \returns A 64-bit double containing the first element of the input vector.
   2336 static __inline double __DEFAULT_FN_ATTRS
   2337 _mm256_cvtsd_f64(__m256d __a)
   2338 {
   2339  return __a[0];
   2340 }
   2341 
   2342 /// Returns the first element of the input vector of [8 x i32].
   2343 ///
   2344 /// \headerfile <x86intrin.h>
   2345 ///
   2346 /// This intrinsic is a utility function and does not correspond to a specific
   2347 ///    instruction.
   2348 ///
   2349 /// \param __a
   2350 ///    A 256-bit vector of [8 x i32].
   2351 /// \returns A 32-bit integer containing the first element of the input vector.
   2352 static __inline int __DEFAULT_FN_ATTRS
   2353 _mm256_cvtsi256_si32(__m256i __a)
   2354 {
   2355  __v8si __b = (__v8si)__a; /* __m256i has long long elements; view as [8 x i32] */
   2356  return __b[0];
   2357 }
   2358 
   2359 /// Returns the first element of the input vector of [8 x float].
   2360 ///
   2361 /// \headerfile <x86intrin.h>
   2362 ///
   2363 /// This intrinsic is a utility function and does not correspond to a specific
   2364 ///    instruction.
   2365 ///
   2366 /// \param __a
   2367 ///    A 256-bit vector of [8 x float].
   2368 /// \returns A 32-bit float containing the first element of the input vector.
   2369 static __inline float __DEFAULT_FN_ATTRS
   2370 _mm256_cvtss_f32(__m256 __a)
   2371 {
   2372  return __a[0];
   2373 }
   2374 
   2375 /* Vector replicate */
   2376 /// Duplicates each odd-indexed element of a 256-bit vector of [8 x float], so
   2377 ///    that result elements 2i and 2i+1 both hold source element 2i+1.
   2378 ///
   2379 /// \headerfile <x86intrin.h>
   2380 ///
   2381 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
   2382 ///
   2383 /// \param __a
   2384 ///    A 256-bit vector of [8 x float]. \n
   2385 ///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
   2386 ///    the return value. \n
   2387 ///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
   2388 ///    the return value. \n
   2389 ///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
   2390 ///    return value. \n
   2391 ///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
   2392 ///    return value.
   2393 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
   2394 ///    values.
   2395 static __inline __m256 __DEFAULT_FN_ATTRS
   2396 _mm256_movehdup_ps(__m256 __a)
   2397 {
   2398   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
   2399 }
   2400 
   2401 /// Duplicates each even-indexed element of a 256-bit vector of [8 x float], so
   2402 ///    that result elements 2i and 2i+1 both hold source element 2i.
   2403 ///
   2404 /// \headerfile <x86intrin.h>
   2405 ///
   2406 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
   2407 ///
   2408 /// \param __a
   2409 ///    A 256-bit vector of [8 x float]. \n
   2410 ///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
   2411 ///    the return value. \n
   2412 ///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
   2413 ///    the return value. \n
   2414 ///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
   2415 ///    return value. \n
   2416 ///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
   2417 ///    return value.
   2418 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
   2419 ///    values.
   2420 static __inline __m256 __DEFAULT_FN_ATTRS
   2421 _mm256_moveldup_ps(__m256 __a)
   2422 {
   2423   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
   2424 }
   2425 
   2426 /// Duplicates the even-indexed double-precision elements of a 256-bit
   2427 ///    vector of [4 x double], so that result elements 2i and 2i+1 both hold
   2428 ///    source element 2i.
   2429 ///
   2430 /// \headerfile <x86intrin.h>
   2431 ///
   2432 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
   2433 ///
   2434 /// \param __a
   2435 ///    A 256-bit vector of [4 x double]. \n
   2436 ///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
   2437 ///    return value. \n
   2438 ///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
   2439 ///    the return value.
   2440 /// \returns A 256-bit vector of [4 x double] containing the moved and
   2441 ///    duplicated values.
   2442 static __inline __m256d __DEFAULT_FN_ATTRS
   2443 _mm256_movedup_pd(__m256d __a)
   2444 {
   2445   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
   2446 }
   2447 
   2448 /* Unpack and Interleave */
   2449 /// Unpacks the odd-indexed vector elements from two 256-bit vectors of
   2450 ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
   2451 ///
   2452 /// \headerfile <x86intrin.h>
   2453 ///
   2454 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
   2455 ///
   2456 /// \param __a
   2457 ///    A 256-bit floating-point vector of [4 x double]. \n
   2458 ///    Bits [127:64] are written to bits [63:0] of the return value. \n
   2459 ///    Bits [255:192] are written to bits [191:128] of the return value.
   2460 /// \param __b
   2461 ///    A 256-bit floating-point vector of [4 x double]. \n
   2462 ///    Bits [127:64] are written to bits [127:64] of the return value. \n
   2463 ///    Bits [255:192] are written to bits [255:192] of the return value.
   2464 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
   2465 static __inline __m256d __DEFAULT_FN_ATTRS
   2466 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
   2467 {
   2468   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
   2469 }
   2470 
   2471 /// Unpacks the even-indexed vector elements from two 256-bit vectors of
   2472 ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
   2473 ///
   2474 /// \headerfile <x86intrin.h>
   2475 ///
   2476 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
   2477 ///
   2478 /// \param __a
   2479 ///    A 256-bit floating-point vector of [4 x double]. \n
   2480 ///    Bits [63:0] are written to bits [63:0] of the return value. \n
   2481 ///    Bits [191:128] are written to bits [191:128] of the return value.
   2482 /// \param __b
   2483 ///    A 256-bit floating-point vector of [4 x double]. \n
   2484 ///    Bits [63:0] are written to bits [127:64] of the return value. \n
   2485 ///    Bits [191:128] are written to bits [255:192] of the return value.
   2486 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
   2487 static __inline __m256d __DEFAULT_FN_ATTRS
   2488 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
   2489 {
   2490   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
   2491 }
   2492 
   2493 /// Unpacks the 32-bit vector elements 2, 3, 6 and 7 (the upper two elements
   2494 ///    of each 128-bit half) from each of the two 256-bit vectors of
   2495 ///    [8 x float] and interleaves them into a 256-bit vector of [8 x float].
   2496 ///
   2497 /// \headerfile <x86intrin.h>
   2498 ///
   2499 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
   2500 ///
   2501 /// \param __a
   2502 ///    A 256-bit vector of [8 x float]. \n
   2503 ///    Bits [95:64] are written to bits [31:0] of the return value. \n
   2504 ///    Bits [127:96] are written to bits [95:64] of the return value. \n
   2505 ///    Bits [223:192] are written to bits [159:128] of the return value. \n
   2506 ///    Bits [255:224] are written to bits [223:192] of the return value.
   2507 /// \param __b
   2508 ///    A 256-bit vector of [8 x float]. \n
   2509 ///    Bits [95:64] are written to bits [63:32] of the return value. \n
   2510 ///    Bits [127:96] are written to bits [127:96] of the return value. \n
   2511 ///    Bits [223:192] are written to bits [191:160] of the return value. \n
   2512 ///    Bits [255:224] are written to bits [255:224] of the return value.
   2513 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
   2514 static __inline __m256 __DEFAULT_FN_ATTRS
   2515 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
   2516 {
   2517   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
   2518 }
   2519 
   2520 /// Unpacks the 32-bit vector elements 0, 1, 4 and 5 (the lower two elements
   2521 ///    of each 128-bit half) from each of the two 256-bit vectors of
   2522 ///    [8 x float] and interleaves them into a 256-bit vector of [8 x float].
   2523 ///
   2524 /// \headerfile <x86intrin.h>
   2525 ///
   2526 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
   2527 ///
   2528 /// \param __a
   2529 ///    A 256-bit vector of [8 x float]. \n
   2530 ///    Bits [31:0] are written to bits [31:0] of the return value. \n
   2531 ///    Bits [63:32] are written to bits [95:64] of the return value. \n
   2532 ///    Bits [159:128] are written to bits [159:128] of the return value. \n
   2533 ///    Bits [191:160] are written to bits [223:192] of the return value.
   2534 /// \param __b
   2535 ///    A 256-bit vector of [8 x float]. \n
   2536 ///    Bits [31:0] are written to bits [63:32] of the return value. \n
   2537 ///    Bits [63:32] are written to bits [127:96] of the return value. \n
   2538 ///    Bits [159:128] are written to bits [191:160] of the return value. \n
   2539 ///    Bits [191:160] are written to bits [255:224] of the return value.
   2540 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
   2541 static __inline __m256 __DEFAULT_FN_ATTRS
   2542 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
   2543 {
   2544   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
   2545 }
   2546 
   2547 /* Bit Test */
   2548 /// Given two 128-bit floating-point vectors of [2 x double], performs an
   2549 ///    element-by-element comparison of the double-precision element in the
   2550 ///    first source vector and the corresponding element in the second source
   2551 ///    vector.
   2552 ///
   2553 ///    The EFLAGS register is updated as follows: \n
   2554 ///    If there is at least one pair of double-precision elements where the
   2555 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2556 ///    ZF flag is set to 1. \n
   2557 ///    If there is at least one pair of double-precision elements where the
   2558 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2559 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2560 ///    This intrinsic returns the value of the ZF flag.
   2561 ///
   2562 /// \headerfile <x86intrin.h>
   2563 ///
   2564 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2565 ///
   2566 /// \param __a
   2567 ///    A 128-bit vector of [2 x double].
   2568 /// \param __b
   2569 ///    A 128-bit vector of [2 x double].
   2570 /// \returns the ZF flag in the EFLAGS register.
   2571 static __inline int __DEFAULT_FN_ATTRS128
   2572 _mm_testz_pd(__m128d __a, __m128d __b)
   2573 {
   2574   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
   2575 }
   2576 
   2577 /// Given two 128-bit floating-point vectors of [2 x double], performs an
   2578 ///    element-by-element comparison of the double-precision element in the
   2579 ///    first source vector and the corresponding element in the second source
   2580 ///    vector.
   2581 ///
   2582 ///    The EFLAGS register is updated as follows: \n
   2583 ///    If there is at least one pair of double-precision elements where the
   2584 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2585 ///    ZF flag is set to 1. \n
   2586 ///    If there is at least one pair of double-precision elements where the
   2587 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2588 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2589 ///    This intrinsic returns the value of the CF flag.
   2590 ///
   2591 /// \headerfile <x86intrin.h>
   2592 ///
   2593 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2594 ///
   2595 /// \param __a
   2596 ///    A 128-bit vector of [2 x double].
   2597 /// \param __b
   2598 ///    A 128-bit vector of [2 x double].
   2599 /// \returns the CF flag in the EFLAGS register.
   2600 static __inline int __DEFAULT_FN_ATTRS128
   2601 _mm_testc_pd(__m128d __a, __m128d __b)
   2602 {
   2603   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
   2604 }
   2605 
   2606 /// Given two 128-bit floating-point vectors of [2 x double], performs an
   2607 ///    element-by-element comparison of the double-precision element in the
   2608 ///    first source vector and the corresponding element in the second source
   2609 ///    vector.
   2610 ///
   2611 ///    The EFLAGS register is updated as follows: \n
   2612 ///    If there is at least one pair of double-precision elements where the
   2613 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2614 ///    ZF flag is set to 1. \n
   2615 ///    If there is at least one pair of double-precision elements where the
   2616 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2617 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2618 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2619 ///    otherwise it returns 0.
   2620 ///
   2621 /// \headerfile <x86intrin.h>
   2622 ///
   2623 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2624 ///
   2625 /// \param __a
   2626 ///    A 128-bit vector of [2 x double].
   2627 /// \param __b
   2628 ///    A 128-bit vector of [2 x double].
   2629 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2630 static __inline int __DEFAULT_FN_ATTRS128
   2631 _mm_testnzc_pd(__m128d __a, __m128d __b)
   2632 {
   2633   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
   2634 }
   2635 
   2636 /// Given two 128-bit floating-point vectors of [4 x float], performs an
   2637 ///    element-by-element comparison of the single-precision element in the
   2638 ///    first source vector and the corresponding element in the second source
   2639 ///    vector.
   2640 ///
   2641 ///    The EFLAGS register is updated as follows: \n
   2642 ///    If there is at least one pair of single-precision elements where the
   2643 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2644 ///    ZF flag is set to 1. \n
   2645 ///    If there is at least one pair of single-precision elements where the
   2646 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2647 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2648 ///    This intrinsic returns the value of the ZF flag.
   2649 ///
   2650 /// \headerfile <x86intrin.h>
   2651 ///
   2652 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2653 ///
   2654 /// \param __a
   2655 ///    A 128-bit vector of [4 x float].
   2656 /// \param __b
   2657 ///    A 128-bit vector of [4 x float].
   2658 /// \returns the ZF flag in the EFLAGS register.
   2659 static __inline int __DEFAULT_FN_ATTRS128
   2660 _mm_testz_ps(__m128 __a, __m128 __b)
   2661 {
   2662   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
   2663 }
   2664 
   2665 /// Given two 128-bit floating-point vectors of [4 x float], perform an
   2666 ///    element-by-element comparison of the single-precision element in the
   2667 ///    first source vector and the corresponding element in the second source
   2668 ///    vector.
   2669 ///
   2670 ///    The EFLAGS register is updated as follows: \n
   2671 ///    If there is at least one pair of single-precision elements where the
   2672 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2673 ///    ZF flag is set to 1. \n
   2674 ///    If there is at least one pair of single-precision elements where the
   2675 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2676 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2677 ///    This intrinsic returns the value of the CF flag.
   2678 ///
   2679 /// \headerfile <x86intrin.h>
   2680 ///
   2681 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2682 ///
   2683 /// \param __a
   2684 ///    A 128-bit vector of [4 x float].
   2685 /// \param __b
   2686 ///    A 128-bit vector of [4 x float].
   2687 /// \returns the CF flag.
   2688 static __inline int __DEFAULT_FN_ATTRS128
   2689 _mm_testc_ps(__m128 __a, __m128 __b)
   2690 {
   2691   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
   2692 }
   2693 
   2694 /// Given two 128-bit floating-point vectors of [4 x float], perform an
   2695 ///    element-by-element comparison of the single-precision element in the
   2696 ///    first source vector and the corresponding element in the second source
   2697 ///    vector.
   2698 ///
   2699 ///    The EFLAGS register is updated as follows: \n
   2700 ///    If there is at least one pair of single-precision elements where the
   2701 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2702 ///    ZF flag is set to 1. \n
   2703 ///    If there is at least one pair of single-precision elements where the
   2704 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2705 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2706 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2707 ///    otherwise it returns 0.
   2708 ///
   2709 /// \headerfile <x86intrin.h>
   2710 ///
   2711 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2712 ///
   2713 /// \param __a
   2714 ///    A 128-bit vector of [4 x float].
   2715 /// \param __b
   2716 ///    A 128-bit vector of [4 x float].
   2717 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2718 static __inline int __DEFAULT_FN_ATTRS128
   2719 _mm_testnzc_ps(__m128 __a, __m128 __b)
   2720 {
   2721   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
   2722 }
   2723 
   2724 /// Given two 256-bit floating-point vectors of [4 x double], perform an
   2725 ///    element-by-element comparison of the double-precision elements in the
   2726 ///    first source vector and the corresponding elements in the second source
   2727 ///    vector.
   2728 ///
   2729 ///    The EFLAGS register is updated as follows: \n
   2730 ///    If there is at least one pair of double-precision elements where the
   2731 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2732 ///    ZF flag is set to 1. \n
   2733 ///    If there is at least one pair of double-precision elements where the
   2734 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2735 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2736 ///    This intrinsic returns the value of the ZF flag.
   2737 ///
   2738 /// \headerfile <x86intrin.h>
   2739 ///
   2740 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2741 ///
   2742 /// \param __a
   2743 ///    A 256-bit vector of [4 x double].
   2744 /// \param __b
   2745 ///    A 256-bit vector of [4 x double].
   2746 /// \returns the ZF flag.
   2747 static __inline int __DEFAULT_FN_ATTRS
   2748 _mm256_testz_pd(__m256d __a, __m256d __b)
   2749 {
   2750   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
   2751 }
   2752 
   2753 /// Given two 256-bit floating-point vectors of [4 x double], perform an
   2754 ///    element-by-element comparison of the double-precision elements in the
   2755 ///    first source vector and the corresponding elements in the second source
   2756 ///    vector.
   2757 ///
   2758 ///    The EFLAGS register is updated as follows: \n
   2759 ///    If there is at least one pair of double-precision elements where the
   2760 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2761 ///    ZF flag is set to 1. \n
   2762 ///    If there is at least one pair of double-precision elements where the
   2763 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2764 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2765 ///    This intrinsic returns the value of the CF flag.
   2766 ///
   2767 /// \headerfile <x86intrin.h>
   2768 ///
   2769 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2770 ///
   2771 /// \param __a
   2772 ///    A 256-bit vector of [4 x double].
   2773 /// \param __b
   2774 ///    A 256-bit vector of [4 x double].
   2775 /// \returns the CF flag.
   2776 static __inline int __DEFAULT_FN_ATTRS
   2777 _mm256_testc_pd(__m256d __a, __m256d __b)
   2778 {
   2779   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
   2780 }
   2781 
   2782 /// Given two 256-bit floating-point vectors of [4 x double], perform an
   2783 ///    element-by-element comparison of the double-precision elements in the
   2784 ///    first source vector and the corresponding elements in the second source
   2785 ///    vector.
   2786 ///
   2787 ///    The EFLAGS register is updated as follows: \n
   2788 ///    If there is at least one pair of double-precision elements where the
   2789 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2790 ///    ZF flag is set to 1. \n
   2791 ///    If there is at least one pair of double-precision elements where the
   2792 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2793 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2794 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2795 ///    otherwise it returns 0.
   2796 ///
   2797 /// \headerfile <x86intrin.h>
   2798 ///
   2799 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
   2800 ///
   2801 /// \param __a
   2802 ///    A 256-bit vector of [4 x double].
   2803 /// \param __b
   2804 ///    A 256-bit vector of [4 x double].
   2805 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2806 static __inline int __DEFAULT_FN_ATTRS
   2807 _mm256_testnzc_pd(__m256d __a, __m256d __b)
   2808 {
   2809   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
   2810 }
   2811 
   2812 /// Given two 256-bit floating-point vectors of [8 x float], perform an
   2813 ///    element-by-element comparison of the single-precision element in the
   2814 ///    first source vector and the corresponding element in the second source
   2815 ///    vector.
   2816 ///
   2817 ///    The EFLAGS register is updated as follows: \n
   2818 ///    If there is at least one pair of single-precision elements where the
   2819 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2820 ///    ZF flag is set to 1. \n
   2821 ///    If there is at least one pair of single-precision elements where the
   2822 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2823 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2824 ///    This intrinsic returns the value of the ZF flag.
   2825 ///
   2826 /// \headerfile <x86intrin.h>
   2827 ///
   2828 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2829 ///
   2830 /// \param __a
   2831 ///    A 256-bit vector of [8 x float].
   2832 /// \param __b
   2833 ///    A 256-bit vector of [8 x float].
   2834 /// \returns the ZF flag.
   2835 static __inline int __DEFAULT_FN_ATTRS
   2836 _mm256_testz_ps(__m256 __a, __m256 __b)
   2837 {
   2838   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
   2839 }
   2840 
   2841 /// Given two 256-bit floating-point vectors of [8 x float], perform an
   2842 ///    element-by-element comparison of the single-precision element in the
   2843 ///    first source vector and the corresponding element in the second source
   2844 ///    vector.
   2845 ///
   2846 ///    The EFLAGS register is updated as follows: \n
   2847 ///    If there is at least one pair of single-precision elements where the
   2848 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2849 ///    ZF flag is set to 1. \n
   2850 ///    If there is at least one pair of single-precision elements where the
   2851 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2852 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2853 ///    This intrinsic returns the value of the CF flag.
   2854 ///
   2855 /// \headerfile <x86intrin.h>
   2856 ///
   2857 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2858 ///
   2859 /// \param __a
   2860 ///    A 256-bit vector of [8 x float].
   2861 /// \param __b
   2862 ///    A 256-bit vector of [8 x float].
   2863 /// \returns the CF flag.
   2864 static __inline int __DEFAULT_FN_ATTRS
   2865 _mm256_testc_ps(__m256 __a, __m256 __b)
   2866 {
   2867   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
   2868 }
   2869 
   2870 /// Given two 256-bit floating-point vectors of [8 x float], perform an
   2871 ///    element-by-element comparison of the single-precision elements in the
   2872 ///    first source vector and the corresponding elements in the second source
   2873 ///    vector.
   2874 ///
   2875 ///    The EFLAGS register is updated as follows: \n
   2876 ///    If there is at least one pair of single-precision elements where the
   2877 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
   2878 ///    ZF flag is set to 1. \n
   2879 ///    If there is at least one pair of single-precision elements where the
   2880 ///    sign-bit of the first element is 0 and the sign-bit of the second element
   2881 ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
   2882 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2883 ///    otherwise it returns 0.
   2884 ///
   2885 /// \headerfile <x86intrin.h>
   2886 ///
   2887 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
   2888 ///
   2889 /// \param __a
   2890 ///    A 256-bit vector of [8 x float].
   2891 /// \param __b
   2892 ///    A 256-bit vector of [8 x float].
   2893 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2894 static __inline int __DEFAULT_FN_ATTRS
   2895 _mm256_testnzc_ps(__m256 __a, __m256 __b)
   2896 {
   2897   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
   2898 }
   2899 
   2900 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2901 ///    of the two source vectors.
   2902 ///
   2903 ///    The EFLAGS register is updated as follows: \n
   2904 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2905 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2906 ///    If there is at least one pair of bits where the bit from the first source
   2907 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2908 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2909 ///    This intrinsic returns the value of the ZF flag.
   2910 ///
   2911 /// \headerfile <x86intrin.h>
   2912 ///
   2913 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2914 ///
   2915 /// \param __a
   2916 ///    A 256-bit integer vector.
   2917 /// \param __b
   2918 ///    A 256-bit integer vector.
   2919 /// \returns the ZF flag.
   2920 static __inline int __DEFAULT_FN_ATTRS
   2921 _mm256_testz_si256(__m256i __a, __m256i __b)
   2922 {
   2923   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
   2924 }
   2925 
   2926 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2927 ///    of the two source vectors.
   2928 ///
   2929 ///    The EFLAGS register is updated as follows: \n
   2930 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2931 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2932 ///    If there is at least one pair of bits where the bit from the first source
   2933 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2934 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2935 ///    This intrinsic returns the value of the CF flag.
   2936 ///
   2937 /// \headerfile <x86intrin.h>
   2938 ///
   2939 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2940 ///
   2941 /// \param __a
   2942 ///    A 256-bit integer vector.
   2943 /// \param __b
   2944 ///    A 256-bit integer vector.
   2945 /// \returns the CF flag.
   2946 static __inline int __DEFAULT_FN_ATTRS
   2947 _mm256_testc_si256(__m256i __a, __m256i __b)
   2948 {
   2949   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
   2950 }
   2951 
   2952 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
   2953 ///    of the two source vectors.
   2954 ///
   2955 ///    The EFLAGS register is updated as follows: \n
   2956 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
   2957 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
   2958 ///    If there is at least one pair of bits where the bit from the first source
   2959 ///    vector is 0 and the bit from the second source vector is 1, the CF flag
   2960 ///    is set to 0. Otherwise the CF flag is set to 1. \n
   2961 ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
   2962 ///    otherwise it returns 0.
   2963 ///
   2964 /// \headerfile <x86intrin.h>
   2965 ///
   2966 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
   2967 ///
   2968 /// \param __a
   2969 ///    A 256-bit integer vector.
   2970 /// \param __b
   2971 ///    A 256-bit integer vector.
   2972 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
   2973 static __inline int __DEFAULT_FN_ATTRS
   2974 _mm256_testnzc_si256(__m256i __a, __m256i __b)
   2975 {
   2976   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
   2977 }
   2978 
   2979 /* Vector extract sign mask */
   2980 /// Extracts the sign bits of double-precision floating point elements
   2981 ///    in a 256-bit vector of [4 x double] and writes them to the lower order
   2982 ///    bits of the return value.
   2983 ///
   2984 /// \headerfile <x86intrin.h>
   2985 ///
   2986 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
   2987 ///
   2988 /// \param __a
   2989 ///    A 256-bit vector of [4 x double] containing the double-precision
   2990 ///    floating point values with sign bits to be extracted.
   2991 /// \returns The sign bits from the operand, written to bits [3:0].
   2992 static __inline int __DEFAULT_FN_ATTRS
   2993 _mm256_movemask_pd(__m256d __a)
   2994 {
   2995   return __builtin_ia32_movmskpd256((__v4df)__a);
   2996 }
   2997 
   2998 /// Extracts the sign bits of single-precision floating point elements
   2999 ///    in a 256-bit vector of [8 x float] and writes them to the lower order
   3000 ///    bits of the return value.
   3001 ///
   3002 /// \headerfile <x86intrin.h>
   3003 ///
   3004 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
   3005 ///
   3006 /// \param __a
   3007 ///    A 256-bit vector of [8 x float] containing the single-precision floating
   3008 ///    point values with sign bits to be extracted.
   3009 /// \returns The sign bits from the operand, written to bits [7:0].
   3010 static __inline int __DEFAULT_FN_ATTRS
   3011 _mm256_movemask_ps(__m256 __a)
   3012 {
   3013   return __builtin_ia32_movmskps256((__v8sf)__a);
   3014 }
   3015 
/* Vector zero */
   3017 /// Zeroes the contents of all XMM or YMM registers.
   3018 ///
   3019 /// \headerfile <x86intrin.h>
   3020 ///
   3021 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
   3022 static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
   3023 _mm256_zeroall(void)
   3024 {
   3025   __builtin_ia32_vzeroall();
   3026 }
   3027 
   3028 /// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
   3029 ///
   3030 /// \headerfile <x86intrin.h>
   3031 ///
   3032 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
   3033 static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
   3034 _mm256_zeroupper(void)
   3035 {
   3036   __builtin_ia32_vzeroupper();
   3037 }
   3038 
   3039 /* Vector load with broadcast */
   3040 /// Loads a scalar single-precision floating point value from the
   3041 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3042 ///    of a [4 x float] vector.
   3043 ///
   3044 /// \headerfile <x86intrin.h>
   3045 ///
   3046 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
   3047 ///
   3048 /// \param __a
///    A pointer to the single-precision floating point value to be broadcast.
   3050 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
   3051 ///    equal to the broadcast value.
   3052 static __inline __m128 __DEFAULT_FN_ATTRS128
   3053 _mm_broadcast_ss(float const *__a)
   3054 {
   3055   struct __mm_broadcast_ss_struct {
   3056     float __f;
   3057   } __attribute__((__packed__, __may_alias__));
   3058   float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
   3059   return __extension__ (__m128){ __f, __f, __f, __f };
   3060 }
   3061 
   3062 /// Loads a scalar double-precision floating point value from the
   3063 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3064 ///    of a [4 x double] vector.
   3065 ///
   3066 /// \headerfile <x86intrin.h>
   3067 ///
   3068 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
   3069 ///
   3070 /// \param __a
///    A pointer to the double-precision floating point value to be broadcast.
   3072 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
   3073 ///    equal to the broadcast value.
   3074 static __inline __m256d __DEFAULT_FN_ATTRS
   3075 _mm256_broadcast_sd(double const *__a)
   3076 {
   3077   struct __mm256_broadcast_sd_struct {
   3078     double __d;
   3079   } __attribute__((__packed__, __may_alias__));
   3080   double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
   3081   return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
   3082 }
   3083 
   3084 /// Loads a scalar single-precision floating point value from the
   3085 ///    specified address pointed to by \a __a and broadcasts it to the elements
   3086 ///    of a [8 x float] vector.
   3087 ///
   3088 /// \headerfile <x86intrin.h>
   3089 ///
   3090 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
   3091 ///
   3092 /// \param __a
///    A pointer to the single-precision floating point value to be broadcast.
   3094 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
   3095 ///    equal to the broadcast value.
   3096 static __inline __m256 __DEFAULT_FN_ATTRS
   3097 _mm256_broadcast_ss(float const *__a)
   3098 {
   3099   struct __mm256_broadcast_ss_struct {
   3100     float __f;
   3101   } __attribute__((__packed__, __may_alias__));
   3102   float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
   3103   return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
   3104 }
   3105 
   3106 /// Loads the data from a 128-bit vector of [2 x double] from the
   3107 ///    specified address pointed to by \a __a and broadcasts it to 128-bit
   3108 ///    elements in a 256-bit vector of [4 x double].
   3109 ///
   3110 /// \headerfile <x86intrin.h>
   3111 ///
   3112 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
   3113 ///
   3114 /// \param __a
///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
   3116 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
   3117 ///    equal to the broadcast value.
   3118 static __inline __m256d __DEFAULT_FN_ATTRS
   3119 _mm256_broadcast_pd(__m128d const *__a)
   3120 {
   3121   __m128d __b = _mm_loadu_pd((const double *)__a);
   3122   return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
   3123                                           0, 1, 0, 1);
   3124 }
   3125 
   3126 /// Loads the data from a 128-bit vector of [4 x float] from the
   3127 ///    specified address pointed to by \a __a and broadcasts it to 128-bit
   3128 ///    elements in a 256-bit vector of [8 x float].
   3129 ///
   3130 /// \headerfile <x86intrin.h>
   3131 ///
   3132 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
   3133 ///
   3134 /// \param __a
///    A pointer to the 128-bit vector of [4 x float] to be broadcast.
   3136 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
   3137 ///    equal to the broadcast value.
   3138 static __inline __m256 __DEFAULT_FN_ATTRS
   3139 _mm256_broadcast_ps(__m128 const *__a)
   3140 {
   3141   __m128 __b = _mm_loadu_ps((const float *)__a);
   3142   return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
   3143                                          0, 1, 2, 3, 0, 1, 2, 3);
   3144 }
   3145 
   3146 /* SIMD load ops */
   3147 /// Loads 4 double-precision floating point values from a 32-byte aligned
   3148 ///    memory location pointed to by \a __p into a vector of [4 x double].
   3149 ///
   3150 /// \headerfile <x86intrin.h>
   3151 ///
   3152 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
   3153 ///
   3154 /// \param __p
   3155 ///    A 32-byte aligned pointer to a memory location containing
   3156 ///    double-precision floating point values.
   3157 /// \returns A 256-bit vector of [4 x double] containing the moved values.
   3158 static __inline __m256d __DEFAULT_FN_ATTRS
   3159 _mm256_load_pd(double const *__p)
   3160 {
   3161   return *(const __m256d *)__p;
   3162 }
   3163 
   3164 /// Loads 8 single-precision floating point values from a 32-byte aligned
   3165 ///    memory location pointed to by \a __p into a vector of [8 x float].
   3166 ///
   3167 /// \headerfile <x86intrin.h>
   3168 ///
   3169 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
   3170 ///
   3171 /// \param __p
   3172 ///    A 32-byte aligned pointer to a memory location containing float values.
   3173 /// \returns A 256-bit vector of [8 x float] containing the moved values.
   3174 static __inline __m256 __DEFAULT_FN_ATTRS
   3175 _mm256_load_ps(float const *__p)
   3176 {
   3177   return *(const __m256 *)__p;
   3178 }
   3179 
   3180 /// Loads 4 double-precision floating point values from an unaligned
   3181 ///    memory location pointed to by \a __p into a vector of [4 x double].
   3182 ///
   3183 /// \headerfile <x86intrin.h>
   3184 ///
   3185 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
   3186 ///
   3187 /// \param __p
   3188 ///    A pointer to a memory location containing double-precision floating
   3189 ///    point values.
   3190 /// \returns A 256-bit vector of [4 x double] containing the moved values.
   3191 static __inline __m256d __DEFAULT_FN_ATTRS
   3192 _mm256_loadu_pd(double const *__p)
   3193 {
   3194   struct __loadu_pd {
   3195     __m256d_u __v;
   3196   } __attribute__((__packed__, __may_alias__));
   3197   return ((const struct __loadu_pd*)__p)->__v;
   3198 }
   3199 
   3200 /// Loads 8 single-precision floating point values from an unaligned
   3201 ///    memory location pointed to by \a __p into a vector of [8 x float].
   3202 ///
   3203 /// \headerfile <x86intrin.h>
   3204 ///
   3205 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
   3206 ///
   3207 /// \param __p
   3208 ///    A pointer to a memory location containing single-precision floating
   3209 ///    point values.
   3210 /// \returns A 256-bit vector of [8 x float] containing the moved values.
   3211 static __inline __m256 __DEFAULT_FN_ATTRS
   3212 _mm256_loadu_ps(float const *__p)
   3213 {
   3214   struct __loadu_ps {
   3215     __m256_u __v;
   3216   } __attribute__((__packed__, __may_alias__));
   3217   return ((const struct __loadu_ps*)__p)->__v;
   3218 }
   3219 
   3220 /// Loads 256 bits of integer data from a 32-byte aligned memory
   3221 ///    location pointed to by \a __p into elements of a 256-bit integer vector.
   3222 ///
   3223 /// \headerfile <x86intrin.h>
   3224 ///
   3225 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
   3226 ///
   3227 /// \param __p
   3228 ///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
   3229 ///    values.
   3230 /// \returns A 256-bit integer vector containing the moved values.
   3231 static __inline __m256i __DEFAULT_FN_ATTRS
   3232 _mm256_load_si256(__m256i const *__p)
   3233 {
   3234   return *__p;
   3235 }
   3236 
   3237 /// Loads 256 bits of integer data from an unaligned memory location
   3238 ///    pointed to by \a __p into a 256-bit integer vector.
   3239 ///
   3240 /// \headerfile <x86intrin.h>
   3241 ///
   3242 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
   3243 ///
   3244 /// \param __p
   3245 ///    A pointer to a 256-bit integer vector containing integer values.
   3246 /// \returns A 256-bit integer vector containing the moved values.
   3247 static __inline __m256i __DEFAULT_FN_ATTRS
   3248 _mm256_loadu_si256(__m256i_u const *__p)
   3249 {
   3250   struct __loadu_si256 {
   3251     __m256i_u __v;
   3252   } __attribute__((__packed__, __may_alias__));
   3253   return ((const struct __loadu_si256*)__p)->__v;
   3254 }
   3255 
   3256 /// Loads 256 bits of integer data from an unaligned memory location
   3257 ///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
   3258 ///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
   3259 ///    line boundary.
   3260 ///
   3261 /// \headerfile <x86intrin.h>
   3262 ///
   3263 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
   3264 ///
   3265 /// \param __p
   3266 ///    A pointer to a 256-bit integer vector containing integer values.
   3267 /// \returns A 256-bit integer vector containing the moved values.
   3268 static __inline __m256i __DEFAULT_FN_ATTRS
   3269 _mm256_lddqu_si256(__m256i_u const *__p)
   3270 {
   3271   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
   3272 }
   3273 
   3274 /* SIMD store ops */
   3275 /// Stores double-precision floating point values from a 256-bit vector
   3276 ///    of [4 x double] to a 32-byte aligned memory location pointed to by
   3277 ///    \a __p.
   3278 ///
   3279 /// \headerfile <x86intrin.h>
   3280 ///
   3281 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
   3282 ///
   3283 /// \param __p
   3284 ///    A 32-byte aligned pointer to a memory location that will receive the
///    double-precision floating point values.
   3286 /// \param __a
   3287 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3288 static __inline void __DEFAULT_FN_ATTRS
   3289 _mm256_store_pd(double *__p, __m256d __a)
   3290 {
   3291   *(__m256d *)__p = __a;
   3292 }
   3293 
   3294 /// Stores single-precision floating point values from a 256-bit vector
   3295 ///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
   3296 ///
   3297 /// \headerfile <x86intrin.h>
   3298 ///
   3299 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
   3300 ///
   3301 /// \param __p
   3302 ///    A 32-byte aligned pointer to a memory location that will receive the
   3303 ///    float values.
   3304 /// \param __a
   3305 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3306 static __inline void __DEFAULT_FN_ATTRS
   3307 _mm256_store_ps(float *__p, __m256 __a)
   3308 {
   3309   *(__m256 *)__p = __a;
   3310 }
   3311 
   3312 /// Stores double-precision floating point values from a 256-bit vector
   3313 ///    of [4 x double] to an unaligned memory location pointed to by \a __p.
   3314 ///
   3315 /// \headerfile <x86intrin.h>
   3316 ///
   3317 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
   3318 ///
   3319 /// \param __p
   3320 ///    A pointer to a memory location that will receive the double-precision
   3321 ///    floating point values.
   3322 /// \param __a
   3323 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3324 static __inline void __DEFAULT_FN_ATTRS
   3325 _mm256_storeu_pd(double *__p, __m256d __a)
   3326 {
   3327   struct __storeu_pd {
   3328     __m256d_u __v;
   3329   } __attribute__((__packed__, __may_alias__));
   3330   ((struct __storeu_pd*)__p)->__v = __a;
   3331 }
   3332 
   3333 /// Stores single-precision floating point values from a 256-bit vector
   3334 ///    of [8 x float] to an unaligned memory location pointed to by \a __p.
   3335 ///
   3336 /// \headerfile <x86intrin.h>
   3337 ///
   3338 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
   3339 ///
   3340 /// \param __p
   3341 ///    A pointer to a memory location that will receive the float values.
   3342 /// \param __a
   3343 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3344 static __inline void __DEFAULT_FN_ATTRS
   3345 _mm256_storeu_ps(float *__p, __m256 __a)
   3346 {
   3347   struct __storeu_ps {
   3348     __m256_u __v;
   3349   } __attribute__((__packed__, __may_alias__));
   3350   ((struct __storeu_ps*)__p)->__v = __a;
   3351 }
   3352 
   3353 /// Stores integer values from a 256-bit integer vector to a 32-byte
   3354 ///    aligned memory location pointed to by \a __p.
   3355 ///
   3356 /// \headerfile <x86intrin.h>
   3357 ///
   3358 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
   3359 ///
   3360 /// \param __p
   3361 ///    A 32-byte aligned pointer to a memory location that will receive the
   3362 ///    integer values.
   3363 /// \param __a
   3364 ///    A 256-bit integer vector containing the values to be moved.
   3365 static __inline void __DEFAULT_FN_ATTRS
   3366 _mm256_store_si256(__m256i *__p, __m256i __a)
   3367 {
   3368   *__p = __a;
   3369 }
   3370 
   3371 /// Stores integer values from a 256-bit integer vector to an unaligned
   3372 ///    memory location pointed to by \a __p.
   3373 ///
   3374 /// \headerfile <x86intrin.h>
   3375 ///
   3376 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
   3377 ///
   3378 /// \param __p
   3379 ///    A pointer to a memory location that will receive the integer values.
   3380 /// \param __a
   3381 ///    A 256-bit integer vector containing the values to be moved.
   3382 static __inline void __DEFAULT_FN_ATTRS
   3383 _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
   3384 {
   3385   struct __storeu_si256 {
   3386     __m256i_u __v;
   3387   } __attribute__((__packed__, __may_alias__));
   3388   ((struct __storeu_si256*)__p)->__v = __a;
   3389 }
   3390 
   3391 /* Conditional load ops */
   3392 /// Conditionally loads double-precision floating point elements from a
   3393 ///    memory location pointed to by \a __p into a 128-bit vector of
   3394 ///    [2 x double], depending on the mask bits associated with each data
   3395 ///    element.
   3396 ///
   3397 /// \headerfile <x86intrin.h>
   3398 ///
   3399 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3400 ///
   3401 /// \param __p
   3402 ///    A pointer to a memory location that contains the double-precision
   3403 ///    floating point values.
   3404 /// \param __m
   3405 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3406 ///    each data element represents the mask bits. If a mask bit is zero, the
   3407 ///    corresponding value in the memory location is not loaded and the
   3408 ///    corresponding field in the return value is set to zero.
   3409 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
   3410 static __inline __m128d __DEFAULT_FN_ATTRS128
   3411 _mm_maskload_pd(double const *__p, __m128i __m)
   3412 {
   3413   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
   3414 }
   3415 
   3416 /// Conditionally loads double-precision floating point elements from a
   3417 ///    memory location pointed to by \a __p into a 256-bit vector of
   3418 ///    [4 x double], depending on the mask bits associated with each data
   3419 ///    element.
   3420 ///
   3421 /// \headerfile <x86intrin.h>
   3422 ///
   3423 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3424 ///
   3425 /// \param __p
   3426 ///    A pointer to a memory location that contains the double-precision
   3427 ///    floating point values.
   3428 /// \param __m
   3429 ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
   3430 ///    significant bit of each quadword element represents the mask bits. If a
   3431 ///    mask bit is zero, the corresponding value in the memory location is not
   3432 ///    loaded and the corresponding field in the return value is set to zero.
   3433 /// \returns A 256-bit vector of [4 x double] containing the loaded values.
   3434 static __inline __m256d __DEFAULT_FN_ATTRS
   3435 _mm256_maskload_pd(double const *__p, __m256i __m)
   3436 {
   3437   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
   3438                                                (__v4di)__m);
   3439 }
   3440 
   3441 /// Conditionally loads single-precision floating point elements from a
   3442 ///    memory location pointed to by \a __p into a 128-bit vector of
   3443 ///    [4 x float], depending on the mask bits associated with each data
   3444 ///    element.
   3445 ///
   3446 /// \headerfile <x86intrin.h>
   3447 ///
   3448 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3449 ///
   3450 /// \param __p
   3451 ///    A pointer to a memory location that contains the single-precision
   3452 ///    floating point values.
   3453 /// \param __m
   3454 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3455 ///    each data element represents the mask bits. If a mask bit is zero, the
   3456 ///    corresponding value in the memory location is not loaded and the
   3457 ///    corresponding field in the return value is set to zero.
   3458 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
   3459 static __inline __m128 __DEFAULT_FN_ATTRS128
   3460 _mm_maskload_ps(float const *__p, __m128i __m)
   3461 {
   3462   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
   3463 }
   3464 
   3465 /// Conditionally loads single-precision floating point elements from a
   3466 ///    memory location pointed to by \a __p into a 256-bit vector of
   3467 ///    [8 x float], depending on the mask bits associated with each data
   3468 ///    element.
   3469 ///
   3470 /// \headerfile <x86intrin.h>
   3471 ///
   3472 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3473 ///
   3474 /// \param __p
   3475 ///    A pointer to a memory location that contains the single-precision
   3476 ///    floating point values.
   3477 /// \param __m
   3478 ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
   3479 ///    significant bit of each dword element represents the mask bits. If a mask
   3480 ///    bit is zero, the corresponding value in the memory location is not loaded
   3481 ///    and the corresponding field in the return value is set to zero.
   3482 /// \returns A 256-bit vector of [8 x float] containing the loaded values.
   3483 static __inline __m256 __DEFAULT_FN_ATTRS
   3484 _mm256_maskload_ps(float const *__p, __m256i __m)
   3485 {
   3486   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
   3487 }
   3488 
   3489 /* Conditional store ops */
   3490 /// Moves single-precision floating point values from a 256-bit vector
   3491 ///    of [8 x float] to a memory location pointed to by \a __p, according to
   3492 ///    the specified mask.
   3493 ///
   3494 /// \headerfile <x86intrin.h>
   3495 ///
   3496 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3497 ///
   3498 /// \param __p
   3499 ///    A pointer to a memory location that will receive the float values.
   3500 /// \param __m
   3501 ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
   3502 ///    significant bit of each dword element in the mask vector represents the
   3503 ///    mask bits. If a mask bit is zero, the corresponding value from vector
   3504 ///    \a __a is not stored and the corresponding field in the memory location
   3505 ///    pointed to by \a __p is not changed.
   3506 /// \param __a
   3507 ///    A 256-bit vector of [8 x float] containing the values to be stored.
   3508 static __inline void __DEFAULT_FN_ATTRS
   3509 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
   3510 {
   3511   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
   3512 }
   3513 
   3514 /// Moves double-precision values from a 128-bit vector of [2 x double]
   3515 ///    to a memory location pointed to by \a __p, according to the specified
   3516 ///    mask.
   3517 ///
   3518 /// \headerfile <x86intrin.h>
   3519 ///
   3520 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3521 ///
   3522 /// \param __p
   3523 ///    A pointer to a memory location that will receive the double values.
   3524 /// \param __m
   3525 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3526 ///    each field in the mask vector represents the mask bits. If a mask bit is
   3527 ///    zero, the corresponding value from vector \a __a is not stored and the
   3528 ///    corresponding field in the memory location pointed to by \a __p is not
   3529 ///    changed.
   3530 /// \param __a
   3531 ///    A 128-bit vector of [2 x double] containing the values to be stored.
   3532 static __inline void __DEFAULT_FN_ATTRS128
   3533 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
   3534 {
   3535   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
   3536 }
   3537 
   3538 /// Moves double-precision values from a 256-bit vector of [4 x double]
   3539 ///    to a memory location pointed to by \a __p, according to the specified
   3540 ///    mask.
   3541 ///
   3542 /// \headerfile <x86intrin.h>
   3543 ///
   3544 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
   3545 ///
   3546 /// \param __p
   3547 ///    A pointer to a memory location that will receive the double values.
   3548 /// \param __m
   3549 ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
   3550 ///    significant bit of each quadword element in the mask vector represents
   3551 ///    the mask bits. If a mask bit is zero, the corresponding value from vector
   3552 ///    \a __a is not stored and the corresponding field in the memory location
   3553 ///    pointed to by \a __p is not changed.
   3554 /// \param __a
   3555 ///    A 256-bit vector of [4 x double] containing the values to be stored.
   3556 static __inline void __DEFAULT_FN_ATTRS
   3557 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
   3558 {
   3559   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
   3560 }
   3561 
   3562 /// Moves single-precision floating point values from a 128-bit vector
   3563 ///    of [4 x float] to a memory location pointed to by \a __p, according to
   3564 ///    the specified mask.
   3565 ///
   3566 /// \headerfile <x86intrin.h>
   3567 ///
   3568 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
   3569 ///
   3570 /// \param __p
   3571 ///    A pointer to a memory location that will receive the float values.
   3572 /// \param __m
   3573 ///    A 128-bit integer vector containing the mask. The most significant bit of
   3574 ///    each field in the mask vector represents the mask bits. If a mask bit is
   3575 ///    zero, the corresponding value from vector \a __a is not stored and the
   3576 ///    corresponding field in the memory location pointed to by \a __p is not
   3577 ///    changed.
   3578 /// \param __a
   3579 ///    A 128-bit vector of [4 x float] containing the values to be stored.
   3580 static __inline void __DEFAULT_FN_ATTRS128
   3581 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
   3582 {
   3583   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
   3584 }
   3585 
   3586 /* Cacheability support ops */
   3587 /// Moves integer data from a 256-bit integer vector to a 32-byte
   3588 ///    aligned memory location. To minimize caching, the data is flagged as
   3589 ///    non-temporal (unlikely to be used again soon).
   3590 ///
   3591 /// \headerfile <x86intrin.h>
   3592 ///
   3593 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
   3594 ///
   3595 /// \param __a
   3596 ///    A pointer to a 32-byte aligned memory location that will receive the
   3597 ///    integer values.
   3598 /// \param __b
   3599 ///    A 256-bit integer vector containing the values to be moved.
   3600 static __inline void __DEFAULT_FN_ATTRS
   3601 _mm256_stream_si256(void *__a, __m256i __b)
   3602 {
   3603   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   3604   __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
   3605 }
   3606 
   3607 /// Moves double-precision values from a 256-bit vector of [4 x double]
   3608 ///    to a 32-byte aligned memory location. To minimize caching, the data is
   3609 ///    flagged as non-temporal (unlikely to be used again soon).
   3610 ///
   3611 /// \headerfile <x86intrin.h>
   3612 ///
   3613 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
   3614 ///
   3615 /// \param __a
   3616 ///    A pointer to a 32-byte aligned memory location that will receive the
   3617 ///    double-precision floating-point values.
   3618 /// \param __b
   3619 ///    A 256-bit vector of [4 x double] containing the values to be moved.
   3620 static __inline void __DEFAULT_FN_ATTRS
   3621 _mm256_stream_pd(void *__a, __m256d __b)
   3622 {
   3623   typedef __v4df __v4df_aligned __attribute__((aligned(32)));
   3624   __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
   3625 }
   3626 
   3627 /// Moves single-precision floating point values from a 256-bit vector
   3628 ///    of [8 x float] to a 32-byte aligned memory location. To minimize
   3629 ///    caching, the data is flagged as non-temporal (unlikely to be used again
   3630 ///    soon).
   3631 ///
   3632 /// \headerfile <x86intrin.h>
   3633 ///
   3634 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
   3635 ///
   3636 /// \param __p
   3637 ///    A pointer to a 32-byte aligned memory location that will receive the
   3638 ///    single-precision floating point values.
   3639 /// \param __a
   3640 ///    A 256-bit vector of [8 x float] containing the values to be moved.
   3641 static __inline void __DEFAULT_FN_ATTRS
   3642 _mm256_stream_ps(void *__p, __m256 __a)
   3643 {
   3644   typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
   3645   __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
   3646 }
   3647 
   3648 /* Create vectors */
   3649 /// Create a 256-bit vector of [4 x double] with undefined values.
   3650 ///
   3651 /// \headerfile <x86intrin.h>
   3652 ///
   3653 /// This intrinsic has no corresponding instruction.
   3654 ///
   3655 /// \returns A 256-bit vector of [4 x double] containing undefined values.
   3656 static __inline__ __m256d __DEFAULT_FN_ATTRS
   3657 _mm256_undefined_pd(void)
   3658 {
   3659   return (__m256d)__builtin_ia32_undef256();
   3660 }
   3661 
   3662 /// Create a 256-bit vector of [8 x float] with undefined values.
   3663 ///
   3664 /// \headerfile <x86intrin.h>
   3665 ///
   3666 /// This intrinsic has no corresponding instruction.
   3667 ///
   3668 /// \returns A 256-bit vector of [8 x float] containing undefined values.
   3669 static __inline__ __m256 __DEFAULT_FN_ATTRS
   3670 _mm256_undefined_ps(void)
   3671 {
   3672   return (__m256)__builtin_ia32_undef256();
   3673 }
   3674 
   3675 /// Create a 256-bit integer vector with undefined values.
   3676 ///
   3677 /// \headerfile <x86intrin.h>
   3678 ///
   3679 /// This intrinsic has no corresponding instruction.
   3680 ///
   3681 /// \returns A 256-bit integer vector containing undefined values.
   3682 static __inline__ __m256i __DEFAULT_FN_ATTRS
   3683 _mm256_undefined_si256(void)
   3684 {
   3685   return (__m256i)__builtin_ia32_undef256();
   3686 }
   3687 
   3688 /// Constructs a 256-bit floating-point vector of [4 x double]
   3689 ///    initialized with the specified double-precision floating-point values.
   3690 ///
   3691 /// \headerfile <x86intrin.h>
   3692 ///
   3693 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
   3694 ///   instruction.
   3695 ///
   3696 /// \param __a
   3697 ///    A double-precision floating-point value used to initialize bits [255:192]
   3698 ///    of the result.
   3699 /// \param __b
   3700 ///    A double-precision floating-point value used to initialize bits [191:128]
   3701 ///    of the result.
   3702 /// \param __c
   3703 ///    A double-precision floating-point value used to initialize bits [127:64]
   3704 ///    of the result.
   3705 /// \param __d
   3706 ///    A double-precision floating-point value used to initialize bits [63:0]
   3707 ///    of the result.
   3708 /// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  /* The initializer lists elements starting at element 0, so the arguments
   * appear in reverse: __d is element 0 and __a is element 3. */
  return __extension__ (__m256d){ __d, __c, __b, __a };
}
   3714 
   3715 /// Constructs a 256-bit floating-point vector of [8 x float] initialized
   3716 ///    with the specified single-precision floating-point values.
   3717 ///
   3718 /// \headerfile <x86intrin.h>
   3719 ///
   3720 /// This intrinsic is a utility function and does not correspond to a specific
   3721 ///   instruction.
   3722 ///
   3723 /// \param __a
   3724 ///    A single-precision floating-point value used to initialize bits [255:224]
   3725 ///    of the result.
   3726 /// \param __b
   3727 ///    A single-precision floating-point value used to initialize bits [223:192]
   3728 ///    of the result.
   3729 /// \param __c
   3730 ///    A single-precision floating-point value used to initialize bits [191:160]
   3731 ///    of the result.
   3732 /// \param __d
   3733 ///    A single-precision floating-point value used to initialize bits [159:128]
   3734 ///    of the result.
   3735 /// \param __e
   3736 ///    A single-precision floating-point value used to initialize bits [127:96]
   3737 ///    of the result.
   3738 /// \param __f
   3739 ///    A single-precision floating-point value used to initialize bits [95:64]
   3740 ///    of the result.
   3741 /// \param __g
   3742 ///    A single-precision floating-point value used to initialize bits [63:32]
   3743 ///    of the result.
   3744 /// \param __h
   3745 ///    A single-precision floating-point value used to initialize bits [31:0]
   3746 ///    of the result.
   3747 /// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_ps(float __a, float __b, float __c, float __d,
              float __e, float __f, float __g, float __h)
{
  /* The initializer lists elements starting at element 0, so the arguments
   * appear in reverse: __h is element 0 and __a is element 7. */
  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}
   3754 
   3755 /// Constructs a 256-bit integer vector initialized with the specified
   3756 ///    32-bit integral values.
   3757 ///
   3758 /// \headerfile <x86intrin.h>
   3759 ///
   3760 /// This intrinsic is a utility function and does not correspond to a specific
   3761 ///   instruction.
   3762 ///
   3763 /// \param __i0
   3764 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
   3765 /// \param __i1
   3766 ///    A 32-bit integral value used to initialize bits [223:192] of the result.
   3767 /// \param __i2
   3768 ///    A 32-bit integral value used to initialize bits [191:160] of the result.
   3769 /// \param __i3
   3770 ///    A 32-bit integral value used to initialize bits [159:128] of the result.
   3771 /// \param __i4
   3772 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   3773 /// \param __i5
   3774 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   3775 /// \param __i6
   3776 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   3777 /// \param __i7
   3778 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   3779 /// \returns An initialized 256-bit integer vector.
   3780 static __inline __m256i __DEFAULT_FN_ATTRS
   3781 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
   3782                  int __i4, int __i5, int __i6, int __i7)
   3783 {
   3784   return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
   3785 }
   3786 
   3787 /// Constructs a 256-bit integer vector initialized with the specified
   3788 ///    16-bit integral values.
   3789 ///
   3790 /// \headerfile <x86intrin.h>
   3791 ///
   3792 /// This intrinsic is a utility function and does not correspond to a specific
   3793 ///   instruction.
   3794 ///
   3795 /// \param __w15
   3796 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
   3797 /// \param __w14
   3798 ///    A 16-bit integral value used to initialize bits [239:224] of the result.
   3799 /// \param __w13
   3800 ///    A 16-bit integral value used to initialize bits [223:208] of the result.
   3801 /// \param __w12
   3802 ///    A 16-bit integral value used to initialize bits [207:192] of the result.
   3803 /// \param __w11
   3804 ///    A 16-bit integral value used to initialize bits [191:176] of the result.
   3805 /// \param __w10
   3806 ///    A 16-bit integral value used to initialize bits [175:160] of the result.
   3807 /// \param __w09
   3808 ///    A 16-bit integral value used to initialize bits [159:144] of the result.
   3809 /// \param __w08
   3810 ///    A 16-bit integral value used to initialize bits [143:128] of the result.
   3811 /// \param __w07
   3812 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   3813 /// \param __w06
   3814 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   3815 /// \param __w05
   3816 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   3817 /// \param __w04
   3818 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   3819 /// \param __w03
   3820 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   3821 /// \param __w02
   3822 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   3823 /// \param __w01
   3824 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   3825 /// \param __w00
   3826 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   3827 /// \returns An initialized 256-bit integer vector.
   3828 static __inline __m256i __DEFAULT_FN_ATTRS
   3829 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
   3830                  short __w11, short __w10, short __w09, short __w08,
   3831                  short __w07, short __w06, short __w05, short __w04,
   3832                  short __w03, short __w02, short __w01, short __w00)
   3833 {
   3834   return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
   3835     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
   3836 }
   3837 
   3838 /// Constructs a 256-bit integer vector initialized with the specified
   3839 ///    8-bit integral values.
   3840 ///
   3841 /// \headerfile <x86intrin.h>
   3842 ///
   3843 /// This intrinsic is a utility function and does not correspond to a specific
   3844 ///   instruction.
   3845 ///
   3846 /// \param __b31
   3847 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
   3848 /// \param __b30
   3849 ///    An 8-bit integral value used to initialize bits [247:240] of the result.
   3850 /// \param __b29
   3851 ///    An 8-bit integral value used to initialize bits [239:232] of the result.
   3852 /// \param __b28
   3853 ///    An 8-bit integral value used to initialize bits [231:224] of the result.
   3854 /// \param __b27
   3855 ///    An 8-bit integral value used to initialize bits [223:216] of the result.
   3856 /// \param __b26
   3857 ///    An 8-bit integral value used to initialize bits [215:208] of the result.
   3858 /// \param __b25
   3859 ///    An 8-bit integral value used to initialize bits [207:200] of the result.
   3860 /// \param __b24
   3861 ///    An 8-bit integral value used to initialize bits [199:192] of the result.
   3862 /// \param __b23
   3863 ///    An 8-bit integral value used to initialize bits [191:184] of the result.
   3864 /// \param __b22
   3865 ///    An 8-bit integral value used to initialize bits [183:176] of the result.
   3866 /// \param __b21
   3867 ///    An 8-bit integral value used to initialize bits [175:168] of the result.
   3868 /// \param __b20
   3869 ///    An 8-bit integral value used to initialize bits [167:160] of the result.
   3870 /// \param __b19
   3871 ///    An 8-bit integral value used to initialize bits [159:152] of the result.
   3872 /// \param __b18
   3873 ///    An 8-bit integral value used to initialize bits [151:144] of the result.
   3874 /// \param __b17
   3875 ///    An 8-bit integral value used to initialize bits [143:136] of the result.
   3876 /// \param __b16
   3877 ///    An 8-bit integral value used to initialize bits [135:128] of the result.
   3878 /// \param __b15
   3879 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   3880 /// \param __b14
   3881 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   3882 /// \param __b13
   3883 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   3884 /// \param __b12
   3885 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   3886 /// \param __b11
   3887 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   3888 /// \param __b10
   3889 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   3890 /// \param __b09
   3891 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   3892 /// \param __b08
   3893 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   3894 /// \param __b07
   3895 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   3896 /// \param __b06
   3897 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   3898 /// \param __b05
   3899 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   3900 /// \param __b04
   3901 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   3902 /// \param __b03
   3903 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   3904 /// \param __b02
   3905 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   3906 /// \param __b01
   3907 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   3908 /// \param __b00
   3909 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   3910 /// \returns An initialized 256-bit integer vector.
   3911 static __inline __m256i __DEFAULT_FN_ATTRS
   3912 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
   3913                 char __b27, char __b26, char __b25, char __b24,
   3914                 char __b23, char __b22, char __b21, char __b20,
   3915                 char __b19, char __b18, char __b17, char __b16,
   3916                 char __b15, char __b14, char __b13, char __b12,
   3917                 char __b11, char __b10, char __b09, char __b08,
   3918                 char __b07, char __b06, char __b05, char __b04,
   3919                 char __b03, char __b02, char __b01, char __b00)
   3920 {
   3921   return __extension__ (__m256i)(__v32qi){
   3922     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
   3923     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
   3924     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
   3925     __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
   3926   };
   3927 }
   3928 
   3929 /// Constructs a 256-bit integer vector initialized with the specified
   3930 ///    64-bit integral values.
   3931 ///
   3932 /// \headerfile <x86intrin.h>
   3933 ///
   3934 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
   3935 ///   instruction.
   3936 ///
   3937 /// \param __a
   3938 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
   3939 /// \param __b
   3940 ///    A 64-bit integral value used to initialize bits [191:128] of the result.
   3941 /// \param __c
   3942 ///    A 64-bit integral value used to initialize bits [127:64] of the result.
   3943 /// \param __d
   3944 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
   3945 /// \returns An initialized 256-bit integer vector.
   3946 static __inline __m256i __DEFAULT_FN_ATTRS
   3947 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
   3948 {
   3949   return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
   3950 }
   3951 
   3952 /* Create vectors with elements in reverse order */
   3953 /// Constructs a 256-bit floating-point vector of [4 x double],
   3954 ///    initialized in reverse order with the specified double-precision
   3955 ///    floating-point values.
   3956 ///
   3957 /// \headerfile <x86intrin.h>
   3958 ///
   3959 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
   3960 ///   instruction.
   3961 ///
   3962 /// \param __a
   3963 ///    A double-precision floating-point value used to initialize bits [63:0]
   3964 ///    of the result.
   3965 /// \param __b
   3966 ///    A double-precision floating-point value used to initialize bits [127:64]
   3967 ///    of the result.
   3968 /// \param __c
   3969 ///    A double-precision floating-point value used to initialize bits [191:128]
   3970 ///    of the result.
   3971 /// \param __d
   3972 ///    A double-precision floating-point value used to initialize bits [255:192]
   3973 ///    of the result.
   3974 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   3975 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
   3976 _mm256_setr_pd(double __a, double __b, double __c, double __d)
   3977 {
   3978   return _mm256_set_pd(__d, __c, __b, __a);
   3979 }
   3980 
   3981 /// Constructs a 256-bit floating-point vector of [8 x float],
   3982 ///    initialized in reverse order with the specified single-precision
   3983 ///    floating-point values.
   3984 ///
   3985 /// \headerfile <x86intrin.h>
   3986 ///
   3987 /// This intrinsic is a utility function and does not correspond to a specific
   3988 ///   instruction.
   3989 ///
   3990 /// \param __a
   3991 ///    A single-precision floating-point value used to initialize bits [31:0]
   3992 ///    of the result.
   3993 /// \param __b
   3994 ///    A single-precision floating-point value used to initialize bits [63:32]
   3995 ///    of the result.
   3996 /// \param __c
   3997 ///    A single-precision floating-point value used to initialize bits [95:64]
   3998 ///    of the result.
   3999 /// \param __d
   4000 ///    A single-precision floating-point value used to initialize bits [127:96]
   4001 ///    of the result.
   4002 /// \param __e
   4003 ///    A single-precision floating-point value used to initialize bits [159:128]
   4004 ///    of the result.
   4005 /// \param __f
   4006 ///    A single-precision floating-point value used to initialize bits [191:160]
   4007 ///    of the result.
   4008 /// \param __g
   4009 ///    A single-precision floating-point value used to initialize bits [223:192]
   4010 ///    of the result.
   4011 /// \param __h
   4012 ///    A single-precision floating-point value used to initialize bits [255:224]
   4013 ///    of the result.
   4014 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   4015 static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
   4016 _mm256_setr_ps(float __a, float __b, float __c, float __d,
   4017                float __e, float __f, float __g, float __h)
   4018 {
   4019   return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
   4020 }
   4021 
   4022 /// Constructs a 256-bit integer vector, initialized in reverse order
   4023 ///    with the specified 32-bit integral values.
   4024 ///
   4025 /// \headerfile <x86intrin.h>
   4026 ///
   4027 /// This intrinsic is a utility function and does not correspond to a specific
   4028 ///   instruction.
   4029 ///
   4030 /// \param __i0
   4031 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
   4032 /// \param __i1
   4033 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
   4034 /// \param __i2
   4035 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
   4036 /// \param __i3
   4037 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
   4038 /// \param __i4
   4039 ///    A 32-bit integral value used to initialize bits [159:128] of the result.
   4040 /// \param __i5
   4041 ///    A 32-bit integral value used to initialize bits [191:160] of the result.
   4042 /// \param __i6
   4043 ///    A 32-bit integral value used to initialize bits [223:192] of the result.
   4044 /// \param __i7
   4045 ///    A 32-bit integral value used to initialize bits [255:224] of the result.
   4046 /// \returns An initialized 256-bit integer vector.
   4047 static __inline __m256i __DEFAULT_FN_ATTRS
   4048 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
   4049                   int __i4, int __i5, int __i6, int __i7) {
   4050   return __extension__ (__m256i)(__v8si){ /* lowest lane first */
   4051       __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
   4052 }
   4053 
   4054 /// Constructs a 256-bit integer vector, initialized in reverse order
   4055 ///    with the specified 16-bit integral values.
   4056 ///
   4057 /// \headerfile <x86intrin.h>
   4058 ///
   4059 /// This intrinsic is a utility function and does not correspond to a specific
   4060 ///   instruction.
   4061 ///
   4062 /// \param __w15
   4063 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
   4064 /// \param __w14
   4065 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
   4066 /// \param __w13
   4067 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
   4068 /// \param __w12
   4069 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
   4070 /// \param __w11
   4071 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
   4072 /// \param __w10
   4073 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
   4074 /// \param __w09
   4075 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
   4076 /// \param __w08
   4077 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
   4078 /// \param __w07
   4079 ///    A 16-bit integral value used to initialize bits [143:128] of the result.
   4080 /// \param __w06
   4081 ///    A 16-bit integral value used to initialize bits [159:144] of the result.
   4082 /// \param __w05
   4083 ///    A 16-bit integral value used to initialize bits [175:160] of the result.
   4084 /// \param __w04
   4085 ///    A 16-bit integral value used to initialize bits [191:176] of the result.
   4086 /// \param __w03
   4087 ///    A 16-bit integral value used to initialize bits [207:192] of the result.
   4088 /// \param __w02
   4089 ///    A 16-bit integral value used to initialize bits [223:208] of the result.
   4090 /// \param __w01
   4091 ///    A 16-bit integral value used to initialize bits [239:224] of the result.
   4092 /// \param __w00
   4093 ///    A 16-bit integral value used to initialize bits [255:240] of the result.
   4094 /// \returns An initialized 256-bit integer vector.
   4095 static __inline __m256i __DEFAULT_FN_ATTRS
   4096 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
   4097                   short __w11, short __w10, short __w09, short __w08,
   4098                   short __w07, short __w06, short __w05, short __w04,
   4099                   short __w03, short __w02, short __w01, short __w00) {
   4100   /* Parameter order is lane order: __w15 fills bits [15:0]. */
   4101   return __extension__ (__m256i)(__v16hi){
   4102       __w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08,
   4103       __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00
   4104   };
   4105 }
   4106 
   4107 /// Constructs a 256-bit integer vector, initialized in reverse order
   4108 ///    with the specified 8-bit integral values.
   4109 ///
   4110 /// \headerfile <x86intrin.h>
   4111 ///
   4112 /// This intrinsic is a utility function and does not correspond to a specific
   4113 ///   instruction.
   4114 ///
   4115 /// \param __b31
   4116 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
   4117 /// \param __b30
   4118 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
   4119 /// \param __b29
   4120 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
   4121 /// \param __b28
   4122 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
   4123 /// \param __b27
   4124 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
   4125 /// \param __b26
   4126 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
   4127 /// \param __b25
   4128 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
   4129 /// \param __b24
   4130 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
   4131 /// \param __b23
   4132 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
   4133 /// \param __b22
   4134 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
   4135 /// \param __b21
   4136 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
   4137 /// \param __b20
   4138 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
   4139 /// \param __b19
   4140 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
   4141 /// \param __b18
   4142 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
   4143 /// \param __b17
   4144 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
   4145 /// \param __b16
   4146 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
   4147 /// \param __b15
   4148 ///    An 8-bit integral value used to initialize bits [135:128] of the result.
   4149 /// \param __b14
   4150 ///    An 8-bit integral value used to initialize bits [143:136] of the result.
   4151 /// \param __b13
   4152 ///    An 8-bit integral value used to initialize bits [151:144] of the result.
   4153 /// \param __b12
   4154 ///    An 8-bit integral value used to initialize bits [159:152] of the result.
   4155 /// \param __b11
   4156 ///    An 8-bit integral value used to initialize bits [167:160] of the result.
   4157 /// \param __b10
   4158 ///    An 8-bit integral value used to initialize bits [175:168] of the result.
   4159 /// \param __b09
   4160 ///    An 8-bit integral value used to initialize bits [183:176] of the result.
   4161 /// \param __b08
   4162 ///    An 8-bit integral value used to initialize bits [191:184] of the result.
   4163 /// \param __b07
   4164 ///    An 8-bit integral value used to initialize bits [199:192] of the result.
   4165 /// \param __b06
   4166 ///    An 8-bit integral value used to initialize bits [207:200] of the result.
   4167 /// \param __b05
   4168 ///    An 8-bit integral value used to initialize bits [215:208] of the result.
   4169 /// \param __b04
   4170 ///    An 8-bit integral value used to initialize bits [223:216] of the result.
   4171 /// \param __b03
   4172 ///    An 8-bit integral value used to initialize bits [231:224] of the result.
   4173 /// \param __b02
   4174 ///    An 8-bit integral value used to initialize bits [239:232] of the result.
   4175 /// \param __b01
   4176 ///    An 8-bit integral value used to initialize bits [247:240] of the result.
   4177 /// \param __b00
   4178 ///    An 8-bit integral value used to initialize bits [255:248] of the result.
   4179 /// \returns An initialized 256-bit integer vector.
   4180 static __inline __m256i __DEFAULT_FN_ATTRS
   4181 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
   4182                  char __b27, char __b26, char __b25, char __b24,
   4183                  char __b23, char __b22, char __b21, char __b20,
   4184                  char __b19, char __b18, char __b17, char __b16,
   4185                  char __b15, char __b14, char __b13, char __b12,
   4186                  char __b11, char __b10, char __b09, char __b08,
   4187                  char __b07, char __b06, char __b05, char __b04,
   4188                  char __b03, char __b02, char __b01, char __b00) {
   4189   return __extension__ (__m256i)(__v32qi){ /* first argument = lowest byte */
   4190       __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
   4191       __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
   4192       __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
   4193       __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
   4194 }
   4195 
   4196 /// Constructs a 256-bit integer vector, initialized in reverse order
   4197 ///    with the specified 64-bit integral values.
   4198 ///
   4199 /// \headerfile <x86intrin.h>
   4200 ///
   4201 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
   4202 ///   instruction.
   4203 ///
   4204 /// \param __a
   4205 ///    A 64-bit integral value used to initialize bits [63:0] of the result.
   4206 /// \param __b
   4207 ///    A 64-bit integral value used to initialize bits [127:64] of the result.
   4208 /// \param __c
   4209 ///    A 64-bit integral value used to initialize bits [191:128] of the result.
   4210 /// \param __d
   4211 ///    A 64-bit integral value used to initialize bits [255:192] of the result.
   4212 /// \returns An initialized 256-bit integer vector.
   4213 static __inline __m256i __DEFAULT_FN_ATTRS
   4214 _mm256_setr_epi64x(long long __a, long long __b,
   4215                    long long __c, long long __d) {
   4216   return __extension__ (__m256i)(__v4di){ __a, __b, __c, __d };
   4217 }
   4218 
   4219 /* Create vectors with repeated elements */
   4220 /// Constructs a 256-bit floating-point vector of [4 x double], with each
   4221 ///    of the four double-precision floating-point vector elements set to the
   4222 ///    specified double-precision floating-point value.
   4223 ///
   4224 /// \headerfile <x86intrin.h>
   4225 ///
   4226 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
   4227 ///
   4228 /// \param __w
   4229 ///    A double-precision floating-point value used to initialize each vector
   4230 ///    element of the result.
   4231 /// \returns An initialized 256-bit floating-point vector of [4 x double].
   4232 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
   4233 _mm256_set1_pd(double __w) {
   4234   /* Every lane receives the same scalar. */
   4235   return __extension__ (__m256d){ __w, __w, __w, __w };
   4236 }
   4237 
   4238 /// Constructs a 256-bit floating-point vector of [8 x float], with each
   4239 ///    of the eight single-precision floating-point vector elements set to the
   4240 ///    specified single-precision floating-point value.
   4241 ///
   4242 /// \headerfile <x86intrin.h>
   4243 ///
   4244 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
   4245 ///   instruction.
   4246 ///
   4247 /// \param __w
   4248 ///    A single-precision floating-point value used to initialize each vector
   4249 ///    element of the result.
   4250 /// \returns An initialized 256-bit floating-point vector of [8 x float].
   4251 static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
   4252 _mm256_set1_ps(float __w) {
   4253   /* Replicate the scalar across all eight lanes. */
   4254   return __extension__ (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
   4255 }
   4256 
   4257 /// Constructs a 256-bit integer vector of [8 x i32], with each of the
   4258 ///    32-bit integral vector elements set to the specified 32-bit integral
   4259 ///    value.
   4260 ///
   4261 /// \headerfile <x86intrin.h>
   4262 ///
   4263 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
   4264 ///   instruction.
   4265 ///
   4266 /// \param __i
   4267 ///    A 32-bit integral value used to initialize each vector element of the
   4268 ///    result.
   4269 /// \returns An initialized 256-bit integer vector of [8 x i32].
   4270 static __inline __m256i __DEFAULT_FN_ATTRS
   4271 _mm256_set1_epi32(int __i) {
   4272   return __extension__ (__m256i)(__v8si){ /* eight identical lanes */
   4273       __i, __i, __i, __i, __i, __i, __i, __i };
   4274 }
   4275 
   4276 /// Constructs a 256-bit integer vector of [16 x i16], with each of the
   4277 ///    16-bit integral vector elements set to the specified 16-bit integral
   4278 ///    value.
   4279 ///
   4280 /// \headerfile <x86intrin.h>
   4281 ///
   4282 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
   4283 ///
   4284 /// \param __w
   4285 ///    A 16-bit integral value used to initialize each vector element of the
   4286 ///    result.
   4287 /// \returns An initialized 256-bit integer vector of [16 x i16].
   4288 static __inline __m256i __DEFAULT_FN_ATTRS
   4289 _mm256_set1_epi16(short __w) {
   4290   return __extension__ (__m256i)(__v16hi){ /* 16 identical lanes */
   4291       __w, __w, __w, __w, __w, __w, __w, __w,
   4292       __w, __w, __w, __w, __w, __w, __w, __w };
   4293 }
   4294 
   4295 /// Constructs a 256-bit integer vector of [32 x i8], with each of the
   4296 ///    8-bit integral vector elements set to the specified 8-bit integral value.
   4297 ///
   4298 /// \headerfile <x86intrin.h>
   4299 ///
   4300 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
   4301 ///
   4302 /// \param __b
   4303 ///    An 8-bit integral value used to initialize each vector element of the
   4304 ///    result.
   4305 /// \returns An initialized 256-bit integer vector of [32 x i8].
   4306 static __inline __m256i __DEFAULT_FN_ATTRS
   4307 _mm256_set1_epi8(char __b) {
   4308   return __extension__ (__m256i)(__v32qi){ /* 32 identical lanes */
   4309       __b, __b, __b, __b, __b, __b, __b, __b,
   4310       __b, __b, __b, __b, __b, __b, __b, __b,
   4311       __b, __b, __b, __b, __b, __b, __b, __b,
   4312       __b, __b, __b, __b, __b, __b, __b, __b };
   4313 }
   4314 
   4315 /// Constructs a 256-bit integer vector of [4 x i64], with each of the
   4316 ///    64-bit integral vector elements set to the specified 64-bit integral
   4317 ///    value.
   4318 ///
   4319 /// \headerfile <x86intrin.h>
   4320 ///
   4321 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
   4322 ///
   4323 /// \param __q
   4324 ///    A 64-bit integral value used to initialize each vector element of the
   4325 ///    result.
   4326 /// \returns An initialized 256-bit integer vector of [4 x i64].
   4327 static __inline __m256i __DEFAULT_FN_ATTRS
   4328 _mm256_set1_epi64x(long long __q) {
   4329   /* Same 64-bit value in each of the four lanes. */
   4330   return __extension__ (__m256i)(__v4di){ __q, __q, __q, __q };
   4331 }
   4332 
   4333 /* Create zeroed vectors */
   4334 /// Constructs a 256-bit floating-point vector of [4 x double] with all
   4335 ///    vector elements initialized to zero.
   4336 ///
   4337 /// \headerfile <x86intrin.h>
   4338 ///
   4339 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4340 ///
   4341 /// \returns A 256-bit vector of [4 x double] with all elements set to zero.
   4342 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void) {
   4343   return _mm256_set1_pd(0.0);
   4344 }
   4345 
   4346 /// Constructs a 256-bit floating-point vector of [8 x float] with all
   4347 ///    vector elements initialized to zero.
   4348 ///
   4349 /// \headerfile <x86intrin.h>
   4350 ///
   4351 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4352 ///
   4353 /// \returns A 256-bit vector of [8 x float] with all elements set to zero.
   4354 static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void) {
   4355   return _mm256_set1_ps(0.0f);
   4356 }
   4357 
   4358 /// Constructs a 256-bit integer vector initialized to zero.
   4359 ///
   4360 /// \headerfile <x86intrin.h>
   4361 ///
   4362 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
   4363 ///
   4364 /// \returns A 256-bit integer vector initialized to zero.
   4365 static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
   4366 _mm256_setzero_si256(void) { /* all 256 bits cleared */
   4367   return __extension__ (__m256i)(__v4di){ 0LL, 0LL, 0LL, 0LL };
   4368 }
   4369 
   4370 /* Cast between vector types */
   4371 /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
   4372 ///    floating-point vector of [8 x float].
   4373 ///
   4374 /// \headerfile <x86intrin.h>
   4375 ///
   4376 /// This intrinsic has no corresponding instruction.
   4377 ///
   4378 /// \param __a
   4379 ///    A 256-bit floating-point vector of [4 x double].
   4380 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
   4381 ///    bitwise pattern as the parameter.
   4382 static __inline __m256 __DEFAULT_FN_ATTRS
   4383 _mm256_castpd_ps(__m256d __a) {
   4384   __m256 __as_ps = (__m256)__a; /* same bits, viewed as [8 x float] */
   4385   return __as_ps;
   4386 }
   4387 
   4388 /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
   4389 ///    integer vector.
   4390 ///
   4391 /// \headerfile <x86intrin.h>
   4392 ///
   4393 /// This intrinsic has no corresponding instruction.
   4394 ///
   4395 /// \param __a
   4396 ///    A 256-bit floating-point vector of [4 x double].
   4397 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
   4398 ///    parameter.
   4399 static __inline __m256i __DEFAULT_FN_ATTRS
   4400 _mm256_castpd_si256(__m256d __a) {
   4401   __m256i __as_int = (__m256i)__a; /* same bits, viewed as integers */
   4402   return __as_int;
   4403 }
   4404 
   4405 /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
   4406 ///    floating-point vector of [4 x double].
   4407 ///
   4408 /// \headerfile <x86intrin.h>
   4409 ///
   4410 /// This intrinsic has no corresponding instruction.
   4411 ///
   4412 /// \param __a
   4413 ///    A 256-bit floating-point vector of [8 x float].
   4414 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
   4415 ///    bitwise pattern as the parameter.
   4416 static __inline __m256d __DEFAULT_FN_ATTRS
   4417 _mm256_castps_pd(__m256 __a) {
   4418   __m256d __as_pd = (__m256d)__a; /* same bits, viewed as [4 x double] */
   4419   return __as_pd;
   4420 }
   4421 
   4422 /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
   4423 ///    integer vector.
   4424 ///
   4425 /// \headerfile <x86intrin.h>
   4426 ///
   4427 /// This intrinsic has no corresponding instruction.
   4428 ///
   4429 /// \param __a
   4430 ///    A 256-bit floating-point vector of [8 x float].
   4431 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
   4432 ///    parameter.
   4433 static __inline __m256i __DEFAULT_FN_ATTRS
   4434 _mm256_castps_si256(__m256 __a) {
   4435   __m256i __as_int = (__m256i)__a; /* same bits, viewed as integers */
   4436   return __as_int;
   4437 }
   4438 
   4439 /// Casts a 256-bit integer vector into a 256-bit floating-point vector
   4440 ///    of [8 x float].
   4441 ///
   4442 /// \headerfile <x86intrin.h>
   4443 ///
   4444 /// This intrinsic has no corresponding instruction.
   4445 ///
   4446 /// \param __a
   4447 ///    A 256-bit integer vector.
   4448 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
   4449 ///    bitwise pattern as the parameter.
   4450 static __inline __m256 __DEFAULT_FN_ATTRS
   4451 _mm256_castsi256_ps(__m256i __a) {
   4452   __m256 __as_ps = (__m256)__a; /* same bits, viewed as [8 x float] */
   4453   return __as_ps;
   4454 }
   4455 
   4456 /// Casts a 256-bit integer vector into a 256-bit floating-point vector
   4457 ///    of [4 x double].
   4458 ///
   4459 /// \headerfile <x86intrin.h>
   4460 ///
   4461 /// This intrinsic has no corresponding instruction.
   4462 ///
   4463 /// \param __a
   4464 ///    A 256-bit integer vector.
   4465 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
   4466 ///    bitwise pattern as the parameter.
   4467 static __inline __m256d __DEFAULT_FN_ATTRS
   4468 _mm256_castsi256_pd(__m256i __a) {
   4469   __m256d __as_pd = (__m256d)__a; /* same bits, viewed as [4 x double] */
   4470   return __as_pd;
   4471 }
   4472 
   4473 /// Returns the lower 128 bits of a 256-bit floating-point vector of
   4474 ///    [4 x double] as a 128-bit floating-point vector of [2 x double].
   4475 ///
   4476 /// \headerfile <x86intrin.h>
   4477 ///
   4478 /// This intrinsic has no corresponding instruction.
   4479 ///
   4480 /// \param __a
   4481 ///    A 256-bit floating-point vector of [4 x double].
   4482 /// \returns A 128-bit floating-point vector of [2 x double] containing the
   4483 ///    lower 128 bits of the parameter.
   4484 static __inline __m128d __DEFAULT_FN_ATTRS
   4485 _mm256_castpd256_pd128(__m256d __a) {
   4486   __v4df __src = (__v4df)__a;
   4487   return __builtin_shufflevector(__src, __src, 0, 1); /* keep lanes 0-1 */
   4488 }
   4489 
   4490 /// Returns the lower 128 bits of a 256-bit floating-point vector of
   4491 ///    [8 x float] as a 128-bit floating-point vector of [4 x float].
   4492 ///
   4493 /// \headerfile <x86intrin.h>
   4494 ///
   4495 /// This intrinsic has no corresponding instruction.
   4496 ///
   4497 /// \param __a
   4498 ///    A 256-bit floating-point vector of [8 x float].
   4499 /// \returns A 128-bit floating-point vector of [4 x float] containing the
   4500 ///    lower 128 bits of the parameter.
   4501 static __inline __m128 __DEFAULT_FN_ATTRS
   4502 _mm256_castps256_ps128(__m256 __a) {
   4503   __v8sf __src = (__v8sf)__a;
   4504   return __builtin_shufflevector(__src, __src, 0, 1, 2, 3); /* lanes 0-3 */
   4505 }
   4506 
   4507 /// Truncates a 256-bit integer vector into a 128-bit integer vector.
   4508 ///
   4509 /// \headerfile <x86intrin.h>
   4510 ///
   4511 /// This intrinsic has no corresponding instruction.
   4512 ///
   4513 /// \param __a
   4514 ///    A 256-bit integer vector.
   4515 /// \returns A 128-bit integer vector containing the lower 128 bits of the
   4516 ///    parameter.
   4517 static __inline __m128i __DEFAULT_FN_ATTRS
   4518 _mm256_castsi256_si128(__m256i __a) {
   4519   __v4di __src = (__v4di)__a;
   4520   return __builtin_shufflevector(__src, __src, 0, 1); /* low 128 bits */
   4521 }
   4522 
   4523 /// Constructs a 256-bit floating-point vector of [4 x double] from a
   4524 ///    128-bit floating-point vector of [2 x double].
   4525 ///
   4526 ///    The lower 128 bits contain the value of the source vector. The contents
   4527 ///    of the upper 128 bits are undefined.
   4528 ///
   4529 /// \headerfile <x86intrin.h>
   4530 ///
   4531 /// This intrinsic has no corresponding instruction.
   4532 ///
   4533 /// \param __a
   4534 ///    A 128-bit vector of [2 x double].
   4535 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
   4536 ///    contain the value of the parameter. The contents of the upper 128 bits
   4537 ///    are undefined.
   4538 static __inline __m256d __DEFAULT_FN_ATTRS
   4539 _mm256_castpd128_pd256(__m128d __a) {
   4540   /* Lanes 2-3 take an unspecified value; they are not zeroed. */
   4541   __v2df __hi = (__v2df)__builtin_nondeterministic_value(__a);
   4542   return __builtin_shufflevector((__v2df)__a, __hi, 0, 1, 2, 3);
   4543 }
   4544 
   4545 /// Constructs a 256-bit floating-point vector of [8 x float] from a
   4546 ///    128-bit floating-point vector of [4 x float].
   4547 ///
   4548 ///    The lower 128 bits contain the value of the source vector. The contents
   4549 ///    of the upper 128 bits are undefined.
   4550 ///
   4551 /// \headerfile <x86intrin.h>
   4552 ///
   4553 /// This intrinsic has no corresponding instruction.
   4554 ///
   4555 /// \param __a
   4556 ///    A 128-bit vector of [4 x float].
   4557 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
   4558 ///    contain the value of the parameter. The contents of the upper 128 bits
   4559 ///    are undefined.
   4560 static __inline __m256 __DEFAULT_FN_ATTRS
   4561 _mm256_castps128_ps256(__m128 __a) {
   4562   /* Lanes 4-7 take an unspecified value, so callers must not rely on
   4563    * the upper 128 bits. */
   4564   __v4sf __hi = (__v4sf)__builtin_nondeterministic_value(__a);
   4565   return __builtin_shufflevector((__v4sf)__a, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
   4566 }
   4567 
   4568 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
   4569 ///
   4570 ///    The lower 128 bits contain the value of the source vector. The contents
   4571 ///    of the upper 128 bits are undefined.
   4572 ///
   4573 /// \headerfile <x86intrin.h>
   4574 ///
   4575 /// This intrinsic has no corresponding instruction.
   4576 ///
   4577 /// \param __a
   4578 ///    A 128-bit integer vector.
   4579 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
   4580 ///    the parameter. The contents of the upper 128 bits are undefined.
   4581 static __inline __m256i __DEFAULT_FN_ATTRS
   4582 _mm256_castsi128_si256(__m128i __a) {
   4583   /* The upper 128 bits take an unspecified value; they are not zeroed. */
   4584   __v2di __hi = (__v2di)__builtin_nondeterministic_value(__a);
   4585   return __builtin_shufflevector((__v2di)__a, __hi, 0, 1, 2, 3);
   4586 }
   4587 
   4588 /// Constructs a 256-bit floating-point vector of [4 x double] from a
   4589 ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
   4590 ///    contain the value of the source vector. The upper 128 bits are set
   4591 ///    to zero.
   4592 ///
   4593 /// \headerfile <x86intrin.h>
   4594 ///
   4595 /// This intrinsic has no corresponding instruction.
   4596 ///
   4597 /// \param __a
   4598 ///    A 128-bit vector of [2 x double].
   4599 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
   4600 ///    contain the value of the parameter. The upper 128 bits are set to zero.
   4601 static __inline __m256d __DEFAULT_FN_ATTRS
   4602 _mm256_zextpd128_pd256(__m128d __a) {
   4603   __v2df __zero = (__v2df)_mm_setzero_pd(); /* supplies lanes 2-3 */
   4604   return __builtin_shufflevector((__v2df)__a, __zero, 0, 1, 2, 3);
   4605 }
   4606 
   4607 /// Constructs a 256-bit floating-point vector of [8 x float] from a
   4608 ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
   4609 ///    the value of the source vector. The upper 128 bits are set to zero.
   4610 ///
   4611 /// \headerfile <x86intrin.h>
   4612 ///
   4613 /// This intrinsic has no corresponding instruction.
   4614 ///
   4615 /// \param __a
   4616 ///    A 128-bit vector of [4 x float].
   4617 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
   4618 ///    contain the value of the parameter. The upper 128 bits are set to zero.
   4619 static __inline __m256 __DEFAULT_FN_ATTRS
   4620 _mm256_zextps128_ps256(__m128 __a) {
   4621   __v4sf __zero = (__v4sf)_mm_setzero_ps(); /* supplies lanes 4-7 */
   4622   return __builtin_shufflevector((__v4sf)__a, __zero, 0, 1, 2, 3, 4, 5, 6, 7);
   4623 }
   4624 
   4625 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
   4626 ///    The lower 128 bits contain the value of the source vector. The upper
   4627 ///    128 bits are set to zero.
   4628 ///
   4629 /// \headerfile <x86intrin.h>
   4630 ///
   4631 /// This intrinsic has no corresponding instruction.
   4632 ///
   4633 /// \param __a
   4634 ///    A 128-bit integer vector.
   4635 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
   4636 ///    the parameter. The upper 128 bits are set to zero.
   4637 static __inline __m256i __DEFAULT_FN_ATTRS
   4638 _mm256_zextsi128_si256(__m128i __a) {
   4639   __v2di __zero = (__v2di)_mm_setzero_si128(); /* supplies upper half */
   4640   return __builtin_shufflevector((__v2di)__a, __zero, 0, 1, 2, 3);
   4641 }
   4642 
   4643 /*
   4644    Vector insert.
   4645    We use macros rather than inlines because we only want to accept
   4646    invocations where the immediate M is a constant expression.
   4647 */
   4648 /// Constructs a new 256-bit vector of [8 x float] by first duplicating
   4649 ///    a 256-bit vector of [8 x float] given in the first parameter, and then
   4650 ///    replacing either the upper or the lower 128 bits with the contents of a
   4651 ///    128-bit vector of [4 x float] in the second parameter.
   4652 ///
   4653 ///    The immediate integer parameter selects whether the upper or the lower
   4654 ///    128 bits are replaced.
   4655 ///
   4656 /// \headerfile <x86intrin.h>
   4657 ///
   4658 /// \code
   4659 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
   4660 /// \endcode
   4661 ///
   4662 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4663 ///
   4664 /// \param V1
   4665 ///    A 256-bit vector of [8 x float]. This vector is copied to the result
   4666 ///    first, and then either the upper or the lower 128 bits of the result will
   4667 ///    be replaced by the contents of \a V2.
   4668 /// \param V2
   4669 ///    A 128-bit vector of [4 x float]. The contents of this parameter are
   4670 ///    written to either the upper or the lower 128 bits of the result depending
   4671 ///    on the value of parameter \a M.
   4672 /// \param M
   4673 ///    An immediate integer. The least significant bit determines how the values
   4674 ///    from the two parameters are combined: \n
   4675 ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
   4676 ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   4677 ///    result. \n
   4678 ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   4679 ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
   4680 ///    result.
   4681 /// \returns A 256-bit vector of [8 x float] containing the combined values.
   4682 #define _mm256_insertf128_ps(V1, V2, M) \
   4683   ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
   4684                                             (__v4sf)(__m128)(V2), (int)(M)))
   4685 
   4686 /// Constructs a new 256-bit vector of [4 x double] by first duplicating
   4687 ///    a 256-bit vector of [4 x double] given in the first parameter, and then
   4688 ///    replacing either the upper or the lower 128 bits with the contents of a
   4689 ///    128-bit vector of [2 x double] in the second parameter.
   4690 ///
   4691 ///    The immediate integer parameter selects whether the upper or the lower
   4692 ///    128 bits are replaced.
   4693 ///
   4694 /// \headerfile <x86intrin.h>
   4695 ///
   4696 /// \code
   4697 /// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
   4698 /// \endcode
   4699 ///
   4700 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4701 ///
   4702 /// \param V1
   4703 ///    A 256-bit vector of [4 x double]. This vector is copied to the result
   4704 ///    first, and then either the upper or the lower 128 bits of the result will
   4705 ///    be replaced by the contents of \a V2.
   4706 /// \param V2
   4707 ///    A 128-bit vector of [2 x double]. The contents of this parameter are
   4708 ///    written to either the upper or the lower 128 bits of the result depending
   4709 ///    on the value of parameter \a M.
   4710 /// \param M
   4711 ///    An immediate integer. The least significant bit determines how the values
   4712 ///    from the two parameters are combined: \n
   4713 ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
   4714 ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   4715 ///    result. \n
   4716 ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   4717 ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
   4718 ///    result.
   4719 /// \returns A 256-bit vector of [4 x double] containing the combined values.
   4720 #define _mm256_insertf128_pd(V1, V2, M) \
   4721   ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
   4722                                              (__v2df)(__m128d)(V2), (int)(M)))
   4723 
   4724 /// Constructs a new 256-bit integer vector by first duplicating a
   4725 ///    256-bit integer vector given in the first parameter, and then replacing
   4726 ///    either the upper or the lower 128 bits with the contents of a 128-bit
   4727 ///    integer vector in the second parameter.
   4728 ///
   4729 ///    The immediate integer parameter selects whether the upper or the lower
   4730 ///    128 bits are replaced.
   4731 ///
   4732 /// \headerfile <x86intrin.h>
   4733 ///
   4734 /// \code
   4735 /// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
   4736 /// \endcode
   4737 ///
   4738 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4739 ///
   4740 /// \param V1
   4741 ///    A 256-bit integer vector. This vector is copied to the result first, and
   4742 ///    then either the upper or the lower 128 bits of the result will be
   4743 ///    replaced by the contents of \a V2.
   4744 /// \param V2
   4745 ///    A 128-bit integer vector. The contents of this parameter are written to
   4746 ///    either the upper or the lower 128 bits of the result depending on the
   4747 ///    value of parameter \a M.
   4748 /// \param M
   4749 ///    An immediate integer. The least significant bit determines how the values
   4750 ///    from the two parameters are combined: \n
   4751 ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
   4752 ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
   4753 ///    result. \n
   4754 ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
   4755 ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
   4756 ///    result.
   4757 /// \returns A 256-bit integer vector containing the combined values.
   4758 #define _mm256_insertf128_si256(V1, V2, M) \
   4759   ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
   4760                                              (__v4si)(__m128i)(V2), (int)(M)))
   4761 
   4762 /*
   4763    Vector extract.
   4764    We use macros rather than inlines because we only want to accept
   4765    invocations where the immediate M is a constant expression.
   4766 */
   4767 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
   4768 ///    of [8 x float], as determined by the immediate integer parameter, and
   4769 ///    returns the extracted bits as a 128-bit vector of [4 x float].
   4770 ///
   4771 /// \headerfile <x86intrin.h>
   4772 ///
   4773 /// \code
   4774 /// __m128 _mm256_extractf128_ps(__m256 V, const int M);
   4775 /// \endcode
   4776 ///
   4777 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4778 ///
   4779 /// \param V
   4780 ///    A 256-bit vector of [8 x float].
   4781 /// \param M
   4782 ///    An immediate integer. The least significant bit determines which bits are
   4783 ///    extracted from the first parameter: \n
   4784 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4785 ///    result. \n
   4786 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4787 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
   4788 #define _mm256_extractf128_ps(V, M) \
   4789   ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
   4790 
   4791 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
   4792 ///    of [4 x double], as determined by the immediate integer parameter, and
   4793 ///    returns the extracted bits as a 128-bit vector of [2 x double].
   4794 ///
   4795 /// \headerfile <x86intrin.h>
   4796 ///
   4797 /// \code
   4798 /// __m128d _mm256_extractf128_pd(__m256d V, const int M);
   4799 /// \endcode
   4800 ///
   4801 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4802 ///
   4803 /// \param V
   4804 ///    A 256-bit vector of [4 x double].
   4805 /// \param M
   4806 ///    An immediate integer. The least significant bit determines which bits are
   4807 ///    extracted from the first parameter: \n
   4808 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4809 ///    result. \n
   4810 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4811 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
   4812 #define _mm256_extractf128_pd(V, M) \
   4813   ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
   4814 
   4815 /// Extracts either the upper or the lower 128 bits from a 256-bit
   4816 ///    integer vector, as determined by the immediate integer parameter, and
   4817 ///    returns the extracted bits as a 128-bit integer vector.
   4818 ///
   4819 /// \headerfile <x86intrin.h>
   4820 ///
   4821 /// \code
   4822 /// __m128i _mm256_extractf128_si256(__m256i V, const int M);
   4823 /// \endcode
   4824 ///
   4825 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
   4826 ///
   4827 /// \param V
   4828 ///    A 256-bit integer vector.
   4829 /// \param M
   4830 ///    An immediate integer. The least significant bit determines which bits are
   4831 ///    extracted from the first parameter:  \n
   4832 ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
   4833 ///    result. \n
   4834 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
   4835 /// \returns A 128-bit integer vector containing the extracted bits.
   4836 #define _mm256_extractf128_si256(V, M) \
   4837   ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
   4838 
   4839 /// Constructs a 256-bit floating-point vector of [8 x float] by
   4840 ///    concatenating two 128-bit floating-point vectors of [4 x float].
   4841 ///
   4842 /// \headerfile <x86intrin.h>
   4843 ///
   4844 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4845 ///
   4846 /// \param __hi
   4847 ///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
   4848 ///    128 bits of the result.
   4849 /// \param __lo
   4850 ///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
   4851 ///    128 bits of the result.
   4852 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4853 ///    concatenated result.
   4854 static __inline __m256 __DEFAULT_FN_ATTRS
   4855 _mm256_set_m128 (__m128 __hi, __m128 __lo)
   4856 {
   4857   return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
   4858 }
   4859 
   4860 /// Constructs a 256-bit floating-point vector of [4 x double] by
   4861 ///    concatenating two 128-bit floating-point vectors of [2 x double].
   4862 ///
   4863 /// \headerfile <x86intrin.h>
   4864 ///
   4865 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4866 ///
   4867 /// \param __hi
   4868 ///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
   4869 ///    128 bits of the result.
   4870 /// \param __lo
   4871 ///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
   4872 ///    128 bits of the result.
   4873 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   4874 ///    concatenated result.
   4875 static __inline __m256d __DEFAULT_FN_ATTRS
   4876 _mm256_set_m128d (__m128d __hi, __m128d __lo)
   4877 {
   4878   return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
   4879 }
   4880 
   4881 /// Constructs a 256-bit integer vector by concatenating two 128-bit
   4882 ///    integer vectors.
   4883 ///
   4884 /// \headerfile <x86intrin.h>
   4885 ///
   4886 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4887 ///
   4888 /// \param __hi
   4889 ///    A 128-bit integer vector to be copied to the upper 128 bits of the
   4890 ///    result.
   4891 /// \param __lo
   4892 ///    A 128-bit integer vector to be copied to the lower 128 bits of the
   4893 ///    result.
   4894 /// \returns A 256-bit integer vector containing the concatenated result.
   4895 static __inline __m256i __DEFAULT_FN_ATTRS
   4896 _mm256_set_m128i (__m128i __hi, __m128i __lo)
   4897 {
   4898   return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
   4899 }
   4900 
   4901 /// Constructs a 256-bit floating-point vector of [8 x float] by
   4902 ///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
   4903 ///    similar to _mm256_set_m128, but the order of the input parameters is
   4904 ///    swapped.
   4905 ///
   4906 /// \headerfile <x86intrin.h>
   4907 ///
   4908 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4909 ///
   4910 /// \param __lo
   4911 ///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
   4912 ///    128 bits of the result.
   4913 /// \param __hi
   4914 ///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
   4915 ///    128 bits of the result.
   4916 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4917 ///    concatenated result.
   4918 static __inline __m256 __DEFAULT_FN_ATTRS
   4919 _mm256_setr_m128 (__m128 __lo, __m128 __hi)
   4920 {
   4921   return _mm256_set_m128(__hi, __lo);
   4922 }
   4923 
   4924 /// Constructs a 256-bit floating-point vector of [4 x double] by
   4925 ///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
   4926 ///    similar to _mm256_set_m128d, but the order of the input parameters is
   4927 ///    swapped.
   4928 ///
   4929 /// \headerfile <x86intrin.h>
   4930 ///
   4931 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4932 ///
   4933 /// \param __lo
   4934 ///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
   4935 ///    128 bits of the result.
   4936 /// \param __hi
   4937 ///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
   4938 ///    128 bits of the result.
   4939 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   4940 ///    concatenated result.
   4941 static __inline __m256d __DEFAULT_FN_ATTRS
   4942 _mm256_setr_m128d (__m128d __lo, __m128d __hi)
   4943 {
   4944   return (__m256d)_mm256_set_m128d(__hi, __lo);
   4945 }
   4946 
   4947 /// Constructs a 256-bit integer vector by concatenating two 128-bit
   4948 ///    integer vectors. This is similar to _mm256_set_m128i, but the order of
   4949 ///    the input parameters is swapped.
   4950 ///
   4951 /// \headerfile <x86intrin.h>
   4952 ///
   4953 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
   4954 ///
   4955 /// \param __lo
   4956 ///    A 128-bit integer vector to be copied to the lower 128 bits of the
   4957 ///    result.
   4958 /// \param __hi
   4959 ///    A 128-bit integer vector to be copied to the upper 128 bits of the
   4960 ///    result.
   4961 /// \returns A 256-bit integer vector containing the concatenated result.
   4962 static __inline __m256i __DEFAULT_FN_ATTRS
   4963 _mm256_setr_m128i (__m128i __lo, __m128i __hi)
   4964 {
   4965   return (__m256i)_mm256_set_m128i(__hi, __lo);
   4966 }
   4967 
   4968 /* SIMD load ops (unaligned) */
   4969 /// Loads two 128-bit floating-point vectors of [4 x float] from
   4970 ///    unaligned memory locations and constructs a 256-bit floating-point vector
   4971 ///    of [8 x float] by concatenating the two 128-bit vectors.
   4972 ///
   4973 /// \headerfile <x86intrin.h>
   4974 ///
   4975 /// This intrinsic corresponds to load instructions followed by the
   4976 ///   <c> VINSERTF128 </c> instruction.
   4977 ///
   4978 /// \param __addr_hi
   4979 ///    A pointer to a 128-bit memory location containing 4 consecutive
   4980 ///    single-precision floating-point values. These values are to be copied to
   4981 ///    bits[255:128] of the result. The address of the memory location does not
   4982 ///    have to be aligned.
   4983 /// \param __addr_lo
   4984 ///    A pointer to a 128-bit memory location containing 4 consecutive
   4985 ///    single-precision floating-point values. These values are to be copied to
   4986 ///    bits[127:0] of the result. The address of the memory location does not
   4987 ///    have to be aligned.
   4988 /// \returns A 256-bit floating-point vector of [8 x float] containing the
   4989 ///    concatenated result.
   4990 static __inline __m256 __DEFAULT_FN_ATTRS
   4991 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
   4992 {
   4993   return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
   4994 }
   4995 
   4996 /// Loads two 128-bit floating-point vectors of [2 x double] from
   4997 ///    unaligned memory locations and constructs a 256-bit floating-point vector
   4998 ///    of [4 x double] by concatenating the two 128-bit vectors.
   4999 ///
   5000 /// \headerfile <x86intrin.h>
   5001 ///
   5002 /// This intrinsic corresponds to load instructions followed by the
   5003 ///   <c> VINSERTF128 </c> instruction.
   5004 ///
   5005 /// \param __addr_hi
   5006 ///    A pointer to a 128-bit memory location containing two consecutive
   5007 ///    double-precision floating-point values. These values are to be copied to
   5008 ///    bits[255:128] of the result. The address of the memory location does not
   5009 ///    have to be aligned.
   5010 /// \param __addr_lo
   5011 ///    A pointer to a 128-bit memory location containing two consecutive
   5012 ///    double-precision floating-point values. These values are to be copied to
   5013 ///    bits[127:0] of the result. The address of the memory location does not
   5014 ///    have to be aligned.
   5015 /// \returns A 256-bit floating-point vector of [4 x double] containing the
   5016 ///    concatenated result.
   5017 static __inline __m256d __DEFAULT_FN_ATTRS
   5018 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
   5019 {
   5020   return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
   5021 }
   5022 
   5023 /// Loads two 128-bit integer vectors from unaligned memory locations and
   5024 ///    constructs a 256-bit integer vector by concatenating the two 128-bit
   5025 ///    vectors.
   5026 ///
   5027 /// \headerfile <x86intrin.h>
   5028 ///
   5029 /// This intrinsic corresponds to load instructions followed by the
   5030 ///   <c> VINSERTF128 </c> instruction.
   5031 ///
   5032 /// \param __addr_hi
   5033 ///    A pointer to a 128-bit memory location containing a 128-bit integer
   5034 ///    vector. This vector is to be copied to bits[255:128] of the result. The
   5035 ///    address of the memory location does not have to be aligned.
   5036 /// \param __addr_lo
   5037 ///    A pointer to a 128-bit memory location containing a 128-bit integer
   5038 ///    vector. This vector is to be copied to bits[127:0] of the result. The
   5039 ///    address of the memory location does not have to be aligned.
   5040 /// \returns A 256-bit integer vector containing the concatenated result.
   5041 static __inline __m256i __DEFAULT_FN_ATTRS
   5042 _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
   5043 {
   5044    return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
   5045 }
   5046 
   5047 /* SIMD store ops (unaligned) */
   5048 /// Stores the upper and lower 128 bits of a 256-bit floating-point
   5049 ///    vector of [8 x float] into two different unaligned memory locations.
   5050 ///
   5051 /// \headerfile <x86intrin.h>
   5052 ///
   5053 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   5054 ///   store instructions.
   5055 ///
   5056 /// \param __addr_hi
   5057 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   5058 ///    copied to this memory location. The address of this memory location does
   5059 ///    not have to be aligned.
   5060 /// \param __addr_lo
   5061 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   5062 ///    copied to this memory location. The address of this memory location does
   5063 ///    not have to be aligned.
   5064 /// \param __a
   5065 ///    A 256-bit floating-point vector of [8 x float].
   5066 static __inline void __DEFAULT_FN_ATTRS
   5067 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
   5068 {
   5069   __m128 __v128;
   5070 
   5071   __v128 = _mm256_castps256_ps128(__a);
   5072   _mm_storeu_ps(__addr_lo, __v128);
   5073   __v128 = _mm256_extractf128_ps(__a, 1);
   5074   _mm_storeu_ps(__addr_hi, __v128);
   5075 }
   5076 
   5077 /// Stores the upper and lower 128 bits of a 256-bit floating-point
   5078 ///    vector of [4 x double] into two different unaligned memory locations.
   5079 ///
   5080 /// \headerfile <x86intrin.h>
   5081 ///
   5082 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   5083 ///   store instructions.
   5084 ///
   5085 /// \param __addr_hi
   5086 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   5087 ///    copied to this memory location. The address of this memory location does
   5088 ///    not have to be aligned.
   5089 /// \param __addr_lo
   5090 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   5091 ///    copied to this memory location. The address of this memory location does
   5092 ///    not have to be aligned.
   5093 /// \param __a
   5094 ///    A 256-bit floating-point vector of [4 x double].
   5095 static __inline void __DEFAULT_FN_ATTRS
   5096 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
   5097 {
   5098   __m128d __v128;
   5099 
   5100   __v128 = _mm256_castpd256_pd128(__a);
   5101   _mm_storeu_pd(__addr_lo, __v128);
   5102   __v128 = _mm256_extractf128_pd(__a, 1);
   5103   _mm_storeu_pd(__addr_hi, __v128);
   5104 }
   5105 
   5106 /// Stores the upper and lower 128 bits of a 256-bit integer vector into
   5107 ///    two different unaligned memory locations.
   5108 ///
   5109 /// \headerfile <x86intrin.h>
   5110 ///
   5111 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
   5112 ///   store instructions.
   5113 ///
   5114 /// \param __addr_hi
   5115 ///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
   5116 ///    copied to this memory location. The address of this memory location does
   5117 ///    not have to be aligned.
   5118 /// \param __addr_lo
   5119 ///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
   5120 ///    copied to this memory location. The address of this memory location does
   5121 ///    not have to be aligned.
   5122 /// \param __a
   5123 ///    A 256-bit integer vector.
   5124 static __inline void __DEFAULT_FN_ATTRS
   5125 _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
   5126 {
   5127   __m128i __v128;
   5128 
   5129   __v128 = _mm256_castsi256_si128(__a);
   5130   _mm_storeu_si128(__addr_lo, __v128);
   5131   __v128 = _mm256_extractf128_si256(__a, 1);
   5132   _mm_storeu_si128(__addr_hi, __v128);
   5133 }
   5134 
   5135 #undef __DEFAULT_FN_ATTRS
   5136 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
   5137 #undef __DEFAULT_FN_ATTRS128
   5138 #undef __DEFAULT_FN_ATTRS128_CONSTEXPR
   5139 
   5140 #endif /* __AVXINTRIN_H */