zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx512fintrin.h (392651B) - Raw


      1 /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
     11 #endif
     12 
     13 #ifndef __AVX512FINTRIN_H
     14 #define __AVX512FINTRIN_H
     15 
/* Internal 512-bit (64-byte) vector types, one per element type. The name
 * encodes the lane count and element kind: qi = char, hi = short, si = int,
 * di = long long, sf = float, df = double. */
typedef char __v64qi __attribute__((__vector_size__(64)));
typedef short __v32hi __attribute__((__vector_size__(64)));
typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

/* Unsigned types */
typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v64qs __attribute__((__vector_size__(64)));

/* Interface vector types: 64 bytes wide and declared with 64-byte alignment. */
typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

/* The same vectors declared with an alignment of 1 (the _u suffix presumably
 * stands for "unaligned"). */
typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Mask types. In this file __mmask8 accompanies the 8-lane (64-bit element)
 * operations and __mmask16 the 16-lane (32-bit element) operations. */
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
     43 
/* Rounding mode macros. The four _MM_FROUND_TO_* values occupy the two low
 * bits (0x00-0x03); _MM_FROUND_CUR_DIRECTION is the separate bit 0x04 rather
 * than a fifth value of that two-bit field. */
#define _MM_FROUND_TO_NEAREST_INT   0x00
#define _MM_FROUND_TO_NEG_INF       0x01
#define _MM_FROUND_TO_POS_INF       0x02
#define _MM_FROUND_TO_ZERO          0x03
#define _MM_FROUND_CUR_DIRECTION    0x04
     50 
/* Constants for integer comparison predicates.
 *
 * The enumerators take their implicit values 0..6 in declaration order, so
 * _MM_CMPINT_UNUSED holds the value 3 and keeps _MM_CMPINT_NE at 4.
 * _MM_CMPINT_GE and _MM_CMPINT_GT are preprocessor aliases, not enumerators;
 * they are written inside the enum body only to sit next to the names they
 * alias. */
typedef enum {
    _MM_CMPINT_EQ,      /* Equal */
    _MM_CMPINT_LT,      /* Less than */
    _MM_CMPINT_LE,      /* Less than or Equal */
    _MM_CMPINT_UNUSED,
    _MM_CMPINT_NE,      /* Not Equal */
    _MM_CMPINT_NLT,     /* Not Less than */
#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
    _MM_CMPINT_NLE      /* Not Less than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
} _MM_CMPINT_ENUM;
     63 
/* Named 8-bit permutation constants.
 *
 * Each name spells four selectors out of A, B, C, D. They are packed as four
 * 2-bit fields, leftmost letter in the most significant field, with A = 0,
 * B = 1, C = 2 and D = 3. For example _MM_PERM_DCBA is 0b11100100 = 0xE4 and
 * _MM_PERM_ABCD is 0b00011011 = 0x1B. The table covers all 256 values. */
typedef enum
{
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
  _MM_PERM_DDDD = 0xFF
} _MM_PERM_ENUM;
    153 
/* Mantissa normalization intervals. The enumerators take the implicit values
 * 0..3 in declaration order; in the names, "p" stands for the decimal point
 * (p5 = 0.5, p75 = 0.75, 1p5 = 1.5), as the interval comments show. */
typedef enum
{
  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;
    161 
/* Sign-handling choices that accompany the mantissa normalization intervals.
 * The enumerators take the implicit values 0..2 in declaration order. */
typedef enum
{
  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero,   /* sign = 0             */
  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;
    168 
/* Define the default attributes for the functions in this file.
 *
 * All three force inlining and suppress debug info. They differ in the
 * target string and the declared minimum vector width:
 *   __DEFAULT_FN_ATTRS512  target "avx512f,evex512",    min width 512
 *   __DEFAULT_FN_ATTRS128  target "avx512f,no-evex512", min width 128
 *   __DEFAULT_FN_ATTRS     target "avx512f,no-evex512", no min width
 */
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f,evex512"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,no-evex512")))
    177 
    178 #if defined(__cplusplus) && (__cplusplus >= 201103L)
    179 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
    180 #define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
    181 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
    182 #else
    183 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128
    184 #define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
    185 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS
    186 #endif
    187 
    188 /* Create vectors with repeated elements */
    189 
    190 static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    191 _mm512_setzero_si512(void) {
    192   return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0};
    193 }
    194 
    195 #define _mm512_setzero_epi32 _mm512_setzero_si512
    196 
    197 static __inline__ __m512d __DEFAULT_FN_ATTRS512
    198 _mm512_undefined_pd(void)
    199 {
    200   return (__m512d)__builtin_ia32_undef512();
    201 }
    202 
    203 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    204 _mm512_undefined(void)
    205 {
    206   return (__m512)__builtin_ia32_undef512();
    207 }
    208 
    209 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    210 _mm512_undefined_ps(void)
    211 {
    212   return (__m512)__builtin_ia32_undef512();
    213 }
    214 
    215 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    216 _mm512_undefined_epi32(void)
    217 {
    218   return (__m512i)__builtin_ia32_undef512();
    219 }
    220 
    221 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    222 _mm512_broadcastd_epi32 (__m128i __A)
    223 {
    224   return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
    225                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    226 }
    227 
    228 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    229 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
    230 {
    231   return (__m512i)__builtin_ia32_selectd_512(__M,
    232                                              (__v16si) _mm512_broadcastd_epi32(__A),
    233                                              (__v16si) __O);
    234 }
    235 
    236 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    237 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
    238 {
    239   return (__m512i)__builtin_ia32_selectd_512(__M,
    240                                              (__v16si) _mm512_broadcastd_epi32(__A),
    241                                              (__v16si) _mm512_setzero_si512());
    242 }
    243 
    244 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    245 _mm512_broadcastq_epi64 (__m128i __A)
    246 {
    247   return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
    248                                           0, 0, 0, 0, 0, 0, 0, 0);
    249 }
    250 
    251 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    252 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
    253 {
    254   return (__m512i)__builtin_ia32_selectq_512(__M,
    255                                              (__v8di) _mm512_broadcastq_epi64(__A),
    256                                              (__v8di) __O);
    257 
    258 }
    259 
    260 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    261 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
    262 {
    263   return (__m512i)__builtin_ia32_selectq_512(__M,
    264                                              (__v8di) _mm512_broadcastq_epi64(__A),
    265                                              (__v8di) _mm512_setzero_si512());
    266 }
    267 
    268 static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void) {
    269   return __extension__(__m512){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
    270                                0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    271 }
    272 
    273 #define _mm512_setzero _mm512_setzero_ps
    274 
    275 static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
    276 _mm512_setzero_pd(void) {
    277   return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
    278 }
    279 
    280 static __inline __m512 __DEFAULT_FN_ATTRS512
    281 _mm512_set1_ps(float __w)
    282 {
    283   return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
    284                                  __w, __w, __w, __w, __w, __w, __w, __w  };
    285 }
    286 
    287 static __inline __m512d __DEFAULT_FN_ATTRS512
    288 _mm512_set1_pd(double __w)
    289 {
    290   return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
    291 }
    292 
    293 static __inline __m512i __DEFAULT_FN_ATTRS512
    294 _mm512_set1_epi8(char __w)
    295 {
    296   return __extension__ (__m512i)(__v64qi){
    297     __w, __w, __w, __w, __w, __w, __w, __w,
    298     __w, __w, __w, __w, __w, __w, __w, __w,
    299     __w, __w, __w, __w, __w, __w, __w, __w,
    300     __w, __w, __w, __w, __w, __w, __w, __w,
    301     __w, __w, __w, __w, __w, __w, __w, __w,
    302     __w, __w, __w, __w, __w, __w, __w, __w,
    303     __w, __w, __w, __w, __w, __w, __w, __w,
    304     __w, __w, __w, __w, __w, __w, __w, __w  };
    305 }
    306 
    307 static __inline __m512i __DEFAULT_FN_ATTRS512
    308 _mm512_set1_epi16(short __w)
    309 {
    310   return __extension__ (__m512i)(__v32hi){
    311     __w, __w, __w, __w, __w, __w, __w, __w,
    312     __w, __w, __w, __w, __w, __w, __w, __w,
    313     __w, __w, __w, __w, __w, __w, __w, __w,
    314     __w, __w, __w, __w, __w, __w, __w, __w };
    315 }
    316 
    317 static __inline __m512i __DEFAULT_FN_ATTRS512
    318 _mm512_set1_epi32(int __s)
    319 {
    320   return __extension__ (__m512i)(__v16si){
    321     __s, __s, __s, __s, __s, __s, __s, __s,
    322     __s, __s, __s, __s, __s, __s, __s, __s };
    323 }
    324 
    325 static __inline __m512i __DEFAULT_FN_ATTRS512
    326 _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
    327 {
    328   return (__m512i)__builtin_ia32_selectd_512(__M,
    329                                              (__v16si)_mm512_set1_epi32(__A),
    330                                              (__v16si)_mm512_setzero_si512());
    331 }
    332 
    333 static __inline __m512i __DEFAULT_FN_ATTRS512
    334 _mm512_set1_epi64(long long __d)
    335 {
    336   return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
    337 }
    338 
    339 static __inline __m512i __DEFAULT_FN_ATTRS512
    340 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
    341 {
    342   return (__m512i)__builtin_ia32_selectq_512(__M,
    343                                              (__v8di)_mm512_set1_epi64(__A),
    344                                              (__v8di)_mm512_setzero_si512());
    345 }
    346 
    347 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    348 _mm512_broadcastss_ps(__m128 __A)
    349 {
    350   return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
    351                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    352 }
    353 
    354 static __inline __m512i __DEFAULT_FN_ATTRS512
    355 _mm512_set4_epi32 (int __A, int __B, int __C, int __D)
    356 {
    357   return __extension__ (__m512i)(__v16si)
    358    { __D, __C, __B, __A, __D, __C, __B, __A,
    359      __D, __C, __B, __A, __D, __C, __B, __A };
    360 }
    361 
    362 static __inline __m512i __DEFAULT_FN_ATTRS512
    363 _mm512_set4_epi64 (long long __A, long long __B, long long __C,
    364        long long __D)
    365 {
    366   return __extension__ (__m512i) (__v8di)
    367    { __D, __C, __B, __A, __D, __C, __B, __A };
    368 }
    369 
    370 static __inline __m512d __DEFAULT_FN_ATTRS512
    371 _mm512_set4_pd (double __A, double __B, double __C, double __D)
    372 {
    373   return __extension__ (__m512d)
    374    { __D, __C, __B, __A, __D, __C, __B, __A };
    375 }
    376 
    377 static __inline __m512 __DEFAULT_FN_ATTRS512
    378 _mm512_set4_ps (float __A, float __B, float __C, float __D)
    379 {
    380   return __extension__ (__m512)
    381    { __D, __C, __B, __A, __D, __C, __B, __A,
    382      __D, __C, __B, __A, __D, __C, __B, __A };
    383 }
    384 
/* Reversed-argument forms of the _mm512_set4_* functions. Each macro forwards
 * to the matching _mm512_set4_* with its four arguments in the opposite
 * order, so e0 becomes the last argument of the call and e3 the first. */
#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3)                \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3)                \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
    396 
    397 static __inline__ __m512d __DEFAULT_FN_ATTRS512
    398 _mm512_broadcastsd_pd(__m128d __A)
    399 {
    400   return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
    401                                           0, 0, 0, 0, 0, 0, 0, 0);
    402 }
    403 
    404 /* Cast between vector types */
    405 
    406 static __inline __m512d __DEFAULT_FN_ATTRS512
    407 _mm512_castpd256_pd512(__m256d __a)
    408 {
    409   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
    410                                  1, 2, 3, 4, 5, 6, 7);
    411 }
    412 
    413 static __inline __m512 __DEFAULT_FN_ATTRS512
    414 _mm512_castps256_ps512(__m256 __a)
    415 {
    416   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
    417                                  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    418 }
    419 
    420 static __inline __m128d __DEFAULT_FN_ATTRS512
    421 _mm512_castpd512_pd128(__m512d __a)
    422 {
    423   return __builtin_shufflevector(__a, __a, 0, 1);
    424 }
    425 
    426 static __inline __m256d __DEFAULT_FN_ATTRS512
    427 _mm512_castpd512_pd256 (__m512d __A)
    428 {
    429   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
    430 }
    431 
    432 static __inline __m128 __DEFAULT_FN_ATTRS512
    433 _mm512_castps512_ps128(__m512 __a)
    434 {
    435   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
    436 }
    437 
    438 static __inline __m256 __DEFAULT_FN_ATTRS512
    439 _mm512_castps512_ps256 (__m512 __A)
    440 {
    441   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
    442 }
    443 
    444 static __inline __m512 __DEFAULT_FN_ATTRS512
    445 _mm512_castpd_ps (__m512d __A)
    446 {
    447   return (__m512) (__A);
    448 }
    449 
    450 static __inline __m512i __DEFAULT_FN_ATTRS512
    451 _mm512_castpd_si512 (__m512d __A)
    452 {
    453   return (__m512i) (__A);
    454 }
    455 
    456 static __inline__ __m512d __DEFAULT_FN_ATTRS512
    457 _mm512_castpd128_pd512 (__m128d __A)
    458 {
    459   __m256d __B = __builtin_nondeterministic_value(__B);
    460   return __builtin_shufflevector(
    461       __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
    462       __B, 0, 1, 2, 3, 4, 5, 6, 7);
    463 }
    464 
    465 static __inline __m512d __DEFAULT_FN_ATTRS512
    466 _mm512_castps_pd (__m512 __A)
    467 {
    468   return (__m512d) (__A);
    469 }
    470 
    471 static __inline __m512i __DEFAULT_FN_ATTRS512
    472 _mm512_castps_si512 (__m512 __A)
    473 {
    474   return (__m512i) (__A);
    475 }
    476 
    477 static __inline__ __m512 __DEFAULT_FN_ATTRS512
    478 _mm512_castps128_ps512 (__m128 __A)
    479 {
    480   __m256 __B = __builtin_nondeterministic_value(__B);
    481   return __builtin_shufflevector(
    482       __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
    483       __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    484 }
    485 
    486 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    487 _mm512_castsi128_si512 (__m128i __A)
    488 {
    489   __m256i __B = __builtin_nondeterministic_value(__B);
    490   return __builtin_shufflevector(
    491       __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
    492       __B, 0, 1, 2, 3, 4, 5, 6, 7);
    493 }
    494 
    495 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    496 _mm512_castsi256_si512 (__m256i __A)
    497 {
    498    return  __builtin_shufflevector( __A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
    499 }
    500 
    501 static __inline __m512 __DEFAULT_FN_ATTRS512
    502 _mm512_castsi512_ps (__m512i __A)
    503 {
    504   return (__m512) (__A);
    505 }
    506 
    507 static __inline __m512d __DEFAULT_FN_ATTRS512
    508 _mm512_castsi512_pd (__m512i __A)
    509 {
    510   return (__m512d) (__A);
    511 }
    512 
    513 static __inline __m128i __DEFAULT_FN_ATTRS512
    514 _mm512_castsi512_si128 (__m512i __A)
    515 {
    516   return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
    517 }
    518 
    519 static __inline __m256i __DEFAULT_FN_ATTRS512
    520 _mm512_castsi512_si256 (__m512i __A)
    521 {
    522   return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
    523 }
    524 
    525 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
    526 _mm512_int2mask(int __a)
    527 {
    528   return (__mmask16)__a;
    529 }
    530 
    531 static __inline__ int __DEFAULT_FN_ATTRS
    532 _mm512_mask2int(__mmask16 __a)
    533 {
    534   return (int)__a;
    535 }
    536 
    537 /// Constructs a 512-bit floating-point vector of [8 x double] from a
    538 ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
    539 ///    contain the value of the source vector. The upper 384 bits are set
    540 ///    to zero.
    541 ///
    542 /// \headerfile <x86intrin.h>
    543 ///
    544 /// This intrinsic has no corresponding instruction.
    545 ///
    546 /// \param __a
    547 ///    A 128-bit vector of [2 x double].
    548 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
    549 ///    contain the value of the parameter. The upper 384 bits are set to zero.
    550 static __inline __m512d __DEFAULT_FN_ATTRS512
    551 _mm512_zextpd128_pd512(__m128d __a)
    552 {
    553   return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
    554 }
    555 
    556 /// Constructs a 512-bit floating-point vector of [8 x double] from a
    557 ///    256-bit floating-point vector of [4 x double]. The lower 256 bits
    558 ///    contain the value of the source vector. The upper 256 bits are set
    559 ///    to zero.
    560 ///
    561 /// \headerfile <x86intrin.h>
    562 ///
    563 /// This intrinsic has no corresponding instruction.
    564 ///
    565 /// \param __a
    566 ///    A 256-bit vector of [4 x double].
    567 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
    568 ///    contain the value of the parameter. The upper 256 bits are set to zero.
    569 static __inline __m512d __DEFAULT_FN_ATTRS512
    570 _mm512_zextpd256_pd512(__m256d __a)
    571 {
    572   return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
    573 }
    574 
    575 /// Constructs a 512-bit floating-point vector of [16 x float] from a
    576 ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
    577 ///    the value of the source vector. The upper 384 bits are set to zero.
    578 ///
    579 /// \headerfile <x86intrin.h>
    580 ///
    581 /// This intrinsic has no corresponding instruction.
    582 ///
    583 /// \param __a
    584 ///    A 128-bit vector of [4 x float].
    585 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
    586 ///    contain the value of the parameter. The upper 384 bits are set to zero.
    587 static __inline __m512 __DEFAULT_FN_ATTRS512
    588 _mm512_zextps128_ps512(__m128 __a)
    589 {
    590   return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
    591 }
    592 
    593 /// Constructs a 512-bit floating-point vector of [16 x float] from a
    594 ///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
    595 ///    the value of the source vector. The upper 256 bits are set to zero.
    596 ///
    597 /// \headerfile <x86intrin.h>
    598 ///
    599 /// This intrinsic has no corresponding instruction.
    600 ///
    601 /// \param __a
    602 ///    A 256-bit vector of [8 x float].
    603 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
    604 ///    contain the value of the parameter. The upper 256 bits are set to zero.
    605 static __inline __m512 __DEFAULT_FN_ATTRS512
    606 _mm512_zextps256_ps512(__m256 __a)
    607 {
    608   return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    609 }
    610 
    611 /// Constructs a 512-bit integer vector from a 128-bit integer vector.
    612 ///    The lower 128 bits contain the value of the source vector. The upper
    613 ///    384 bits are set to zero.
    614 ///
    615 /// \headerfile <x86intrin.h>
    616 ///
    617 /// This intrinsic has no corresponding instruction.
    618 ///
    619 /// \param __a
    620 ///    A 128-bit integer vector.
    621 /// \returns A 512-bit integer vector. The lower 128 bits contain the value of
    622 ///    the parameter. The upper 384 bits are set to zero.
    623 static __inline __m512i __DEFAULT_FN_ATTRS512
    624 _mm512_zextsi128_si512(__m128i __a)
    625 {
    626   return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
    627 }
    628 
    629 /// Constructs a 512-bit integer vector from a 256-bit integer vector.
    630 ///    The lower 256 bits contain the value of the source vector. The upper
    631 ///    256 bits are set to zero.
    632 ///
    633 /// \headerfile <x86intrin.h>
    634 ///
    635 /// This intrinsic has no corresponding instruction.
    636 ///
    637 /// \param __a
    638 ///    A 256-bit integer vector.
    639 /// \returns A 512-bit integer vector. The lower 256 bits contain the value of
    640 ///    the parameter. The upper 256 bits are set to zero.
    641 static __inline __m512i __DEFAULT_FN_ATTRS512
    642 _mm512_zextsi256_si512(__m256i __a)
    643 {
    644   return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
    645 }
    646 
    647 /* Bitwise operators */
    648 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    649 _mm512_and_epi32(__m512i __a, __m512i __b)
    650 {
    651   return (__m512i)((__v16su)__a & (__v16su)__b);
    652 }
    653 
    654 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    655 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
    656 {
    657   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
    658                 (__v16si) _mm512_and_epi32(__a, __b),
    659                 (__v16si) __src);
    660 }
    661 
    662 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    663 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
    664 {
    665   return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
    666                                          __k, __a, __b);
    667 }
    668 
    669 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    670 _mm512_and_epi64(__m512i __a, __m512i __b)
    671 {
    672   return (__m512i)((__v8du)__a & (__v8du)__b);
    673 }
    674 
    675 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    676 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
    677 {
    678     return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
    679                 (__v8di) _mm512_and_epi64(__a, __b),
    680                 (__v8di) __src);
    681 }
    682 
    683 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    684 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
    685 {
    686   return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
    687                                          __k, __a, __b);
    688 }
    689 
    690 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    691 _mm512_andnot_si512 (__m512i __A, __m512i __B)
    692 {
    693   return (__m512i)(~(__v8du)__A & (__v8du)__B);
    694 }
    695 
    696 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    697 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
    698 {
    699   return (__m512i)(~(__v16su)__A & (__v16su)__B);
    700 }
    701 
    702 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    703 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
    704 {
    705   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
    706                                          (__v16si)_mm512_andnot_epi32(__A, __B),
    707                                          (__v16si)__W);
    708 }
    709 
    710 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    711 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
    712 {
    713   return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
    714                                            __U, __A, __B);
    715 }
    716 
    717 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    718 _mm512_andnot_epi64(__m512i __A, __m512i __B)
    719 {
    720   return (__m512i)(~(__v8du)__A & (__v8du)__B);
    721 }
    722 
    723 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    724 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
    725 {
    726   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
    727                                           (__v8di)_mm512_andnot_epi64(__A, __B),
    728                                           (__v8di)__W);
    729 }
    730 
    731 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    732 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
    733 {
    734   return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
    735                                            __U, __A, __B);
    736 }
    737 
    738 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    739 _mm512_or_epi32(__m512i __a, __m512i __b)
    740 {
    741   return (__m512i)((__v16su)__a | (__v16su)__b);
    742 }
    743 
    744 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    745 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
    746 {
    747   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
    748                                              (__v16si)_mm512_or_epi32(__a, __b),
    749                                              (__v16si)__src);
    750 }
    751 
    752 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    753 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
    754 {
    755   return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
    756 }
    757 
    758 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    759 _mm512_or_epi64(__m512i __a, __m512i __b)
    760 {
    761   return (__m512i)((__v8du)__a | (__v8du)__b);
    762 }
    763 
    764 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    765 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
    766 {
    767   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
    768                                              (__v8di)_mm512_or_epi64(__a, __b),
    769                                              (__v8di)__src);
    770 }
    771 
    772 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    773 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
    774 {
    775   return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
    776 }
    777 
    778 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    779 _mm512_xor_epi32(__m512i __a, __m512i __b)
    780 {
    781   return (__m512i)((__v16su)__a ^ (__v16su)__b);
    782 }
    783 
    784 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    785 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
    786 {
    787   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
    788                                             (__v16si)_mm512_xor_epi32(__a, __b),
    789                                             (__v16si)__src);
    790 }
    791 
    792 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    793 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
    794 {
    795   return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
    796 }
    797 
    798 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    799 _mm512_xor_epi64(__m512i __a, __m512i __b)
    800 {
    801   return (__m512i)((__v8du)__a ^ (__v8du)__b);
    802 }
    803 
    804 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    805 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
    806 {
    807   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
    808                                              (__v8di)_mm512_xor_epi64(__a, __b),
    809                                              (__v8di)__src);
    810 }
    811 
    812 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    813 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
    814 {
    815   return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
    816 }
    817 
    818 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    819 _mm512_and_si512(__m512i __a, __m512i __b)
    820 {
    821   return (__m512i)((__v8du)__a & (__v8du)__b);
    822 }
    823 
    824 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    825 _mm512_or_si512(__m512i __a, __m512i __b)
    826 {
    827   return (__m512i)((__v8du)__a | (__v8du)__b);
    828 }
    829 
    830 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    831 _mm512_xor_si512(__m512i __a, __m512i __b)
    832 {
    833   return (__m512i)((__v8du)__a ^ (__v8du)__b);
    834 }
    835 
    836 /* Arithmetic */
    837 
    838 static __inline __m512d __DEFAULT_FN_ATTRS512
    839 _mm512_add_pd(__m512d __a, __m512d __b)
    840 {
    841   return (__m512d)((__v8df)__a + (__v8df)__b);
    842 }
    843 
    844 static __inline __m512 __DEFAULT_FN_ATTRS512
    845 _mm512_add_ps(__m512 __a, __m512 __b)
    846 {
    847   return (__m512)((__v16sf)__a + (__v16sf)__b);
    848 }
    849 
    850 static __inline __m512d __DEFAULT_FN_ATTRS512
    851 _mm512_mul_pd(__m512d __a, __m512d __b)
    852 {
    853   return (__m512d)((__v8df)__a * (__v8df)__b);
    854 }
    855 
    856 static __inline __m512 __DEFAULT_FN_ATTRS512
    857 _mm512_mul_ps(__m512 __a, __m512 __b)
    858 {
    859   return (__m512)((__v16sf)__a * (__v16sf)__b);
    860 }
    861 
    862 static __inline __m512d __DEFAULT_FN_ATTRS512
    863 _mm512_sub_pd(__m512d __a, __m512d __b)
    864 {
    865   return (__m512d)((__v8df)__a - (__v8df)__b);
    866 }
    867 
    868 static __inline __m512 __DEFAULT_FN_ATTRS512
    869 _mm512_sub_ps(__m512 __a, __m512 __b)
    870 {
    871   return (__m512)((__v16sf)__a - (__v16sf)__b);
    872 }
    873 
    874 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    875 _mm512_add_epi64 (__m512i __A, __m512i __B)
    876 {
    877   return (__m512i) ((__v8du) __A + (__v8du) __B);
    878 }
    879 
    880 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    881 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
    882 {
    883   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
    884                                              (__v8di)_mm512_add_epi64(__A, __B),
    885                                              (__v8di)__W);
    886 }
    887 
    888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    889 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
    890 {
    891   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
    892                                              (__v8di)_mm512_add_epi64(__A, __B),
    893                                              (__v8di)_mm512_setzero_si512());
    894 }
    895 
    896 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    897 _mm512_sub_epi64 (__m512i __A, __m512i __B)
    898 {
    899   return (__m512i) ((__v8du) __A - (__v8du) __B);
    900 }
    901 
    902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    903 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
    904 {
    905   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
    906                                              (__v8di)_mm512_sub_epi64(__A, __B),
    907                                              (__v8di)__W);
    908 }
    909 
    910 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    911 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
    912 {
    913   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
    914                                              (__v8di)_mm512_sub_epi64(__A, __B),
    915                                              (__v8di)_mm512_setzero_si512());
    916 }
    917 
    918 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    919 _mm512_add_epi32 (__m512i __A, __m512i __B)
    920 {
    921   return (__m512i) ((__v16su) __A + (__v16su) __B);
    922 }
    923 
    924 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    925 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
    926 {
    927   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
    928                                              (__v16si)_mm512_add_epi32(__A, __B),
    929                                              (__v16si)__W);
    930 }
    931 
    932 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    933 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
    934 {
    935   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
    936                                              (__v16si)_mm512_add_epi32(__A, __B),
    937                                              (__v16si)_mm512_setzero_si512());
    938 }
    939 
    940 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    941 _mm512_sub_epi32 (__m512i __A, __m512i __B)
    942 {
    943   return (__m512i) ((__v16su) __A - (__v16su) __B);
    944 }
    945 
    946 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    947 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
    948 {
    949   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
    950                                              (__v16si)_mm512_sub_epi32(__A, __B),
    951                                              (__v16si)__W);
    952 }
    953 
    954 static __inline__ __m512i __DEFAULT_FN_ATTRS512
    955 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
    956 {
    957   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
    958                                              (__v16si)_mm512_sub_epi32(__A, __B),
    959                                              (__v16si)_mm512_setzero_si512());
    960 }
    961 
    962 #define _mm512_max_round_pd(A, B, R) \
    963   ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
    964                                     (__v8df)(__m512d)(B), (int)(R)))
    965 
    966 #define _mm512_mask_max_round_pd(W, U, A, B, R) \
    967   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
    968                                    (__v8df)_mm512_max_round_pd((A), (B), (R)), \
    969                                    (__v8df)(W)))
    970 
    971 #define _mm512_maskz_max_round_pd(U, A, B, R) \
    972   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
    973                                    (__v8df)_mm512_max_round_pd((A), (B), (R)), \
    974                                    (__v8df)_mm512_setzero_pd()))
    975 
    976 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
    977 _mm512_max_pd(__m512d __A, __m512d __B)
    978 {
    979   return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
    980                                            _MM_FROUND_CUR_DIRECTION);
    981 }
    982 
    983 static __inline__ __m512d __DEFAULT_FN_ATTRS512
    984 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
    985 {
    986   return (__m512d)__builtin_ia32_selectpd_512(__U,
    987                                               (__v8df)_mm512_max_pd(__A, __B),
    988                                               (__v8df)__W);
    989 }
    990 
    991 static __inline__ __m512d __DEFAULT_FN_ATTRS512
    992 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
    993 {
    994   return (__m512d)__builtin_ia32_selectpd_512(__U,
    995                                               (__v8df)_mm512_max_pd(__A, __B),
    996                                               (__v8df)_mm512_setzero_pd());
    997 }
    998 
    999 #define _mm512_max_round_ps(A, B, R) \
   1000   ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
   1001                                    (__v16sf)(__m512)(B), (int)(R)))
   1002 
   1003 #define _mm512_mask_max_round_ps(W, U, A, B, R) \
   1004   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   1005                                   (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
   1006                                   (__v16sf)(W)))
   1007 
   1008 #define _mm512_maskz_max_round_ps(U, A, B, R) \
   1009   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   1010                                   (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
   1011                                   (__v16sf)_mm512_setzero_ps()))
   1012 
   1013 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1014 _mm512_max_ps(__m512 __A, __m512 __B)
   1015 {
   1016   return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
   1017                                           _MM_FROUND_CUR_DIRECTION);
   1018 }
   1019 
   1020 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1021 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
   1022 {
   1023   return (__m512)__builtin_ia32_selectps_512(__U,
   1024                                              (__v16sf)_mm512_max_ps(__A, __B),
   1025                                              (__v16sf)__W);
   1026 }
   1027 
   1028 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1029 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
   1030 {
   1031   return (__m512)__builtin_ia32_selectps_512(__U,
   1032                                              (__v16sf)_mm512_max_ps(__A, __B),
   1033                                              (__v16sf)_mm512_setzero_ps());
   1034 }
   1035 
   1036 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1037 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   1038   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
   1039                 (__v4sf) __B,
   1040                 (__v4sf) __W,
   1041                 (__mmask8) __U,
   1042                 _MM_FROUND_CUR_DIRECTION);
   1043 }
   1044 
   1045 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1046 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   1047   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
   1048                 (__v4sf) __B,
   1049                 (__v4sf)  _mm_setzero_ps (),
   1050                 (__mmask8) __U,
   1051                 _MM_FROUND_CUR_DIRECTION);
   1052 }
   1053 
   1054 #define _mm_max_round_ss(A, B, R) \
   1055   ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
   1056                                            (__v4sf)(__m128)(B), \
   1057                                            (__v4sf)_mm_setzero_ps(), \
   1058                                            (__mmask8)-1, (int)(R)))
   1059 
   1060 #define _mm_mask_max_round_ss(W, U, A, B, R) \
   1061   ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
   1062                                            (__v4sf)(__m128)(B), \
   1063                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
   1064                                            (int)(R)))
   1065 
   1066 #define _mm_maskz_max_round_ss(U, A, B, R) \
   1067   ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
   1068                                            (__v4sf)(__m128)(B), \
   1069                                            (__v4sf)_mm_setzero_ps(), \
   1070                                            (__mmask8)(U), (int)(R)))
   1071 
   1072 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1073 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   1074   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
   1075                 (__v2df) __B,
   1076                 (__v2df) __W,
   1077                 (__mmask8) __U,
   1078                 _MM_FROUND_CUR_DIRECTION);
   1079 }
   1080 
   1081 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1082 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   1083   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
   1084                 (__v2df) __B,
   1085                 (__v2df)  _mm_setzero_pd (),
   1086                 (__mmask8) __U,
   1087                 _MM_FROUND_CUR_DIRECTION);
   1088 }
   1089 
   1090 #define _mm_max_round_sd(A, B, R) \
   1091   ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
   1092                                             (__v2df)(__m128d)(B), \
   1093                                             (__v2df)_mm_setzero_pd(), \
   1094                                             (__mmask8)-1, (int)(R)))
   1095 
   1096 #define _mm_mask_max_round_sd(W, U, A, B, R) \
   1097   ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
   1098                                             (__v2df)(__m128d)(B), \
   1099                                             (__v2df)(__m128d)(W), \
   1100                                             (__mmask8)(U), (int)(R)))
   1101 
   1102 #define _mm_maskz_max_round_sd(U, A, B, R) \
   1103   ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
   1104                                             (__v2df)(__m128d)(B), \
   1105                                             (__v2df)_mm_setzero_pd(), \
   1106                                             (__mmask8)(U), (int)(R)))
   1107 
   1108 static __inline __m512i
   1109 __DEFAULT_FN_ATTRS512
   1110 _mm512_max_epi32(__m512i __A, __m512i __B)
   1111 {
   1112   return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
   1113 }
   1114 
   1115 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1116 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
   1117 {
   1118   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1119                                             (__v16si)_mm512_max_epi32(__A, __B),
   1120                                             (__v16si)__W);
   1121 }
   1122 
   1123 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1124 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
   1125 {
   1126   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1127                                             (__v16si)_mm512_max_epi32(__A, __B),
   1128                                             (__v16si)_mm512_setzero_si512());
   1129 }
   1130 
   1131 static __inline __m512i __DEFAULT_FN_ATTRS512
   1132 _mm512_max_epu32(__m512i __A, __m512i __B)
   1133 {
   1134   return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
   1135 }
   1136 
   1137 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1138 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
   1139 {
   1140   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1141                                             (__v16si)_mm512_max_epu32(__A, __B),
   1142                                             (__v16si)__W);
   1143 }
   1144 
   1145 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1146 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
   1147 {
   1148   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1149                                             (__v16si)_mm512_max_epu32(__A, __B),
   1150                                             (__v16si)_mm512_setzero_si512());
   1151 }
   1152 
   1153 static __inline __m512i __DEFAULT_FN_ATTRS512
   1154 _mm512_max_epi64(__m512i __A, __m512i __B)
   1155 {
   1156   return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
   1157 }
   1158 
   1159 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1160 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
   1161 {
   1162   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1163                                              (__v8di)_mm512_max_epi64(__A, __B),
   1164                                              (__v8di)__W);
   1165 }
   1166 
   1167 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1168 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
   1169 {
   1170   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1171                                              (__v8di)_mm512_max_epi64(__A, __B),
   1172                                              (__v8di)_mm512_setzero_si512());
   1173 }
   1174 
   1175 static __inline __m512i __DEFAULT_FN_ATTRS512
   1176 _mm512_max_epu64(__m512i __A, __m512i __B)
   1177 {
   1178   return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
   1179 }
   1180 
   1181 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1182 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
   1183 {
   1184   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1185                                              (__v8di)_mm512_max_epu64(__A, __B),
   1186                                              (__v8di)__W);
   1187 }
   1188 
   1189 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1190 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
   1191 {
   1192   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1193                                              (__v8di)_mm512_max_epu64(__A, __B),
   1194                                              (__v8di)_mm512_setzero_si512());
   1195 }
   1196 
   1197 #define _mm512_min_round_pd(A, B, R) \
   1198   ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
   1199                                     (__v8df)(__m512d)(B), (int)(R)))
   1200 
   1201 #define _mm512_mask_min_round_pd(W, U, A, B, R) \
   1202   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   1203                                    (__v8df)_mm512_min_round_pd((A), (B), (R)), \
   1204                                    (__v8df)(W)))
   1205 
   1206 #define _mm512_maskz_min_round_pd(U, A, B, R) \
   1207   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   1208                                    (__v8df)_mm512_min_round_pd((A), (B), (R)), \
   1209                                    (__v8df)_mm512_setzero_pd()))
   1210 
   1211 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
   1212 _mm512_min_pd(__m512d __A, __m512d __B)
   1213 {
   1214   return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
   1215                                            _MM_FROUND_CUR_DIRECTION);
   1216 }
   1217 
   1218 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1219 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
   1220 {
   1221   return (__m512d)__builtin_ia32_selectpd_512(__U,
   1222                                               (__v8df)_mm512_min_pd(__A, __B),
   1223                                               (__v8df)__W);
   1224 }
   1225 
   1226 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1227 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
   1228 {
   1229   return (__m512d)__builtin_ia32_selectpd_512(__U,
   1230                                               (__v8df)_mm512_min_pd(__A, __B),
   1231                                               (__v8df)_mm512_setzero_pd());
   1232 }
   1233 
   1234 #define _mm512_min_round_ps(A, B, R) \
   1235   ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
   1236                                    (__v16sf)(__m512)(B), (int)(R)))
   1237 
   1238 #define _mm512_mask_min_round_ps(W, U, A, B, R) \
   1239   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   1240                                   (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
   1241                                   (__v16sf)(W)))
   1242 
   1243 #define _mm512_maskz_min_round_ps(U, A, B, R) \
   1244   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   1245                                   (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
   1246                                   (__v16sf)_mm512_setzero_ps()))
   1247 
   1248 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1249 _mm512_min_ps(__m512 __A, __m512 __B)
   1250 {
   1251   return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
   1252                                           _MM_FROUND_CUR_DIRECTION);
   1253 }
   1254 
   1255 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1256 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
   1257 {
   1258   return (__m512)__builtin_ia32_selectps_512(__U,
   1259                                              (__v16sf)_mm512_min_ps(__A, __B),
   1260                                              (__v16sf)__W);
   1261 }
   1262 
   1263 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1264 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
   1265 {
   1266   return (__m512)__builtin_ia32_selectps_512(__U,
   1267                                              (__v16sf)_mm512_min_ps(__A, __B),
   1268                                              (__v16sf)_mm512_setzero_ps());
   1269 }
   1270 
   1271 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1272 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   1273   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
   1274                 (__v4sf) __B,
   1275                 (__v4sf) __W,
   1276                 (__mmask8) __U,
   1277                 _MM_FROUND_CUR_DIRECTION);
   1278 }
   1279 
   1280 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1281 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   1282   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
   1283                 (__v4sf) __B,
   1284                 (__v4sf)  _mm_setzero_ps (),
   1285                 (__mmask8) __U,
   1286                 _MM_FROUND_CUR_DIRECTION);
   1287 }
   1288 
   1289 #define _mm_min_round_ss(A, B, R) \
   1290   ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
   1291                                            (__v4sf)(__m128)(B), \
   1292                                            (__v4sf)_mm_setzero_ps(), \
   1293                                            (__mmask8)-1, (int)(R)))
   1294 
   1295 #define _mm_mask_min_round_ss(W, U, A, B, R) \
   1296   ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
   1297                                            (__v4sf)(__m128)(B), \
   1298                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
   1299                                            (int)(R)))
   1300 
   1301 #define _mm_maskz_min_round_ss(U, A, B, R) \
   1302   ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
   1303                                            (__v4sf)(__m128)(B), \
   1304                                            (__v4sf)_mm_setzero_ps(), \
   1305                                            (__mmask8)(U), (int)(R)))
   1306 
   1307 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1308 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   1309   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
   1310                 (__v2df) __B,
   1311                 (__v2df) __W,
   1312                 (__mmask8) __U,
   1313                 _MM_FROUND_CUR_DIRECTION);
   1314 }
   1315 
   1316 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1317 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   1318   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
   1319                 (__v2df) __B,
   1320                 (__v2df)  _mm_setzero_pd (),
   1321                 (__mmask8) __U,
   1322                 _MM_FROUND_CUR_DIRECTION);
   1323 }
   1324 
   1325 #define _mm_min_round_sd(A, B, R) \
   1326   ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
   1327                                             (__v2df)(__m128d)(B), \
   1328                                             (__v2df)_mm_setzero_pd(), \
   1329                                             (__mmask8)-1, (int)(R)))
   1330 
   1331 #define _mm_mask_min_round_sd(W, U, A, B, R) \
   1332   ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
   1333                                             (__v2df)(__m128d)(B), \
   1334                                             (__v2df)(__m128d)(W), \
   1335                                             (__mmask8)(U), (int)(R)))
   1336 
   1337 #define _mm_maskz_min_round_sd(U, A, B, R) \
   1338   ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
   1339                                             (__v2df)(__m128d)(B), \
   1340                                             (__v2df)_mm_setzero_pd(), \
   1341                                             (__mmask8)(U), (int)(R)))
   1342 
   1343 static __inline __m512i
   1344 __DEFAULT_FN_ATTRS512
   1345 _mm512_min_epi32(__m512i __A, __m512i __B)
   1346 {
   1347   return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
   1348 }
   1349 
   1350 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1351 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
   1352 {
   1353   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1354                                             (__v16si)_mm512_min_epi32(__A, __B),
   1355                                             (__v16si)__W);
   1356 }
   1357 
   1358 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1359 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
   1360 {
   1361   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1362                                             (__v16si)_mm512_min_epi32(__A, __B),
   1363                                             (__v16si)_mm512_setzero_si512());
   1364 }
   1365 
   1366 static __inline __m512i __DEFAULT_FN_ATTRS512
   1367 _mm512_min_epu32(__m512i __A, __m512i __B)
   1368 {
   1369   return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
   1370 }
   1371 
   1372 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1373 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
   1374 {
   1375   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1376                                             (__v16si)_mm512_min_epu32(__A, __B),
   1377                                             (__v16si)__W);
   1378 }
   1379 
   1380 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1381 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
   1382 {
   1383   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1384                                             (__v16si)_mm512_min_epu32(__A, __B),
   1385                                             (__v16si)_mm512_setzero_si512());
   1386 }
   1387 
   1388 static __inline __m512i __DEFAULT_FN_ATTRS512
   1389 _mm512_min_epi64(__m512i __A, __m512i __B)
   1390 {
   1391   return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
   1392 }
   1393 
   1394 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1395 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
   1396 {
   1397   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1398                                              (__v8di)_mm512_min_epi64(__A, __B),
   1399                                              (__v8di)__W);
   1400 }
   1401 
   1402 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1403 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
   1404 {
   1405   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1406                                              (__v8di)_mm512_min_epi64(__A, __B),
   1407                                              (__v8di)_mm512_setzero_si512());
   1408 }
   1409 
   1410 static __inline __m512i __DEFAULT_FN_ATTRS512
   1411 _mm512_min_epu64(__m512i __A, __m512i __B)
   1412 {
   1413   return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
   1414 }
   1415 
   1416 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1417 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
   1418 {
   1419   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1420                                              (__v8di)_mm512_min_epu64(__A, __B),
   1421                                              (__v8di)__W);
   1422 }
   1423 
   1424 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1425 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
   1426 {
   1427   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1428                                              (__v8di)_mm512_min_epu64(__A, __B),
   1429                                              (__v8di)_mm512_setzero_si512());
   1430 }
   1431 
   1432 static __inline __m512i __DEFAULT_FN_ATTRS512
   1433 _mm512_mul_epi32(__m512i __X, __m512i __Y)
   1434 {
   1435   return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
   1436 }
   1437 
   1438 static __inline __m512i __DEFAULT_FN_ATTRS512
   1439 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
   1440 {
   1441   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1442                                              (__v8di)_mm512_mul_epi32(__X, __Y),
   1443                                              (__v8di)__W);
   1444 }
   1445 
   1446 static __inline __m512i __DEFAULT_FN_ATTRS512
   1447 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
   1448 {
   1449   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1450                                              (__v8di)_mm512_mul_epi32(__X, __Y),
   1451                                              (__v8di)_mm512_setzero_si512 ());
   1452 }
   1453 
   1454 static __inline __m512i __DEFAULT_FN_ATTRS512
   1455 _mm512_mul_epu32(__m512i __X, __m512i __Y)
   1456 {
   1457   return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
   1458 }
   1459 
   1460 static __inline __m512i __DEFAULT_FN_ATTRS512
   1461 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
   1462 {
   1463   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1464                                              (__v8di)_mm512_mul_epu32(__X, __Y),
   1465                                              (__v8di)__W);
   1466 }
   1467 
   1468 static __inline __m512i __DEFAULT_FN_ATTRS512
   1469 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
   1470 {
   1471   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   1472                                              (__v8di)_mm512_mul_epu32(__X, __Y),
   1473                                              (__v8di)_mm512_setzero_si512 ());
   1474 }
   1475 
   1476 static __inline __m512i __DEFAULT_FN_ATTRS512
   1477 _mm512_mullo_epi32 (__m512i __A, __m512i __B)
   1478 {
   1479   return (__m512i) ((__v16su) __A * (__v16su) __B);
   1480 }
   1481 
   1482 static __inline __m512i __DEFAULT_FN_ATTRS512
   1483 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
   1484 {
   1485   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1486                                              (__v16si)_mm512_mullo_epi32(__A, __B),
   1487                                              (__v16si)_mm512_setzero_si512());
   1488 }
   1489 
   1490 static __inline __m512i __DEFAULT_FN_ATTRS512
   1491 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
   1492 {
   1493   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   1494                                              (__v16si)_mm512_mullo_epi32(__A, __B),
   1495                                              (__v16si)__W);
   1496 }
   1497 
   1498 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1499 _mm512_mullox_epi64 (__m512i __A, __m512i __B) {
   1500   return (__m512i) ((__v8du) __A * (__v8du) __B);
   1501 }
   1502 
   1503 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1504 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   1505   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   1506                                              (__v8di)_mm512_mullox_epi64(__A, __B),
   1507                                              (__v8di)__W);
   1508 }
   1509 
/* Calls __builtin_ia32_sqrtpd512 on A (eight double lanes); R is forwarded
 * as the builtin's second, integer argument (the slot where _mm512_sqrt_pd
 * passes _MM_FROUND_CUR_DIRECTION). */
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))

/* Merge-masked form: mask U selects between the _mm512_sqrt_round_pd result
 * and the pass-through vector W. */
#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                       (__v8df)(__m512d)(W)))

/* Zero-masked form: mask U selects between the _mm512_sqrt_round_pd result
 * and an all-zero vector. */
#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                       (__v8df)_mm512_setzero_pd()))
   1522 
   1523 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
   1524 _mm512_sqrt_pd(__m512d __A)
   1525 {
   1526   return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
   1527                                            _MM_FROUND_CUR_DIRECTION);
   1528 }
   1529 
   1530 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1531 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
   1532 {
   1533   return (__m512d)__builtin_ia32_selectpd_512(__U,
   1534                                               (__v8df)_mm512_sqrt_pd(__A),
   1535                                               (__v8df)__W);
   1536 }
   1537 
   1538 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1539 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
   1540 {
   1541   return (__m512d)__builtin_ia32_selectpd_512(__U,
   1542                                               (__v8df)_mm512_sqrt_pd(__A),
   1543                                               (__v8df)_mm512_setzero_pd());
   1544 }
   1545 
/* Calls __builtin_ia32_sqrtps512 on A (sixteen float lanes); R is forwarded
 * as the builtin's second, integer argument (the slot where _mm512_sqrt_ps
 * passes _MM_FROUND_CUR_DIRECTION). */
#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))

/* Merge-masked form: mask U selects between the _mm512_sqrt_round_ps result
 * and the pass-through vector W. */
#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                      (__v16sf)(__m512)(W)))

/* Zero-masked form: mask U selects between the _mm512_sqrt_round_ps result
 * and an all-zero vector. */
#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                      (__v16sf)_mm512_setzero_ps()))
   1558 
   1559 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1560 _mm512_sqrt_ps(__m512 __A)
   1561 {
   1562   return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
   1563                                           _MM_FROUND_CUR_DIRECTION);
   1564 }
   1565 
   1566 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1567 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
   1568 {
   1569   return (__m512)__builtin_ia32_selectps_512(__U,
   1570                                              (__v16sf)_mm512_sqrt_ps(__A),
   1571                                              (__v16sf)__W);
   1572 }
   1573 
   1574 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1575 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
   1576 {
   1577   return (__m512)__builtin_ia32_selectps_512(__U,
   1578                                              (__v16sf)_mm512_sqrt_ps(__A),
   1579                                              (__v16sf)_mm512_setzero_ps());
   1580 }
   1581 
   1582 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
   1583 _mm512_rsqrt14_pd(__m512d __A)
   1584 {
   1585   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
   1586                  (__v8df)
   1587                  _mm512_setzero_pd (),
   1588                  (__mmask8) -1);}
   1589 
   1590 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1591 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
   1592 {
   1593   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
   1594                   (__v8df) __W,
   1595                   (__mmask8) __U);
   1596 }
   1597 
   1598 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1599 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
   1600 {
   1601   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
   1602                   (__v8df)
   1603                   _mm512_setzero_pd (),
   1604                   (__mmask8) __U);
   1605 }
   1606 
   1607 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1608 _mm512_rsqrt14_ps(__m512 __A)
   1609 {
   1610   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
   1611                 (__v16sf)
   1612                 _mm512_setzero_ps (),
   1613                 (__mmask16) -1);
   1614 }
   1615 
   1616 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1617 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
   1618 {
   1619   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
   1620                  (__v16sf) __W,
   1621                  (__mmask16) __U);
   1622 }
   1623 
   1624 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1625 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
   1626 {
   1627   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
   1628                  (__v16sf)
   1629                  _mm512_setzero_ps (),
   1630                  (__mmask16) __U);
   1631 }
   1632 
   1633 static  __inline__ __m128 __DEFAULT_FN_ATTRS128
   1634 _mm_rsqrt14_ss(__m128 __A, __m128 __B)
   1635 {
   1636   return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
   1637              (__v4sf) __B,
   1638              (__v4sf)
   1639              _mm_setzero_ps (),
   1640              (__mmask8) -1);
   1641 }
   1642 
   1643 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1644 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   1645 {
   1646  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
   1647           (__v4sf) __B,
   1648           (__v4sf) __W,
   1649           (__mmask8) __U);
   1650 }
   1651 
   1652 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1653 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
   1654 {
   1655  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
   1656           (__v4sf) __B,
   1657           (__v4sf) _mm_setzero_ps (),
   1658           (__mmask8) __U);
   1659 }
   1660 
   1661 static  __inline__ __m128d __DEFAULT_FN_ATTRS128
   1662 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
   1663 {
   1664   return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
   1665               (__v2df) __B,
   1666               (__v2df)
   1667               _mm_setzero_pd (),
   1668               (__mmask8) -1);
   1669 }
   1670 
   1671 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1672 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   1673 {
   1674  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
   1675           (__v2df) __B,
   1676           (__v2df) __W,
   1677           (__mmask8) __U);
   1678 }
   1679 
   1680 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1681 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
   1682 {
   1683  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
   1684           (__v2df) __B,
   1685           (__v2df) _mm_setzero_pd (),
   1686           (__mmask8) __U);
   1687 }
   1688 
   1689 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
   1690 _mm512_rcp14_pd(__m512d __A)
   1691 {
   1692   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
   1693                (__v8df)
   1694                _mm512_setzero_pd (),
   1695                (__mmask8) -1);
   1696 }
   1697 
   1698 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1699 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
   1700 {
   1701   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
   1702                 (__v8df) __W,
   1703                 (__mmask8) __U);
   1704 }
   1705 
   1706 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1707 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
   1708 {
   1709   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
   1710                 (__v8df)
   1711                 _mm512_setzero_pd (),
   1712                 (__mmask8) __U);
   1713 }
   1714 
   1715 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
   1716 _mm512_rcp14_ps(__m512 __A)
   1717 {
   1718   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
   1719               (__v16sf)
   1720               _mm512_setzero_ps (),
   1721               (__mmask16) -1);
   1722 }
   1723 
   1724 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1725 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
   1726 {
   1727   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
   1728                    (__v16sf) __W,
   1729                    (__mmask16) __U);
   1730 }
   1731 
   1732 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1733 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
   1734 {
   1735   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
   1736                    (__v16sf)
   1737                    _mm512_setzero_ps (),
   1738                    (__mmask16) __U);
   1739 }
   1740 
   1741 static  __inline__ __m128 __DEFAULT_FN_ATTRS128
   1742 _mm_rcp14_ss(__m128 __A, __m128 __B)
   1743 {
   1744   return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
   1745                  (__v4sf) __B,
   1746                  (__v4sf)
   1747                  _mm_setzero_ps (),
   1748                  (__mmask8) -1);
   1749 }
   1750 
   1751 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1752 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   1753 {
   1754  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
   1755           (__v4sf) __B,
   1756           (__v4sf) __W,
   1757           (__mmask8) __U);
   1758 }
   1759 
   1760 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1761 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
   1762 {
   1763  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
   1764           (__v4sf) __B,
   1765           (__v4sf) _mm_setzero_ps (),
   1766           (__mmask8) __U);
   1767 }
   1768 
   1769 static  __inline__ __m128d __DEFAULT_FN_ATTRS128
   1770 _mm_rcp14_sd(__m128d __A, __m128d __B)
   1771 {
   1772   return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
   1773             (__v2df) __B,
   1774             (__v2df)
   1775             _mm_setzero_pd (),
   1776             (__mmask8) -1);
   1777 }
   1778 
   1779 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1780 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   1781 {
   1782  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
   1783           (__v2df) __B,
   1784           (__v2df) __W,
   1785           (__mmask8) __U);
   1786 }
   1787 
   1788 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1789 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
   1790 {
   1791  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
   1792           (__v2df) __B,
   1793           (__v2df) _mm_setzero_pd (),
   1794           (__mmask8) __U);
   1795 }
   1796 
   1797 static __inline __m512 __DEFAULT_FN_ATTRS512
   1798 _mm512_floor_ps(__m512 __A)
   1799 {
   1800   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
   1801                                                   _MM_FROUND_FLOOR,
   1802                                                   (__v16sf) __A, (unsigned short)-1,
   1803                                                   _MM_FROUND_CUR_DIRECTION);
   1804 }
   1805 
   1806 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1807 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
   1808 {
   1809   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
   1810                    _MM_FROUND_FLOOR,
   1811                    (__v16sf) __W, __U,
   1812                    _MM_FROUND_CUR_DIRECTION);
   1813 }
   1814 
   1815 static __inline __m512d __DEFAULT_FN_ATTRS512
   1816 _mm512_floor_pd(__m512d __A)
   1817 {
   1818   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
   1819                                                    _MM_FROUND_FLOOR,
   1820                                                    (__v8df) __A, (unsigned char)-1,
   1821                                                    _MM_FROUND_CUR_DIRECTION);
   1822 }
   1823 
   1824 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1825 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
   1826 {
   1827   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
   1828                 _MM_FROUND_FLOOR,
   1829                 (__v8df) __W, __U,
   1830                 _MM_FROUND_CUR_DIRECTION);
   1831 }
   1832 
   1833 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1834 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
   1835 {
   1836   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
   1837                    _MM_FROUND_CEIL,
   1838                    (__v16sf) __W, __U,
   1839                    _MM_FROUND_CUR_DIRECTION);
   1840 }
   1841 
   1842 static __inline __m512 __DEFAULT_FN_ATTRS512
   1843 _mm512_ceil_ps(__m512 __A)
   1844 {
   1845   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
   1846                                                   _MM_FROUND_CEIL,
   1847                                                   (__v16sf) __A, (unsigned short)-1,
   1848                                                   _MM_FROUND_CUR_DIRECTION);
   1849 }
   1850 
   1851 static __inline __m512d __DEFAULT_FN_ATTRS512
   1852 _mm512_ceil_pd(__m512d __A)
   1853 {
   1854   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
   1855                                                    _MM_FROUND_CEIL,
   1856                                                    (__v8df) __A, (unsigned char)-1,
   1857                                                    _MM_FROUND_CUR_DIRECTION);
   1858 }
   1859 
   1860 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1861 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
   1862 {
   1863   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
   1864                 _MM_FROUND_CEIL,
   1865                 (__v8df) __W, __U,
   1866                 _MM_FROUND_CUR_DIRECTION);
   1867 }
   1868 
   1869 static __inline __m512i __DEFAULT_FN_ATTRS512
   1870 _mm512_abs_epi64(__m512i __A)
   1871 {
   1872   return (__m512i)__builtin_elementwise_abs((__v8di)__A);
   1873 }
   1874 
   1875 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1876 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
   1877 {
   1878   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   1879                                              (__v8di)_mm512_abs_epi64(__A),
   1880                                              (__v8di)__W);
   1881 }
   1882 
   1883 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1884 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
   1885 {
   1886   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   1887                                              (__v8di)_mm512_abs_epi64(__A),
   1888                                              (__v8di)_mm512_setzero_si512());
   1889 }
   1890 
   1891 static __inline __m512i __DEFAULT_FN_ATTRS512
   1892 _mm512_abs_epi32(__m512i __A)
   1893 {
   1894   return (__m512i)__builtin_elementwise_abs((__v16si) __A);
   1895 }
   1896 
   1897 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1898 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
   1899 {
   1900   return (__m512i)__builtin_ia32_selectd_512(__U,
   1901                                              (__v16si)_mm512_abs_epi32(__A),
   1902                                              (__v16si)__W);
   1903 }
   1904 
   1905 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   1906 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
   1907 {
   1908   return (__m512i)__builtin_ia32_selectd_512(__U,
   1909                                              (__v16si)_mm512_abs_epi32(__A),
   1910                                              (__v16si)_mm512_setzero_si512());
   1911 }
   1912 
   1913 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1914 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   1915   __A = _mm_add_ss(__A, __B);
   1916   return __builtin_ia32_selectss_128(__U, __A, __W);
   1917 }
   1918 
   1919 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   1920 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   1921   __A = _mm_add_ss(__A, __B);
   1922   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
   1923 }
   1924 
/* Unmasked form: calls __builtin_ia32_addss_round_mask on A and B with a
 * zero pass-through vector, an all-ones mask, and R as the final integer
 * argument. */
#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

/* Merge-masked form: W is the pass-through vector and U the mask. */
#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

/* Zero-masked form: the pass-through vector is all zeros and U is the
 * mask. */
#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
   1942 
   1943 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1944 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   1945   __A = _mm_add_sd(__A, __B);
   1946   return __builtin_ia32_selectsd_128(__U, __A, __W);
   1947 }
   1948 
   1949 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   1950 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   1951   __A = _mm_add_sd(__A, __B);
   1952   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
   1953 }
/* Unmasked form: calls __builtin_ia32_addsd_round_mask on A and B with a
 * zero pass-through vector, an all-ones mask, and R as the final integer
 * argument. */
#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masked form: W is the pass-through vector and U the mask. */
#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masked form: the pass-through vector is all zeros and U is the
 * mask. */
#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
   1971 
   1972 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1973 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   1974   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   1975                                               (__v8df)_mm512_add_pd(__A, __B),
   1976                                               (__v8df)__W);
   1977 }
   1978 
   1979 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   1980 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   1981   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   1982                                               (__v8df)_mm512_add_pd(__A, __B),
   1983                                               (__v8df)_mm512_setzero_pd());
   1984 }
   1985 
   1986 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1987 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   1988   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   1989                                              (__v16sf)_mm512_add_ps(__A, __B),
   1990                                              (__v16sf)__W);
   1991 }
   1992 
   1993 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   1994 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   1995   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   1996                                              (__v16sf)_mm512_add_ps(__A, __B),
   1997                                              (__v16sf)_mm512_setzero_ps());
   1998 }
   1999 
/* Calls __builtin_ia32_addpd512 on A and B (eight double lanes each) with R
 * as the builtin's final integer argument. */
#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

/* Merge-masked form: mask U selects between the _mm512_add_round_pd result
 * and the pass-through vector W. */
#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                   (__v8df)(__m512d)(W)))

/* Zero-masked form: mask U selects between the _mm512_add_round_pd result
 * and an all-zero vector. */
#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))
   2013 
/* Calls __builtin_ia32_addps512 on A and B (sixteen float lanes each) with
 * R as the builtin's final integer argument. */
#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

/* Merge-masked form: mask U selects between the _mm512_add_round_ps result
 * and the pass-through vector W. */
#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                  (__v16sf)(__m512)(W)))

/* Zero-masked form: mask U selects between the _mm512_add_round_ps result
 * and an all-zero vector. */
#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
   2027 
   2028 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   2029 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   2030   __A = _mm_sub_ss(__A, __B);
   2031   return __builtin_ia32_selectss_128(__U, __A, __W);
   2032 }
   2033 
   2034 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   2035 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   2036   __A = _mm_sub_ss(__A, __B);
   2037   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
   2038 }
/* Unmasked form: calls __builtin_ia32_subss_round_mask on A and B with a
 * zero pass-through vector, an all-ones mask, and R as the final integer
 * argument. */
#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

/* Merge-masked form: W is the pass-through vector and U the mask. */
#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

/* Zero-masked form: the pass-through vector is all zeros and U is the
 * mask. */
#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
   2056 
   2057 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   2058 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   2059   __A = _mm_sub_sd(__A, __B);
   2060   return __builtin_ia32_selectsd_128(__U, __A, __W);
   2061 }
   2062 
   2063 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   2064 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   2065   __A = _mm_sub_sd(__A, __B);
   2066   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
   2067 }
   2068 
/* Unmasked form: calls __builtin_ia32_subsd_round_mask on A and B with a
 * zero pass-through vector, an all-ones mask, and R as the final integer
 * argument. */
#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masked form: W is the pass-through vector and U the mask. */
#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masked form: the pass-through vector is all zeros and U is the
 * mask. */
#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
   2086 
   2087 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2088 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   2089   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   2090                                               (__v8df)_mm512_sub_pd(__A, __B),
   2091                                               (__v8df)__W);
   2092 }
   2093 
   2094 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2095 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   2096   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   2097                                               (__v8df)_mm512_sub_pd(__A, __B),
   2098                                               (__v8df)_mm512_setzero_pd());
   2099 }
   2100 
   2101 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2102 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   2103   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   2104                                              (__v16sf)_mm512_sub_ps(__A, __B),
   2105                                              (__v16sf)__W);
   2106 }
   2107 
   2108 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2109 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   2110   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   2111                                              (__v16sf)_mm512_sub_ps(__A, __B),
   2112                                              (__v16sf)_mm512_setzero_ps());
   2113 }
   2114 
   2115 #define _mm512_sub_round_pd(A, B, R) \
   2116   ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
   2117                                     (__v8df)(__m512d)(B), (int)(R)))
   2118 
   2119 #define _mm512_mask_sub_round_pd(W, U, A, B, R) \
   2120   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   2121                                    (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
   2122                                    (__v8df)(__m512d)(W)))
   2123 
   2124 #define _mm512_maskz_sub_round_pd(U, A, B, R) \
   2125   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   2126                                    (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
   2127                                    (__v8df)_mm512_setzero_pd()))
   2128 
   2129 #define _mm512_sub_round_ps(A, B, R) \
   2130   ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
   2131                                    (__v16sf)(__m512)(B), (int)(R)))
   2132 
   2133 #define _mm512_mask_sub_round_ps(W, U, A, B, R) \
   2134   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   2135                                   (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
   2136                                   (__v16sf)(__m512)(W)))
   2137 
   2138 #define _mm512_maskz_sub_round_ps(U, A, B, R) \
   2139   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   2140                                   (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
   2141                                   (__v16sf)_mm512_setzero_ps()))
   2142 
   2143 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   2144 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   2145   __A = _mm_mul_ss(__A, __B);
   2146   return __builtin_ia32_selectss_128(__U, __A, __W);
   2147 }
   2148 
   2149 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   2150 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   2151   __A = _mm_mul_ss(__A, __B);
   2152   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
   2153 }
   2154 #define _mm_mul_round_ss(A, B, R) \
   2155   ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
   2156                                            (__v4sf)(__m128)(B), \
   2157                                            (__v4sf)_mm_setzero_ps(), \
   2158                                            (__mmask8)-1, (int)(R)))
   2159 
   2160 #define _mm_mask_mul_round_ss(W, U, A, B, R) \
   2161   ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
   2162                                            (__v4sf)(__m128)(B), \
   2163                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
   2164                                            (int)(R)))
   2165 
   2166 #define _mm_maskz_mul_round_ss(U, A, B, R) \
   2167   ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
   2168                                            (__v4sf)(__m128)(B), \
   2169                                            (__v4sf)_mm_setzero_ps(), \
   2170                                            (__mmask8)(U), (int)(R)))
   2171 
   2172 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   2173 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   2174   __A = _mm_mul_sd(__A, __B);
   2175   return __builtin_ia32_selectsd_128(__U, __A, __W);
   2176 }
   2177 
   2178 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   2179 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   2180   __A = _mm_mul_sd(__A, __B);
   2181   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
   2182 }
   2183 
   2184 #define _mm_mul_round_sd(A, B, R) \
   2185   ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
   2186                                             (__v2df)(__m128d)(B), \
   2187                                             (__v2df)_mm_setzero_pd(), \
   2188                                             (__mmask8)-1, (int)(R)))
   2189 
   2190 #define _mm_mask_mul_round_sd(W, U, A, B, R) \
   2191   ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
   2192                                             (__v2df)(__m128d)(B), \
   2193                                             (__v2df)(__m128d)(W), \
   2194                                             (__mmask8)(U), (int)(R)))
   2195 
   2196 #define _mm_maskz_mul_round_sd(U, A, B, R) \
   2197   ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
   2198                                             (__v2df)(__m128d)(B), \
   2199                                             (__v2df)_mm_setzero_pd(), \
   2200                                             (__mmask8)(U), (int)(R)))
   2201 
   2202 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2203 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   2204   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   2205                                               (__v8df)_mm512_mul_pd(__A, __B),
   2206                                               (__v8df)__W);
   2207 }
   2208 
   2209 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2210 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   2211   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   2212                                               (__v8df)_mm512_mul_pd(__A, __B),
   2213                                               (__v8df)_mm512_setzero_pd());
   2214 }
   2215 
   2216 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2217 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   2218   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   2219                                              (__v16sf)_mm512_mul_ps(__A, __B),
   2220                                              (__v16sf)__W);
   2221 }
   2222 
   2223 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2224 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   2225   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   2226                                              (__v16sf)_mm512_mul_ps(__A, __B),
   2227                                              (__v16sf)_mm512_setzero_ps());
   2228 }
   2229 
   2230 #define _mm512_mul_round_pd(A, B, R) \
   2231   ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
   2232                                     (__v8df)(__m512d)(B), (int)(R)))
   2233 
   2234 #define _mm512_mask_mul_round_pd(W, U, A, B, R) \
   2235   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   2236                                    (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
   2237                                    (__v8df)(__m512d)(W)))
   2238 
   2239 #define _mm512_maskz_mul_round_pd(U, A, B, R) \
   2240   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   2241                                    (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
   2242                                    (__v8df)_mm512_setzero_pd()))
   2243 
   2244 #define _mm512_mul_round_ps(A, B, R) \
   2245   ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
   2246                                   (__v16sf)(__m512)(B), (int)(R)))
   2247 
   2248 #define _mm512_mask_mul_round_ps(W, U, A, B, R) \
   2249   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   2250                                   (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
   2251                                   (__v16sf)(__m512)(W)))
   2252 
   2253 #define _mm512_maskz_mul_round_ps(U, A, B, R) \
   2254   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   2255                                   (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
   2256                                   (__v16sf)_mm512_setzero_ps()))
   2257 
   2258 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   2259 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   2260   __A = _mm_div_ss(__A, __B);
   2261   return __builtin_ia32_selectss_128(__U, __A, __W);
   2262 }
   2263 
   2264 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   2265 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   2266   __A = _mm_div_ss(__A, __B);
   2267   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
   2268 }
   2269 
   2270 #define _mm_div_round_ss(A, B, R) \
   2271   ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
   2272                                            (__v4sf)(__m128)(B), \
   2273                                            (__v4sf)_mm_setzero_ps(), \
   2274                                            (__mmask8)-1, (int)(R)))
   2275 
   2276 #define _mm_mask_div_round_ss(W, U, A, B, R) \
   2277   ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
   2278                                            (__v4sf)(__m128)(B), \
   2279                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
   2280                                            (int)(R)))
   2281 
   2282 #define _mm_maskz_div_round_ss(U, A, B, R) \
   2283   ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
   2284                                            (__v4sf)(__m128)(B), \
   2285                                            (__v4sf)_mm_setzero_ps(), \
   2286                                            (__mmask8)(U), (int)(R)))
   2287 
   2288 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   2289 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   2290   __A = _mm_div_sd(__A, __B);
   2291   return __builtin_ia32_selectsd_128(__U, __A, __W);
   2292 }
   2293 
   2294 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   2295 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   2296   __A = _mm_div_sd(__A, __B);
   2297   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
   2298 }
   2299 
   2300 #define _mm_div_round_sd(A, B, R) \
   2301   ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
   2302                                             (__v2df)(__m128d)(B), \
   2303                                             (__v2df)_mm_setzero_pd(), \
   2304                                             (__mmask8)-1, (int)(R)))
   2305 
   2306 #define _mm_mask_div_round_sd(W, U, A, B, R) \
   2307   ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
   2308                                             (__v2df)(__m128d)(B), \
   2309                                             (__v2df)(__m128d)(W), \
   2310                                             (__mmask8)(U), (int)(R)))
   2311 
   2312 #define _mm_maskz_div_round_sd(U, A, B, R) \
   2313   ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
   2314                                             (__v2df)(__m128d)(B), \
   2315                                             (__v2df)_mm_setzero_pd(), \
   2316                                             (__mmask8)(U), (int)(R)))
   2317 
   2318 static __inline __m512d __DEFAULT_FN_ATTRS512
   2319 _mm512_div_pd(__m512d __a, __m512d __b)
   2320 {
   2321   return (__m512d)((__v8df)__a/(__v8df)__b);
   2322 }
   2323 
   2324 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2325 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   2326   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   2327                                               (__v8df)_mm512_div_pd(__A, __B),
   2328                                               (__v8df)__W);
   2329 }
   2330 
   2331 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2332 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   2333   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   2334                                               (__v8df)_mm512_div_pd(__A, __B),
   2335                                               (__v8df)_mm512_setzero_pd());
   2336 }
   2337 
   2338 static __inline __m512 __DEFAULT_FN_ATTRS512
   2339 _mm512_div_ps(__m512 __a, __m512 __b)
   2340 {
   2341   return (__m512)((__v16sf)__a/(__v16sf)__b);
   2342 }
   2343 
   2344 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2345 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   2346   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   2347                                              (__v16sf)_mm512_div_ps(__A, __B),
   2348                                              (__v16sf)__W);
   2349 }
   2350 
   2351 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2352 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   2353   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   2354                                              (__v16sf)_mm512_div_ps(__A, __B),
   2355                                              (__v16sf)_mm512_setzero_ps());
   2356 }
   2357 
   2358 #define _mm512_div_round_pd(A, B, R) \
   2359   ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
   2360                                     (__v8df)(__m512d)(B), (int)(R)))
   2361 
   2362 #define _mm512_mask_div_round_pd(W, U, A, B, R) \
   2363   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   2364                                    (__v8df)_mm512_div_round_pd((A), (B), (R)), \
   2365                                    (__v8df)(__m512d)(W)))
   2366 
   2367 #define _mm512_maskz_div_round_pd(U, A, B, R) \
   2368   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   2369                                    (__v8df)_mm512_div_round_pd((A), (B), (R)), \
   2370                                    (__v8df)_mm512_setzero_pd()))
   2371 
   2372 #define _mm512_div_round_ps(A, B, R) \
   2373   ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
   2374                                    (__v16sf)(__m512)(B), (int)(R)))
   2375 
   2376 #define _mm512_mask_div_round_ps(W, U, A, B, R) \
   2377   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   2378                                   (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
   2379                                   (__v16sf)(__m512)(W)))
   2380 
   2381 #define _mm512_maskz_div_round_ps(U, A, B, R) \
   2382   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   2383                                   (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
   2384                                   (__v16sf)_mm512_setzero_ps()))
   2385 
   2386 #define _mm512_roundscale_ps(A, B) \
   2387   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
   2388                                           (__v16sf)_mm512_undefined_ps(), \
   2389                                           (__mmask16)-1, \
   2390                                           _MM_FROUND_CUR_DIRECTION))
   2391 
   2392 #define _mm512_mask_roundscale_ps(A, B, C, imm) \
   2393   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
   2394                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
   2395                                          _MM_FROUND_CUR_DIRECTION))
   2396 
   2397 #define _mm512_maskz_roundscale_ps(A, B, imm) \
   2398   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
   2399                                           (__v16sf)_mm512_setzero_ps(), \
   2400                                           (__mmask16)(A), \
   2401                                           _MM_FROUND_CUR_DIRECTION))
   2402 
   2403 #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
   2404   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
   2405                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
   2406                                          (int)(R)))
   2407 
   2408 #define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
   2409   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
   2410                                           (__v16sf)_mm512_setzero_ps(), \
   2411                                           (__mmask16)(A), (int)(R)))
   2412 
   2413 #define _mm512_roundscale_round_ps(A, imm, R) \
   2414   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
   2415                                           (__v16sf)_mm512_undefined_ps(), \
   2416                                           (__mmask16)-1, (int)(R)))
   2417 
   2418 #define _mm512_roundscale_pd(A, B) \
   2419   ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
   2420                                            (__v8df)_mm512_undefined_pd(), \
   2421                                            (__mmask8)-1, \
   2422                                            _MM_FROUND_CUR_DIRECTION))
   2423 
   2424 #define _mm512_mask_roundscale_pd(A, B, C, imm) \
   2425   ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
   2426                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
   2427                                           _MM_FROUND_CUR_DIRECTION))
   2428 
   2429 #define _mm512_maskz_roundscale_pd(A, B, imm) \
   2430   ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
   2431                                            (__v8df)_mm512_setzero_pd(), \
   2432                                            (__mmask8)(A), \
   2433                                            _MM_FROUND_CUR_DIRECTION))
   2434 
   2435 #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
   2436   ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
   2437                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
   2438                                           (int)(R)))
   2439 
   2440 #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
   2441   ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
   2442                                            (__v8df)_mm512_setzero_pd(), \
   2443                                            (__mmask8)(A), (int)(R)))
   2444 
   2445 #define _mm512_roundscale_round_pd(A, imm, R) \
   2446   ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
   2447                                            (__v8df)_mm512_undefined_pd(), \
   2448                                            (__mmask8)-1, (int)(R)))
   2449 
   2450 #define _mm512_fmadd_round_pd(A, B, C, R) \
   2451   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
   2452                                             (__v8df)(__m512d)(B), \
   2453                                             (__v8df)(__m512d)(C), \
   2454                                             (__mmask8)-1, (int)(R)))
   2455 
   2456 
   2457 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
   2458   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
   2459                                             (__v8df)(__m512d)(B), \
   2460                                             (__v8df)(__m512d)(C), \
   2461                                             (__mmask8)(U), (int)(R)))
   2462 
   2463 
   2464 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
   2465   ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
   2466                                              (__v8df)(__m512d)(B), \
   2467                                              (__v8df)(__m512d)(C), \
   2468                                              (__mmask8)(U), (int)(R)))
   2469 
   2470 
   2471 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
   2472   ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
   2473                                              (__v8df)(__m512d)(B), \
   2474                                              (__v8df)(__m512d)(C), \
   2475                                              (__mmask8)(U), (int)(R)))
   2476 
   2477 
   2478 #define _mm512_fmsub_round_pd(A, B, C, R) \
   2479   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
   2480                                             (__v8df)(__m512d)(B), \
   2481                                             -(__v8df)(__m512d)(C), \
   2482                                             (__mmask8)-1, (int)(R)))
   2483 
   2484 
   2485 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
   2486   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
   2487                                             (__v8df)(__m512d)(B), \
   2488                                             -(__v8df)(__m512d)(C), \
   2489                                             (__mmask8)(U), (int)(R)))
   2490 
   2491 
   2492 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
   2493   ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
   2494                                              (__v8df)(__m512d)(B), \
   2495                                              -(__v8df)(__m512d)(C), \
   2496                                              (__mmask8)(U), (int)(R)))
   2497 
   2498 
   2499 #define _mm512_fnmadd_round_pd(A, B, C, R) \
   2500   ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
   2501                                             (__v8df)(__m512d)(B), \
   2502                                             (__v8df)(__m512d)(C), \
   2503                                             (__mmask8)-1, (int)(R)))
   2504 
   2505 
   2506 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
   2507   ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
   2508                                              (__v8df)(__m512d)(B), \
   2509                                              (__v8df)(__m512d)(C), \
   2510                                              (__mmask8)(U), (int)(R)))
   2511 
   2512 
   2513 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
   2514   ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
   2515                                              (__v8df)(__m512d)(B), \
   2516                                              (__v8df)(__m512d)(C), \
   2517                                              (__mmask8)(U), (int)(R)))
   2518 
   2519 
   2520 #define _mm512_fnmsub_round_pd(A, B, C, R) \
   2521   ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
   2522                                             (__v8df)(__m512d)(B), \
   2523                                             -(__v8df)(__m512d)(C), \
   2524                                             (__mmask8)-1, (int)(R)))
   2525 
   2526 
   2527 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
   2528   ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
   2529                                              (__v8df)(__m512d)(B), \
   2530                                              -(__v8df)(__m512d)(C), \
   2531                                              (__mmask8)(U), (int)(R)))
   2532 
   2533 
   2534 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2535 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
   2536 {
   2537   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   2538                                                     (__v8df) __B,
   2539                                                     (__v8df) __C,
   2540                                                     (__mmask8) -1,
   2541                                                     _MM_FROUND_CUR_DIRECTION);
   2542 }
   2543 
   2544 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2545 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
   2546 {
   2547   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   2548                                                     (__v8df) __B,
   2549                                                     (__v8df) __C,
   2550                                                     (__mmask8) __U,
   2551                                                     _MM_FROUND_CUR_DIRECTION);
   2552 }
   2553 
   2554 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2555 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
   2556 {
   2557   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
   2558                                                      (__v8df) __B,
   2559                                                      (__v8df) __C,
   2560                                                      (__mmask8) __U,
   2561                                                      _MM_FROUND_CUR_DIRECTION);
   2562 }
   2563 
   2564 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2565 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
   2566 {
   2567   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
   2568                                                      (__v8df) __B,
   2569                                                      (__v8df) __C,
   2570                                                      (__mmask8) __U,
   2571                                                      _MM_FROUND_CUR_DIRECTION);
   2572 }
   2573 
   2574 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2575 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
   2576 {
   2577   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   2578                                                     (__v8df) __B,
   2579                                                     -(__v8df) __C,
   2580                                                     (__mmask8) -1,
   2581                                                     _MM_FROUND_CUR_DIRECTION);
   2582 }
   2583 
   2584 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2585 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
   2586 {
   2587   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   2588                                                     (__v8df) __B,
   2589                                                     -(__v8df) __C,
   2590                                                     (__mmask8) __U,
   2591                                                     _MM_FROUND_CUR_DIRECTION);
   2592 }
   2593 
   2594 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2595 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
   2596 {
   2597   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
   2598                                                      (__v8df) __B,
   2599                                                      -(__v8df) __C,
   2600                                                      (__mmask8) __U,
   2601                                                      _MM_FROUND_CUR_DIRECTION);
   2602 }
   2603 
   2604 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2605 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
   2606 {
   2607   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   2608                                                     -(__v8df) __B,
   2609                                                     (__v8df) __C,
   2610                                                     (__mmask8) -1,
   2611                                                     _MM_FROUND_CUR_DIRECTION);
   2612 }
   2613 
   2614 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2615 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
   2616 {
   2617   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
   2618                                                      (__v8df) __B,
   2619                                                      (__v8df) __C,
   2620                                                      (__mmask8) __U,
   2621                                                      _MM_FROUND_CUR_DIRECTION);
   2622 }
   2623 
   2624 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2625 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
   2626 {
   2627   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
   2628                                                      (__v8df) __B,
   2629                                                      (__v8df) __C,
   2630                                                      (__mmask8) __U,
   2631                                                      _MM_FROUND_CUR_DIRECTION);
   2632 }
   2633 
   2634 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2635 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
   2636 {
   2637   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   2638                                                     -(__v8df) __B,
   2639                                                     -(__v8df) __C,
   2640                                                     (__mmask8) -1,
   2641                                                     _MM_FROUND_CUR_DIRECTION);
   2642 }
   2643 
   2644 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2645 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
   2646 {
   2647   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
   2648                                                      (__v8df) __B,
   2649                                                      -(__v8df) __C,
   2650                                                      (__mmask8) __U,
   2651                                                      _MM_FROUND_CUR_DIRECTION);
   2652 }
   2653 
   2654 #define _mm512_fmadd_round_ps(A, B, C, R) \
   2655   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   2656                                            (__v16sf)(__m512)(B), \
   2657                                            (__v16sf)(__m512)(C), \
   2658                                            (__mmask16)-1, (int)(R)))
   2659 
   2660 
   2661 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
   2662   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   2663                                            (__v16sf)(__m512)(B), \
   2664                                            (__v16sf)(__m512)(C), \
   2665                                            (__mmask16)(U), (int)(R)))
   2666 
   2667 
   2668 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
   2669   ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
   2670                                             (__v16sf)(__m512)(B), \
   2671                                             (__v16sf)(__m512)(C), \
   2672                                             (__mmask16)(U), (int)(R)))
   2673 
   2674 
   2675 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
   2676   ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
   2677                                             (__v16sf)(__m512)(B), \
   2678                                             (__v16sf)(__m512)(C), \
   2679                                             (__mmask16)(U), (int)(R)))
   2680 
   2681 
   2682 #define _mm512_fmsub_round_ps(A, B, C, R) \
   2683   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   2684                                            (__v16sf)(__m512)(B), \
   2685                                            -(__v16sf)(__m512)(C), \
   2686                                            (__mmask16)-1, (int)(R)))
   2687 
   2688 
   /* Masked fmsub: calls __builtin_ia32_vfmaddps512_mask on (A, B, -C) with U
    * as the __mmask16 mask and R as the int rounding argument. */
   2689 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
   2690   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   2691                                            (__v16sf)(__m512)(B), \
   2692                                            -(__v16sf)(__m512)(C), \
   2693                                            (__mmask16)(U), (int)(R)))
   2694 
   2695 
   /* maskz fmsub: calls __builtin_ia32_vfmaddps512_maskz on (A, B, -C) with U
    * as the __mmask16 mask and R as the int rounding argument. */
   2696 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
   2697   ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
   2698                                             (__v16sf)(__m512)(B), \
   2699                                             -(__v16sf)(__m512)(C), \
   2700                                             (__mmask16)(U), (int)(R)))
   2701 
   2702 
   /* fnmadd is expressed through the fmadd builtin applied to (A, -B, C): only
    * B is negated. The mask is (__mmask16)-1 (all bits set) and R is forwarded
    * as the int rounding argument. */
   2703 #define _mm512_fnmadd_round_ps(A, B, C, R) \
   2704   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   2705                                            -(__v16sf)(__m512)(B), \
   2706                                            (__v16sf)(__m512)(C), \
   2707                                            (__mmask16)-1, (int)(R)))
   2708 
   2709 
   /* mask3 fnmadd: calls __builtin_ia32_vfmaddps512_mask3 on (-A, B, C). Here
    * the negation is applied to A, whereas the unmasked form negates B. U is
    * the __mmask16 mask and R the int rounding argument. */
   2710 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
   2711   ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
   2712                                             (__v16sf)(__m512)(B), \
   2713                                             (__v16sf)(__m512)(C), \
   2714                                             (__mmask16)(U), (int)(R)))
   2715 
   2716 
   /* maskz fnmadd: calls __builtin_ia32_vfmaddps512_maskz on (-A, B, C), with
    * the negation applied to A. U is the __mmask16 mask and R the int rounding
    * argument. */
   2717 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
   2718   ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
   2719                                             (__v16sf)(__m512)(B), \
   2720                                             (__v16sf)(__m512)(C), \
   2721                                             (__mmask16)(U), (int)(R)))
   2722 
   2723 
   /* fnmsub is expressed through the fmadd builtin applied to (A, -B, -C): both
    * B and C are negated. The mask is (__mmask16)-1 (all bits set) and R is
    * forwarded as the int rounding argument. */
   2724 #define _mm512_fnmsub_round_ps(A, B, C, R) \
   2725   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   2726                                            -(__v16sf)(__m512)(B), \
   2727                                            -(__v16sf)(__m512)(C), \
   2728                                            (__mmask16)-1, (int)(R)))
   2729 
   2730 
   /* maskz fnmsub: calls __builtin_ia32_vfmaddps512_maskz on (-A, B, -C), i.e.
    * A and C are negated. U is the __mmask16 mask and R the int rounding
    * argument. */
   2731 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
   2732   ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
   2733                                             (__v16sf)(__m512)(B), \
   2734                                             -(__v16sf)(__m512)(C), \
   2735                                             (__mmask16)(U), (int)(R)))
   2736 
   2737 
   /* Calls __builtin_ia32_vfmaddps512_mask on __A, __B, __C (as 16 x float)
    * with a mask of (__mmask16)-1 (all bits set) and _MM_FROUND_CUR_DIRECTION
    * as the rounding argument; the result is returned as __m512. */
   2738 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2739 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
   2740 {
   2741   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   2742                                                    (__v16sf) __B,
   2743                                                    (__v16sf) __C,
   2744                                                    (__mmask16) -1,
   2745                                                    _MM_FROUND_CUR_DIRECTION);
   2746 }
   2747 
   /* Calls __builtin_ia32_vfmaddps512_mask on __A, __B, __C (as 16 x float)
    * with caller-supplied mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   2748 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2749 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
   2750 {
   2751   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   2752                                                    (__v16sf) __B,
   2753                                                    (__v16sf) __C,
   2754                                                    (__mmask16) __U,
   2755                                                    _MM_FROUND_CUR_DIRECTION);
   2756 }
   2757 
   /* Calls __builtin_ia32_vfmaddps512_mask3 on __A, __B, __C (as 16 x float)
    * with mask __U (taken as the last parameter) and _MM_FROUND_CUR_DIRECTION
    * as the rounding argument. */
   2758 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2759 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
   2760 {
   2761   return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
   2762                                                     (__v16sf) __B,
   2763                                                     (__v16sf) __C,
   2764                                                     (__mmask16) __U,
   2765                                                     _MM_FROUND_CUR_DIRECTION);
   2766 }
   2767 
   /* Calls __builtin_ia32_vfmaddps512_maskz on __A, __B, __C (as 16 x float)
    * with mask __U (taken as the first parameter) and
    * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2768 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2769 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
   2770 {
   2771   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
   2772                                                     (__v16sf) __B,
   2773                                                     (__v16sf) __C,
   2774                                                     (__mmask16) __U,
   2775                                                     _MM_FROUND_CUR_DIRECTION);
   2776 }
   2777 
   /* fmsub via the fmadd builtin: calls __builtin_ia32_vfmaddps512_mask on
    * (__A, __B, -__C) with an all-bits-set mask and _MM_FROUND_CUR_DIRECTION as
    * the rounding argument. */
   2778 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2779 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
   2780 {
   2781   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   2782                                                    (__v16sf) __B,
   2783                                                    -(__v16sf) __C,
   2784                                                    (__mmask16) -1,
   2785                                                    _MM_FROUND_CUR_DIRECTION);
   2786 }
   2787 
   /* Masked fmsub: calls __builtin_ia32_vfmaddps512_mask on (__A, __B, -__C)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2788 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2789 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
   2790 {
   2791   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   2792                                                    (__v16sf) __B,
   2793                                                    -(__v16sf) __C,
   2794                                                    (__mmask16) __U,
   2795                                                    _MM_FROUND_CUR_DIRECTION);
   2796 }
   2797 
   /* maskz fmsub: calls __builtin_ia32_vfmaddps512_maskz on (__A, __B, -__C)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2798 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2799 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
   2800 {
   2801   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
   2802                                                     (__v16sf) __B,
   2803                                                     -(__v16sf) __C,
   2804                                                     (__mmask16) __U,
   2805                                                     _MM_FROUND_CUR_DIRECTION);
   2806 }
   2807 
   /* fnmadd via the fmadd builtin: calls __builtin_ia32_vfmaddps512_mask on
    * (__A, -__B, __C) with an all-bits-set mask and _MM_FROUND_CUR_DIRECTION as
    * the rounding argument. */
   2808 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2809 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
   2810 {
   2811   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   2812                                                    -(__v16sf) __B,
   2813                                                    (__v16sf) __C,
   2814                                                    (__mmask16) -1,
   2815                                                    _MM_FROUND_CUR_DIRECTION);
   2816 }
   2817 
   /* mask3 fnmadd: calls __builtin_ia32_vfmaddps512_mask3 on (-__A, __B, __C)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. The
    * negation is applied to __A here, whereas _mm512_fnmadd_ps negates __B. */
   2818 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2819 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
   2820 {
   2821   return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
   2822                                                     (__v16sf) __B,
   2823                                                     (__v16sf) __C,
   2824                                                     (__mmask16) __U,
   2825                                                     _MM_FROUND_CUR_DIRECTION);
   2826 }
   2827 
   /* maskz fnmadd: calls __builtin_ia32_vfmaddps512_maskz on (-__A, __B, __C)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2828 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2829 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
   2830 {
   2831   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
   2832                                                     (__v16sf) __B,
   2833                                                     (__v16sf) __C,
   2834                                                     (__mmask16) __U,
   2835                                                     _MM_FROUND_CUR_DIRECTION);
   2836 }
   2837 
   /* fnmsub via the fmadd builtin: calls __builtin_ia32_vfmaddps512_mask on
    * (__A, -__B, -__C) with an all-bits-set mask and _MM_FROUND_CUR_DIRECTION
    * as the rounding argument. */
   2838 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2839 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
   2840 {
   2841   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   2842                                                    -(__v16sf) __B,
   2843                                                    -(__v16sf) __C,
   2844                                                    (__mmask16) -1,
   2845                                                    _MM_FROUND_CUR_DIRECTION);
   2846 }
   2847 
   /* maskz fnmsub: calls __builtin_ia32_vfmaddps512_maskz on (-__A, __B, -__C)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2848 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   2849 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
   2850 {
   2851   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
   2852                                                     (__v16sf) __B,
   2853                                                     -(__v16sf) __C,
   2854                                                     (__mmask16) __U,
   2855                                                     _MM_FROUND_CUR_DIRECTION);
   2856 }
   2857 
   /* Expands to __builtin_ia32_vfmaddsubpd512_mask on A, B and C (each viewed
    * as an 8 x double vector) with a mask of (__mmask8)-1 (all bits set) and R
    * as the int rounding argument. */
   2858 #define _mm512_fmaddsub_round_pd(A, B, C, R) \
   2859   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
   2860                                                (__v8df)(__m512d)(B), \
   2861                                                (__v8df)(__m512d)(C), \
   2862                                                (__mmask8)-1, (int)(R)))
   2863 
   2864 
   /* Expands to __builtin_ia32_vfmaddsubpd512_mask on A, B and C (8 x double)
    * with U as the __mmask8 mask and R as the int rounding argument. */
   2865 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
   2866   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
   2867                                                (__v8df)(__m512d)(B), \
   2868                                                (__v8df)(__m512d)(C), \
   2869                                                (__mmask8)(U), (int)(R)))
   2870 
   2871 
   /* Expands to __builtin_ia32_vfmaddsubpd512_mask3 on A, B and C (8 x double)
    * with U as the __mmask8 mask and R as the int rounding argument. */
   2872 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
   2873   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
   2874                                                 (__v8df)(__m512d)(B), \
   2875                                                 (__v8df)(__m512d)(C), \
   2876                                                 (__mmask8)(U), (int)(R)))
   2877 
   2878 
   /* Expands to __builtin_ia32_vfmaddsubpd512_maskz on A, B and C (8 x double)
    * with U as the __mmask8 mask and R as the int rounding argument. */
   2879 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
   2880   ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
   2881                                                 (__v8df)(__m512d)(B), \
   2882                                                 (__v8df)(__m512d)(C), \
   2883                                                 (__mmask8)(U), (int)(R)))
   2884 
   2885 
   /* fmsubadd is expressed through the fmaddsub builtin applied to (A, B, -C):
    * only C is negated. The mask is (__mmask8)-1 (all bits set) and R is the
    * int rounding argument. */
   2886 #define _mm512_fmsubadd_round_pd(A, B, C, R) \
   2887   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
   2888                                                (__v8df)(__m512d)(B), \
   2889                                                -(__v8df)(__m512d)(C), \
   2890                                                (__mmask8)-1, (int)(R)))
   2891 
   2892 
   /* Masked fmsubadd: calls __builtin_ia32_vfmaddsubpd512_mask on (A, B, -C)
    * with U as the __mmask8 mask and R as the int rounding argument. */
   2893 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
   2894   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
   2895                                                (__v8df)(__m512d)(B), \
   2896                                                -(__v8df)(__m512d)(C), \
   2897                                                (__mmask8)(U), (int)(R)))
   2898 
   2899 
   /* maskz fmsubadd: calls __builtin_ia32_vfmaddsubpd512_maskz on (A, B, -C)
    * with U as the __mmask8 mask and R as the int rounding argument. */
   2900 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
   2901   ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
   2902                                                 (__v8df)(__m512d)(B), \
   2903                                                 -(__v8df)(__m512d)(C), \
   2904                                                 (__mmask8)(U), (int)(R)))
   2905 
   2906 
   /* Calls __builtin_ia32_vfmaddsubpd512_mask on __A, __B, __C (as 8 x double)
    * with a mask of (__mmask8)-1 (all bits set) and _MM_FROUND_CUR_DIRECTION as
    * the rounding argument. */
   2907 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2908 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
   2909 {
   2910   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
   2911                                                       (__v8df) __B,
   2912                                                       (__v8df) __C,
   2913                                                       (__mmask8) -1,
   2914                                                       _MM_FROUND_CUR_DIRECTION);
   2915 }
   2916 
   /* Calls __builtin_ia32_vfmaddsubpd512_mask on __A, __B, __C (as 8 x double)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2917 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2918 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
   2919 {
   2920   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
   2921                                                       (__v8df) __B,
   2922                                                       (__v8df) __C,
   2923                                                       (__mmask8) __U,
   2924                                                       _MM_FROUND_CUR_DIRECTION);
   2925 }
   2926 
   /* Calls __builtin_ia32_vfmaddsubpd512_mask3 on __A, __B, __C (as 8 x double)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2927 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2928 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
   2929 {
   2930   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
   2931                                                        (__v8df) __B,
   2932                                                        (__v8df) __C,
   2933                                                        (__mmask8) __U,
   2934                                                        _MM_FROUND_CUR_DIRECTION);
   2935 }
   2936 
   /* Calls __builtin_ia32_vfmaddsubpd512_maskz on __A, __B, __C (as 8 x double)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2937 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2938 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
   2939 {
   2940   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
   2941                                                        (__v8df) __B,
   2942                                                        (__v8df) __C,
   2943                                                        (__mmask8) __U,
   2944                                                        _MM_FROUND_CUR_DIRECTION);
   2945 }
   2946 
   /* fmsubadd via the fmaddsub builtin: calls
    * __builtin_ia32_vfmaddsubpd512_mask on (__A, __B, -__C) with an
    * all-bits-set mask and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   2947 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2948 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
   2949 {
   2950   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
   2951                                                        (__v8df) __B,
   2952                                                        -(__v8df) __C,
   2953                                                        (__mmask8) -1,
   2954                                                        _MM_FROUND_CUR_DIRECTION);
   2955 }
   2956 
   /* Masked fmsubadd: calls __builtin_ia32_vfmaddsubpd512_mask on
    * (__A, __B, -__C) with mask __U and _MM_FROUND_CUR_DIRECTION as the
    * rounding argument. */
   2957 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2958 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
   2959 {
   2960   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
   2961                                                        (__v8df) __B,
   2962                                                        -(__v8df) __C,
   2963                                                        (__mmask8) __U,
   2964                                                        _MM_FROUND_CUR_DIRECTION);
   2965 }
   2966 
   /* maskz fmsubadd: calls __builtin_ia32_vfmaddsubpd512_maskz on
    * (__A, __B, -__C) with mask __U and _MM_FROUND_CUR_DIRECTION as the
    * rounding argument. */
   2967 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   2968 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
   2969 {
   2970   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
   2971                                                         (__v8df) __B,
   2972                                                         -(__v8df) __C,
   2973                                                         (__mmask8) __U,
   2974                                                         _MM_FROUND_CUR_DIRECTION);
   2975 }
   2976 
   /* Expands to __builtin_ia32_vfmaddsubps512_mask on A, B and C (each viewed
    * as a 16 x float vector) with a mask of (__mmask16)-1 (all bits set) and R
    * as the int rounding argument. */
   2977 #define _mm512_fmaddsub_round_ps(A, B, C, R) \
   2978   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
   2979                                               (__v16sf)(__m512)(B), \
   2980                                               (__v16sf)(__m512)(C), \
   2981                                               (__mmask16)-1, (int)(R)))
   2982 
   2983 
   /* Expands to __builtin_ia32_vfmaddsubps512_mask on A, B and C (16 x float)
    * with U as the __mmask16 mask and R as the int rounding argument. */
   2984 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
   2985   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
   2986                                               (__v16sf)(__m512)(B), \
   2987                                               (__v16sf)(__m512)(C), \
   2988                                               (__mmask16)(U), (int)(R)))
   2989 
   2990 
   /* Expands to __builtin_ia32_vfmaddsubps512_mask3 on A, B and C (16 x float)
    * with U as the __mmask16 mask and R as the int rounding argument. */
   2991 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
   2992   ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
   2993                                                (__v16sf)(__m512)(B), \
   2994                                                (__v16sf)(__m512)(C), \
   2995                                                (__mmask16)(U), (int)(R)))
   2996 
   2997 
   /* Expands to __builtin_ia32_vfmaddsubps512_maskz on A, B and C (16 x float)
    * with U as the __mmask16 mask and R as the int rounding argument. */
   2998 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
   2999   ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
   3000                                                (__v16sf)(__m512)(B), \
   3001                                                (__v16sf)(__m512)(C), \
   3002                                                (__mmask16)(U), (int)(R)))
   3003 
   3004 
   /* fmsubadd is expressed through the fmaddsub builtin applied to (A, B, -C):
    * only C is negated. The mask is (__mmask16)-1 (all bits set) and R is the
    * int rounding argument. */
   3005 #define _mm512_fmsubadd_round_ps(A, B, C, R) \
   3006   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
   3007                                               (__v16sf)(__m512)(B), \
   3008                                               -(__v16sf)(__m512)(C), \
   3009                                               (__mmask16)-1, (int)(R)))
   3010 
   3011 
   /* Masked fmsubadd: calls __builtin_ia32_vfmaddsubps512_mask on (A, B, -C)
    * with U as the __mmask16 mask and R as the int rounding argument. */
   3012 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
   3013   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
   3014                                               (__v16sf)(__m512)(B), \
   3015                                               -(__v16sf)(__m512)(C), \
   3016                                               (__mmask16)(U), (int)(R)))
   3017 
   3018 
   /* maskz fmsubadd: calls __builtin_ia32_vfmaddsubps512_maskz on (A, B, -C)
    * with U as the __mmask16 mask and R as the int rounding argument. */
   3019 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
   3020   ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
   3021                                                (__v16sf)(__m512)(B), \
   3022                                                -(__v16sf)(__m512)(C), \
   3023                                                (__mmask16)(U), (int)(R)))
   3024 
   3025 
   /* Calls __builtin_ia32_vfmaddsubps512_mask on __A, __B, __C (as 16 x float)
    * with a mask of (__mmask16)-1 (all bits set) and _MM_FROUND_CUR_DIRECTION
    * as the rounding argument. */
   3026 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3027 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
   3028 {
   3029   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
   3030                                                       (__v16sf) __B,
   3031                                                       (__v16sf) __C,
   3032                                                       (__mmask16) -1,
   3033                                                       _MM_FROUND_CUR_DIRECTION);
   3034 }
   3035 
   /* Calls __builtin_ia32_vfmaddsubps512_mask on __A, __B, __C (as 16 x float)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   3036 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3037 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
   3038 {
   3039   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
   3040                                                       (__v16sf) __B,
   3041                                                       (__v16sf) __C,
   3042                                                       (__mmask16) __U,
   3043                                                       _MM_FROUND_CUR_DIRECTION);
   3044 }
   3045 
   /* Calls __builtin_ia32_vfmaddsubps512_mask3 on __A, __B, __C (as 16 x float)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   3046 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3047 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
   3048 {
   3049   return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
   3050                                                        (__v16sf) __B,
   3051                                                        (__v16sf) __C,
   3052                                                        (__mmask16) __U,
   3053                                                        _MM_FROUND_CUR_DIRECTION);
   3054 }
   3055 
   /* Calls __builtin_ia32_vfmaddsubps512_maskz on __A, __B, __C (as 16 x float)
    * with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   3056 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3057 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
   3058 {
   3059   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
   3060                                                        (__v16sf) __B,
   3061                                                        (__v16sf) __C,
   3062                                                        (__mmask16) __U,
   3063                                                        _MM_FROUND_CUR_DIRECTION);
   3064 }
   3065 
   /* fmsubadd via the fmaddsub builtin: calls
    * __builtin_ia32_vfmaddsubps512_mask on (__A, __B, -__C) with an
    * all-bits-set mask and _MM_FROUND_CUR_DIRECTION as the rounding argument. */
   3066 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3067 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
   3068 {
   3069   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
   3070                                                       (__v16sf) __B,
   3071                                                       -(__v16sf) __C,
   3072                                                       (__mmask16) -1,
   3073                                                       _MM_FROUND_CUR_DIRECTION);
   3074 }
   3075 
   /* Masked fmsubadd: calls __builtin_ia32_vfmaddsubps512_mask on
    * (__A, __B, -__C) with mask __U and _MM_FROUND_CUR_DIRECTION as the
    * rounding argument. */
   3076 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3077 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
   3078 {
   3079   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
   3080                                                       (__v16sf) __B,
   3081                                                       -(__v16sf) __C,
   3082                                                       (__mmask16) __U,
   3083                                                       _MM_FROUND_CUR_DIRECTION);
   3084 }
   3085 
   /* maskz fmsubadd: calls __builtin_ia32_vfmaddsubps512_maskz on
    * (__A, __B, -__C) with mask __U and _MM_FROUND_CUR_DIRECTION as the
    * rounding argument. */
   3086 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3087 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
   3088 {
   3089   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
   3090                                                        (__v16sf) __B,
   3091                                                        -(__v16sf) __C,
   3092                                                        (__mmask16) __U,
   3093                                                        _MM_FROUND_CUR_DIRECTION);
   3094 }
   3095 
   /* Expands to the dedicated __builtin_ia32_vfmsubpd512_mask3 builtin on A, B
    * and C (8 x double, all un-negated), with U as the __mmask8 mask and R as
    * the int rounding argument.
    * NOTE(review): unlike the other fmsub forms, C is not negated here --
    * presumably because the mask3 builtin also uses C as the pass-through
    * operand; confirm against the builtin's definition. */
   3096 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
   3097   ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
   3098                                              (__v8df)(__m512d)(B), \
   3099                                              (__v8df)(__m512d)(C), \
   3100                                              (__mmask8)(U), (int)(R)))
   3101 
   3102 
   /* Calls __builtin_ia32_vfmsubpd512_mask3 on __A, __B, __C (as 8 x double,
    * all un-negated) with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3103 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3104 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
   3105 {
   3106   return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
   3107                                                     (__v8df) __B,
   3108                                                     (__v8df) __C,
   3109                                                     (__mmask8) __U,
   3110                                                     _MM_FROUND_CUR_DIRECTION);
   3111 }
   3112 
   /* Expands to the dedicated __builtin_ia32_vfmsubps512_mask3 builtin on A, B
    * and C (16 x float, all un-negated), with U as the __mmask16 mask and R as
    * the int rounding argument. */
   3113 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
   3114   ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
   3115                                             (__v16sf)(__m512)(B), \
   3116                                             (__v16sf)(__m512)(C), \
   3117                                             (__mmask16)(U), (int)(R)))
   3118 
   /* Calls __builtin_ia32_vfmsubps512_mask3 on __A, __B, __C (as 16 x float,
    * all un-negated) with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3119 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3120 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
   3121 {
   3122   return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
   3123                                                    (__v16sf) __B,
   3124                                                    (__v16sf) __C,
   3125                                                    (__mmask16) __U,
   3126                                                    _MM_FROUND_CUR_DIRECTION);
   3127 }
   3128 
   /* Expands to the dedicated __builtin_ia32_vfmsubaddpd512_mask3 builtin on A,
    * B and C (8 x double, all un-negated), with U as the __mmask8 mask and R as
    * the int rounding argument.
    * NOTE(review): the other fmsubadd forms reuse the fmaddsub builtin with -C;
    * presumably that is not possible here because the mask3 builtin also uses
    * C as the pass-through operand -- confirm. */
   3129 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
   3130   ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
   3131                                                 (__v8df)(__m512d)(B), \
   3132                                                 (__v8df)(__m512d)(C), \
   3133                                                 (__mmask8)(U), (int)(R)))
   3134 
   3135 
   /* Calls __builtin_ia32_vfmsubaddpd512_mask3 on __A, __B, __C (as 8 x double,
    * all un-negated) with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3136 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3137 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
   3138 {
   3139   return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
   3140                                                        (__v8df) __B,
   3141                                                        (__v8df) __C,
   3142                                                        (__mmask8) __U,
   3143                                                        _MM_FROUND_CUR_DIRECTION);
   3144 }
   3145 
   /* Expands to the dedicated __builtin_ia32_vfmsubaddps512_mask3 builtin on A,
    * B and C (16 x float, all un-negated), with U as the __mmask16 mask and R
    * as the int rounding argument. */
   3146 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
   3147   ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
   3148                                                (__v16sf)(__m512)(B), \
   3149                                                (__v16sf)(__m512)(C), \
   3150                                                (__mmask16)(U), (int)(R)))
   3151 
   3152 
   /* Calls __builtin_ia32_vfmsubaddps512_mask3 on __A, __B, __C (as 16 x float,
    * all un-negated) with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3153 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3154 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
   3155 {
   3156   return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
   3157                                                       (__v16sf) __B,
   3158                                                       (__v16sf) __C,
   3159                                                       (__mmask16) __U,
   3160                                                       _MM_FROUND_CUR_DIRECTION);
   3161 }
   3162 
   /* Masked fnmadd: calls __builtin_ia32_vfmaddpd512_mask on (A, -B, C) as
    * 8 x double, with U as the __mmask8 mask and R as the int rounding argument.
    * NOTE(review): B rather than A carries the negation -- presumably because
    * the _mask builtin also uses its first operand as the pass-through for
    * unselected lanes; confirm. */
   3163 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
   3164   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
   3165                                             -(__v8df)(__m512d)(B), \
   3166                                             (__v8df)(__m512d)(C), \
   3167                                             (__mmask8)(U), (int)(R)))
   3168 
   3169 
   /* Masked fnmadd: calls __builtin_ia32_vfmaddpd512_mask on (__A, -__B, __C)
    * as 8 x double, with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3170 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3171 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
   3172 {
   3173   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   3174                                                     -(__v8df) __B,
   3175                                                     (__v8df) __C,
   3176                                                     (__mmask8) __U,
   3177                                                     _MM_FROUND_CUR_DIRECTION);
   3178 }
   3179 
   /* Masked fnmadd: calls __builtin_ia32_vfmaddps512_mask on (A, -B, C) as
    * 16 x float, with U as the __mmask16 mask and R as the int rounding
    * argument. A is left un-negated; only B carries the sign flip. */
   3180 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
   3181   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   3182                                            -(__v16sf)(__m512)(B), \
   3183                                            (__v16sf)(__m512)(C), \
   3184                                            (__mmask16)(U), (int)(R)))
   3185 
   3186 
   /* Masked fnmadd: calls __builtin_ia32_vfmaddps512_mask on (__A, -__B, __C)
    * as 16 x float, with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3187 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3188 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
   3189 {
   3190   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   3191                                                    -(__v16sf) __B,
   3192                                                    (__v16sf) __C,
   3193                                                    (__mmask16) __U,
   3194                                                    _MM_FROUND_CUR_DIRECTION);
   3195 }
   3196 
   /* Masked fnmsub: calls __builtin_ia32_vfmaddpd512_mask on (A, -B, -C) as
    * 8 x double, with U as the __mmask8 mask and R as the int rounding
    * argument. A is left un-negated. */
   3197 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
   3198   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
   3199                                             -(__v8df)(__m512d)(B), \
   3200                                             -(__v8df)(__m512d)(C), \
   3201                                             (__mmask8)(U), (int)(R)))
   3202 
   3203 
   /* mask3 fnmsub: calls the dedicated __builtin_ia32_vfmsubpd512_mask3 builtin
    * on (-A, B, C) as 8 x double; only A is negated and C is passed through
    * as-is. U is the __mmask8 mask and R the int rounding argument. */
   3204 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
   3205   ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
   3206                                              (__v8df)(__m512d)(B), \
   3207                                              (__v8df)(__m512d)(C), \
   3208                                              (__mmask8)(U), (int)(R)))
   3209 
   3210 
   /* Masked fnmsub: calls __builtin_ia32_vfmaddpd512_mask on (__A, -__B, -__C)
    * as 8 x double, with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3211 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3212 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
   3213 {
   3214   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
   3215                                                     -(__v8df) __B,
   3216                                                     -(__v8df) __C,
   3217                                                     (__mmask8) __U,
   3218                                                     _MM_FROUND_CUR_DIRECTION);
   3219 }
   3220 
   /* mask3 fnmsub: calls __builtin_ia32_vfmsubpd512_mask3 on (-__A, __B, __C)
    * as 8 x double, with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. Only __A is negated. */
   3221 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3222 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
   3223 {
   3224   return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
   3225                                                      (__v8df) __B,
   3226                                                      (__v8df) __C,
   3227                                                      (__mmask8) __U,
   3228                                                      _MM_FROUND_CUR_DIRECTION);
   3229 }
   3230 
   /* Masked fnmsub: calls __builtin_ia32_vfmaddps512_mask on (A, -B, -C) as
    * 16 x float, with U as the __mmask16 mask and R as the int rounding
    * argument. A is left un-negated. */
   3231 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
   3232   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
   3233                                            -(__v16sf)(__m512)(B), \
   3234                                            -(__v16sf)(__m512)(C), \
   3235                                            (__mmask16)(U), (int)(R)))
   3236 
   3237 
   /* mask3 fnmsub: calls the dedicated __builtin_ia32_vfmsubps512_mask3 builtin
    * on (-A, B, C) as 16 x float; only A is negated and C is passed through
    * as-is. U is the __mmask16 mask and R the int rounding argument. */
   3238 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
   3239   ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
   3240                                             (__v16sf)(__m512)(B), \
   3241                                             (__v16sf)(__m512)(C), \
   3242                                             (__mmask16)(U), (int)(R)))
   3243 
   3244 
   /* Masked fnmsub: calls __builtin_ia32_vfmaddps512_mask on (__A, -__B, -__C)
    * as 16 x float, with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. */
   3245 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3246 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
   3247 {
   3248   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
   3249                                                    -(__v16sf) __B,
   3250                                                    -(__v16sf) __C,
   3251                                                    (__mmask16) __U,
   3252                                                    _MM_FROUND_CUR_DIRECTION);
   3253 }
   3254 
   /* mask3 fnmsub: calls __builtin_ia32_vfmsubps512_mask3 on (-__A, __B, __C)
    * as 16 x float, with mask __U and _MM_FROUND_CUR_DIRECTION as the rounding
    * argument. Only __A is negated. */
   3255 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3256 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
   3257 {
   3258   return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
   3259                                                     (__v16sf) __B,
   3260                                                     (__v16sf) __C,
   3261                                                     (__mmask16) __U,
   3262                                                     _MM_FROUND_CUR_DIRECTION);
   3263 }
   3264 
   3265 
   3266 
   3267 /* Vector permutations */
   3268 
   /* Calls __builtin_ia32_vpermi2vard512 on __A, __I and __B, each
    * reinterpreted as a 16 x int vector, and returns the result as __m512i.
    * NOTE(review): presumably __I holds the per-lane selectors that pick
    * elements from __A and __B -- confirm against the instruction reference. */
   3269 static __inline __m512i __DEFAULT_FN_ATTRS512
   3270 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
   3271 {
   3272   return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
   3273                                                 (__v16si) __B);
   3274 }
   3275 
   3276 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3277 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
   3278                                __m512i __B)
   3279 {
   3280   return (__m512i)__builtin_ia32_selectd_512(__U,
   3281                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
   3282                               (__v16si)__A);
   3283 }
   3284 
   3285 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3286 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
   3287                                 __m512i __B)
   3288 {
   3289   return (__m512i)__builtin_ia32_selectd_512(__U,
   3290                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
   3291                               (__v16si)__I);
   3292 }
   3293 
   3294 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3295 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
   3296                                 __m512i __B)
   3297 {
   3298   return (__m512i)__builtin_ia32_selectd_512(__U,
   3299                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
   3300                               (__v16si)_mm512_setzero_si512());
   3301 }
   3302 
   3303 static __inline __m512i __DEFAULT_FN_ATTRS512
   3304 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
   3305 {
   3306   return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
   3307                                                 (__v8di) __B);
   3308 }
   3309 
   3310 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3311 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
   3312                                __m512i __B)
   3313 {
   3314   return (__m512i)__builtin_ia32_selectq_512(__U,
   3315                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
   3316                                (__v8di)__A);
   3317 }
   3318 
   3319 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3320 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
   3321                                 __m512i __B)
   3322 {
   3323   return (__m512i)__builtin_ia32_selectq_512(__U,
   3324                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
   3325                                (__v8di)__I);
   3326 }
   3327 
   3328 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3329 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
   3330                                 __m512i __B)
   3331 {
   3332   return (__m512i)__builtin_ia32_selectq_512(__U,
   3333                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
   3334                                (__v8di)_mm512_setzero_si512());
   3335 }
   3336 
   3337 #define _mm512_alignr_epi64(A, B, I) \
   3338   ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
   3339                                      (__v8di)(__m512i)(B), (int)(I)))
   3340 
   3341 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
   3342   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   3343                                   (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
   3344                                   (__v8di)(__m512i)(W)))
   3345 
   3346 #define _mm512_maskz_alignr_epi64(U, A, B, imm) \
   3347   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   3348                                   (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
   3349                                   (__v8di)_mm512_setzero_si512()))
   3350 
   3351 #define _mm512_alignr_epi32(A, B, I) \
   3352   ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
   3353                                      (__v16si)(__m512i)(B), (int)(I)))
   3354 
   3355 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
   3356   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   3357                                  (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
   3358                                  (__v16si)(__m512i)(W)))
   3359 
   3360 #define _mm512_maskz_alignr_epi32(U, A, B, imm) \
   3361   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   3362                                  (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
   3363                                  (__v16si)_mm512_setzero_si512()))
   3364 /* Vector Extract */
   3365 
   3366 #define _mm512_extractf64x4_pd(A, I) \
   3367   ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
   3368                                              (__v4df)_mm256_undefined_pd(), \
   3369                                              (__mmask8)-1))
   3370 
   3371 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
   3372   ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
   3373                                              (__v4df)(__m256d)(W), \
   3374                                              (__mmask8)(U)))
   3375 
   3376 #define _mm512_maskz_extractf64x4_pd(U, A, imm) \
   3377   ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
   3378                                              (__v4df)_mm256_setzero_pd(), \
   3379                                              (__mmask8)(U)))
   3380 
   3381 #define _mm512_extractf32x4_ps(A, I) \
   3382   ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
   3383                                             (__v4sf)_mm_undefined_ps(), \
   3384                                             (__mmask8)-1))
   3385 
   3386 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
   3387   ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
   3388                                             (__v4sf)(__m128)(W), \
   3389                                             (__mmask8)(U)))
   3390 
   3391 #define _mm512_maskz_extractf32x4_ps(U, A, imm) \
   3392   ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
   3393                                             (__v4sf)_mm_setzero_ps(), \
   3394                                             (__mmask8)(U)))
   3395 
   3396 /* Vector Blend */
   3397 
   3398 static __inline __m512d __DEFAULT_FN_ATTRS512
   3399 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
   3400 {
   3401   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
   3402                  (__v8df) __W,
   3403                  (__v8df) __A);
   3404 }
   3405 
   3406 static __inline __m512 __DEFAULT_FN_ATTRS512
   3407 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
   3408 {
   3409   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
   3410                 (__v16sf) __W,
   3411                 (__v16sf) __A);
   3412 }
   3413 
   3414 static __inline __m512i __DEFAULT_FN_ATTRS512
   3415 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
   3416 {
   3417   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
   3418                 (__v8di) __W,
   3419                 (__v8di) __A);
   3420 }
   3421 
   3422 static __inline __m512i __DEFAULT_FN_ATTRS512
   3423 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
   3424 {
   3425   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
   3426                 (__v16si) __W,
   3427                 (__v16si) __A);
   3428 }
   3429 
   3430 /* Compare */
   3431 
   3432 #define _mm512_cmp_round_ps_mask(A, B, P, R) \
   3433   ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
   3434                                            (__v16sf)(__m512)(B), (int)(P), \
   3435                                            (__mmask16)-1, (int)(R)))
   3436 
   3437 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
   3438   ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
   3439                                            (__v16sf)(__m512)(B), (int)(P), \
   3440                                            (__mmask16)(U), (int)(R)))
   3441 
   3442 #define _mm512_cmp_ps_mask(A, B, P) \
   3443   _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
   3444 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \
   3445   _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
   3446 
   3447 #define _mm512_cmpeq_ps_mask(A, B) \
   3448     _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
   3449 #define _mm512_mask_cmpeq_ps_mask(k, A, B) \
   3450     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
   3451 
   3452 #define _mm512_cmplt_ps_mask(A, B) \
   3453     _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
   3454 #define _mm512_mask_cmplt_ps_mask(k, A, B) \
   3455     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
   3456 
   3457 #define _mm512_cmple_ps_mask(A, B) \
   3458     _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
   3459 #define _mm512_mask_cmple_ps_mask(k, A, B) \
   3460     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
   3461 
   3462 #define _mm512_cmpunord_ps_mask(A, B) \
   3463     _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
   3464 #define _mm512_mask_cmpunord_ps_mask(k, A, B) \
   3465     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
   3466 
   3467 #define _mm512_cmpneq_ps_mask(A, B) \
   3468     _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
   3469 #define _mm512_mask_cmpneq_ps_mask(k, A, B) \
   3470     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
   3471 
   3472 #define _mm512_cmpnlt_ps_mask(A, B) \
   3473     _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
   3474 #define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
   3475     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
   3476 
   3477 #define _mm512_cmpnle_ps_mask(A, B) \
   3478     _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
   3479 #define _mm512_mask_cmpnle_ps_mask(k, A, B) \
   3480     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
   3481 
   3482 #define _mm512_cmpord_ps_mask(A, B) \
   3483     _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
   3484 #define _mm512_mask_cmpord_ps_mask(k, A, B) \
   3485     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
   3486 
   3487 #define _mm512_cmp_round_pd_mask(A, B, P, R) \
   3488   ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
   3489                                           (__v8df)(__m512d)(B), (int)(P), \
   3490                                           (__mmask8)-1, (int)(R)))
   3491 
   3492 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
   3493   ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
   3494                                           (__v8df)(__m512d)(B), (int)(P), \
   3495                                           (__mmask8)(U), (int)(R)))
   3496 
   3497 #define _mm512_cmp_pd_mask(A, B, P) \
   3498   _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
   3499 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \
   3500   _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
   3501 
   3502 #define _mm512_cmpeq_pd_mask(A, B) \
   3503     _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
   3504 #define _mm512_mask_cmpeq_pd_mask(k, A, B) \
   3505     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
   3506 
   3507 #define _mm512_cmplt_pd_mask(A, B) \
   3508     _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
   3509 #define _mm512_mask_cmplt_pd_mask(k, A, B) \
   3510     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
   3511 
   3512 #define _mm512_cmple_pd_mask(A, B) \
   3513     _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
   3514 #define _mm512_mask_cmple_pd_mask(k, A, B) \
   3515     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
   3516 
   3517 #define _mm512_cmpunord_pd_mask(A, B) \
   3518     _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
   3519 #define _mm512_mask_cmpunord_pd_mask(k, A, B) \
   3520     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
   3521 
   3522 #define _mm512_cmpneq_pd_mask(A, B) \
   3523     _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
   3524 #define _mm512_mask_cmpneq_pd_mask(k, A, B) \
   3525     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
   3526 
   3527 #define _mm512_cmpnlt_pd_mask(A, B) \
   3528     _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
   3529 #define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
   3530     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
   3531 
   3532 #define _mm512_cmpnle_pd_mask(A, B) \
   3533     _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
   3534 #define _mm512_mask_cmpnle_pd_mask(k, A, B) \
   3535     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
   3536 
   3537 #define _mm512_cmpord_pd_mask(A, B) \
   3538     _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
   3539 #define _mm512_mask_cmpord_pd_mask(k, A, B) \
   3540     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
   3541 
   3542 /* Conversion */
   3543 
   3544 #define _mm512_cvtt_roundps_epu32(A, R) \
   3545   ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
   3546                                               (__v16si)_mm512_undefined_epi32(), \
   3547                                               (__mmask16)-1, (int)(R)))
   3548 
   3549 #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
   3550   ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
   3551                                               (__v16si)(__m512i)(W), \
   3552                                               (__mmask16)(U), (int)(R)))
   3553 
   3554 #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
   3555   ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
   3556                                               (__v16si)_mm512_setzero_si512(), \
   3557                                               (__mmask16)(U), (int)(R)))
   3558 
   3559 
   3560 static __inline __m512i __DEFAULT_FN_ATTRS512
   3561 _mm512_cvttps_epu32(__m512 __A)
   3562 {
   3563   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
   3564                   (__v16si)
   3565                   _mm512_setzero_si512 (),
   3566                   (__mmask16) -1,
   3567                   _MM_FROUND_CUR_DIRECTION);
   3568 }
   3569 
   3570 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3571 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
   3572 {
   3573   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
   3574                    (__v16si) __W,
   3575                    (__mmask16) __U,
   3576                    _MM_FROUND_CUR_DIRECTION);
   3577 }
   3578 
   3579 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3580 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
   3581 {
   3582   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
   3583                    (__v16si) _mm512_setzero_si512 (),
   3584                    (__mmask16) __U,
   3585                    _MM_FROUND_CUR_DIRECTION);
   3586 }
   3587 
   3588 #define _mm512_cvt_roundepi32_ps(A, R) \
   3589   ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
   3590                                            (__v16sf)_mm512_setzero_ps(), \
   3591                                            (__mmask16)-1, (int)(R)))
   3592 
   3593 #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
   3594   ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
   3595                                            (__v16sf)(__m512)(W), \
   3596                                            (__mmask16)(U), (int)(R)))
   3597 
   3598 #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
   3599   ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
   3600                                            (__v16sf)_mm512_setzero_ps(), \
   3601                                            (__mmask16)(U), (int)(R)))
   3602 
   3603 #define _mm512_cvt_roundepu32_ps(A, R) \
   3604   ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
   3605                                             (__v16sf)_mm512_setzero_ps(), \
   3606                                             (__mmask16)-1, (int)(R)))
   3607 
   3608 #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
   3609   ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
   3610                                             (__v16sf)(__m512)(W), \
   3611                                             (__mmask16)(U), (int)(R)))
   3612 
   3613 #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
   3614   ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
   3615                                             (__v16sf)_mm512_setzero_ps(), \
   3616                                             (__mmask16)(U), (int)(R)))
   3617 
   3618 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3619 _mm512_cvtepu32_ps (__m512i __A)
   3620 {
   3621   return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
   3622 }
   3623 
   3624 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3625 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
   3626 {
   3627   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   3628                                              (__v16sf)_mm512_cvtepu32_ps(__A),
   3629                                              (__v16sf)__W);
   3630 }
   3631 
   3632 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3633 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
   3634 {
   3635   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   3636                                              (__v16sf)_mm512_cvtepu32_ps(__A),
   3637                                              (__v16sf)_mm512_setzero_ps());
   3638 }
   3639 
   3640 static __inline __m512d __DEFAULT_FN_ATTRS512
   3641 _mm512_cvtepi32_pd(__m256i __A)
   3642 {
   3643   return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
   3644 }
   3645 
   3646 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3647 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
   3648 {
   3649   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   3650                                               (__v8df)_mm512_cvtepi32_pd(__A),
   3651                                               (__v8df)__W);
   3652 }
   3653 
   3654 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3655 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
   3656 {
   3657   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   3658                                               (__v8df)_mm512_cvtepi32_pd(__A),
   3659                                               (__v8df)_mm512_setzero_pd());
   3660 }
   3661 
   3662 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3663 _mm512_cvtepi32lo_pd(__m512i __A)
   3664 {
   3665   return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
   3666 }
   3667 
   3668 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3669 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
   3670 {
   3671   return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
   3672 }
   3673 
   3674 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3675 _mm512_cvtepi32_ps (__m512i __A)
   3676 {
   3677   return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
   3678 }
   3679 
   3680 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3681 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
   3682 {
   3683   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   3684                                              (__v16sf)_mm512_cvtepi32_ps(__A),
   3685                                              (__v16sf)__W);
   3686 }
   3687 
   3688 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3689 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
   3690 {
   3691   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   3692                                              (__v16sf)_mm512_cvtepi32_ps(__A),
   3693                                              (__v16sf)_mm512_setzero_ps());
   3694 }
   3695 
   3696 static __inline __m512d __DEFAULT_FN_ATTRS512
   3697 _mm512_cvtepu32_pd(__m256i __A)
   3698 {
   3699   return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
   3700 }
   3701 
   3702 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3703 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
   3704 {
   3705   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   3706                                               (__v8df)_mm512_cvtepu32_pd(__A),
   3707                                               (__v8df)__W);
   3708 }
   3709 
   3710 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3711 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
   3712 {
   3713   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   3714                                               (__v8df)_mm512_cvtepu32_pd(__A),
   3715                                               (__v8df)_mm512_setzero_pd());
   3716 }
   3717 
   3718 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3719 _mm512_cvtepu32lo_pd(__m512i __A)
   3720 {
   3721   return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
   3722 }
   3723 
   3724 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   3725 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
   3726 {
   3727   return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
   3728 }
   3729 
   3730 #define _mm512_cvt_roundpd_ps(A, R) \
   3731   ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
   3732                                            (__v8sf)_mm256_setzero_ps(), \
   3733                                            (__mmask8)-1, (int)(R)))
   3734 
   3735 #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
   3736   ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
   3737                                            (__v8sf)(__m256)(W), (__mmask8)(U), \
   3738                                            (int)(R)))
   3739 
   3740 #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
   3741   ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
   3742                                            (__v8sf)_mm256_setzero_ps(), \
   3743                                            (__mmask8)(U), (int)(R)))
   3744 
   3745 static __inline__ __m256 __DEFAULT_FN_ATTRS512
   3746 _mm512_cvtpd_ps (__m512d __A)
   3747 {
   3748   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
   3749                 (__v8sf) _mm256_undefined_ps (),
   3750                 (__mmask8) -1,
   3751                 _MM_FROUND_CUR_DIRECTION);
   3752 }
   3753 
   3754 static __inline__ __m256 __DEFAULT_FN_ATTRS512
   3755 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
   3756 {
   3757   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
   3758                 (__v8sf) __W,
   3759                 (__mmask8) __U,
   3760                 _MM_FROUND_CUR_DIRECTION);
   3761 }
   3762 
   3763 static __inline__ __m256 __DEFAULT_FN_ATTRS512
   3764 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
   3765 {
   3766   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
   3767                 (__v8sf) _mm256_setzero_ps (),
   3768                 (__mmask8) __U,
   3769                 _MM_FROUND_CUR_DIRECTION);
   3770 }
   3771 
   3772 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3773 _mm512_cvtpd_pslo (__m512d __A)
   3774 {
   3775   return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
   3776                 (__v8sf) _mm256_setzero_ps (),
   3777                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   3778 }
   3779 
   3780 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3781 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
   3782 {
   3783   return (__m512) __builtin_shufflevector (
   3784                 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
   3785                                                __U, __A),
   3786                 (__v8sf) _mm256_setzero_ps (),
   3787                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   3788 }
   3789 
   3790 #define _mm512_cvt_roundps_ph(A, I) \
   3791   ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
   3792                                              (__v16hi)_mm256_undefined_si256(), \
   3793                                              (__mmask16)-1))
   3794 
   3795 #define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
   3796   ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
   3797                                              (__v16hi)(__m256i)(U), \
   3798                                              (__mmask16)(W)))
   3799 
   3800 #define _mm512_maskz_cvt_roundps_ph(W, A, I) \
   3801   ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
   3802                                              (__v16hi)_mm256_setzero_si256(), \
   3803                                              (__mmask16)(W)))
   3804 
   3805 #define _mm512_cvtps_ph       _mm512_cvt_roundps_ph
   3806 #define _mm512_mask_cvtps_ph  _mm512_mask_cvt_roundps_ph
   3807 #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
   3808 
   3809 #define _mm512_cvt_roundph_ps(A, R) \
   3810   ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
   3811                                             (__v16sf)_mm512_undefined_ps(), \
   3812                                             (__mmask16)-1, (int)(R)))
   3813 
   3814 #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
   3815   ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
   3816                                             (__v16sf)(__m512)(W), \
   3817                                             (__mmask16)(U), (int)(R)))
   3818 
   3819 #define _mm512_maskz_cvt_roundph_ps(U, A, R) \
   3820   ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
   3821                                             (__v16sf)_mm512_setzero_ps(), \
   3822                                             (__mmask16)(U), (int)(R)))
   3823 
   3824 
   3825 static  __inline __m512 __DEFAULT_FN_ATTRS512
   3826 _mm512_cvtph_ps(__m256i __A)
   3827 {
   3828   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
   3829                 (__v16sf)
   3830                 _mm512_setzero_ps (),
   3831                 (__mmask16) -1,
   3832                 _MM_FROUND_CUR_DIRECTION);
   3833 }
   3834 
   3835 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3836 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
   3837 {
   3838   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
   3839                  (__v16sf) __W,
   3840                  (__mmask16) __U,
   3841                  _MM_FROUND_CUR_DIRECTION);
   3842 }
   3843 
   3844 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   3845 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
   3846 {
   3847   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
   3848                  (__v16sf) _mm512_setzero_ps (),
   3849                  (__mmask16) __U,
   3850                  _MM_FROUND_CUR_DIRECTION);
   3851 }
   3852 
   3853 #define _mm512_cvtt_roundpd_epi32(A, R) \
   3854   ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
   3855                                              (__v8si)_mm256_setzero_si256(), \
   3856                                              (__mmask8)-1, (int)(R)))
   3857 
   3858 #define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
   3859   ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
   3860                                              (__v8si)(__m256i)(W), \
   3861                                              (__mmask8)(U), (int)(R)))
   3862 
   3863 #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
   3864   ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
   3865                                              (__v8si)_mm256_setzero_si256(), \
   3866                                              (__mmask8)(U), (int)(R)))
   3867 
   3868 static __inline __m256i __DEFAULT_FN_ATTRS512
   3869 _mm512_cvttpd_epi32(__m512d __a)
   3870 {
   3871   return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
   3872                                                    (__v8si)_mm256_setzero_si256(),
   3873                                                    (__mmask8) -1,
   3874                                                     _MM_FROUND_CUR_DIRECTION);
   3875 }
   3876 
   3877 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   3878 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
   3879 {
   3880   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
   3881                   (__v8si) __W,
   3882                   (__mmask8) __U,
   3883                   _MM_FROUND_CUR_DIRECTION);
   3884 }
   3885 
   3886 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   3887 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
   3888 {
   3889   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
   3890                   (__v8si) _mm256_setzero_si256 (),
   3891                   (__mmask8) __U,
   3892                   _MM_FROUND_CUR_DIRECTION);
   3893 }
   3894 
   3895 #define _mm512_cvtt_roundps_epi32(A, R) \
   3896   ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
   3897                                              (__v16si)_mm512_setzero_si512(), \
   3898                                              (__mmask16)-1, (int)(R)))
   3899 
   3900 #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
   3901   ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
   3902                                              (__v16si)(__m512i)(W), \
   3903                                              (__mmask16)(U), (int)(R)))
   3904 
   3905 #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
   3906   ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
   3907                                              (__v16si)_mm512_setzero_si512(), \
   3908                                              (__mmask16)(U), (int)(R)))
   3909 
   3910 static __inline __m512i __DEFAULT_FN_ATTRS512
   3911 _mm512_cvttps_epi32(__m512 __a)
   3912 {
   3913   return (__m512i)
   3914     __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
   3915                                      (__v16si) _mm512_setzero_si512 (),
   3916                                      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
   3917 }
   3918 
   3919 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3920 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
   3921 {
   3922   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
   3923                   (__v16si) __W,
   3924                   (__mmask16) __U,
   3925                   _MM_FROUND_CUR_DIRECTION);
   3926 }
   3927 
   3928 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3929 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
   3930 {
   3931   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
   3932                   (__v16si) _mm512_setzero_si512 (),
   3933                   (__mmask16) __U,
   3934                   _MM_FROUND_CUR_DIRECTION);
   3935 }
   3936 
   3937 #define _mm512_cvt_roundps_epi32(A, R) \
   3938   ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
   3939                                             (__v16si)_mm512_setzero_si512(), \
   3940                                             (__mmask16)-1, (int)(R)))
   3941 
   3942 #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
   3943   ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
   3944                                             (__v16si)(__m512i)(W), \
   3945                                             (__mmask16)(U), (int)(R)))
   3946 
   3947 #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
   3948   ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
   3949                                             (__v16si)_mm512_setzero_si512(), \
   3950                                             (__mmask16)(U), (int)(R)))
   3951 
   3952 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3953 _mm512_cvtps_epi32 (__m512 __A)
   3954 {
   3955   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
   3956                  (__v16si) _mm512_undefined_epi32 (),
   3957                  (__mmask16) -1,
   3958                  _MM_FROUND_CUR_DIRECTION);
   3959 }
   3960 
   3961 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3962 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
   3963 {
   3964   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
   3965                  (__v16si) __W,
   3966                  (__mmask16) __U,
   3967                  _MM_FROUND_CUR_DIRECTION);
   3968 }
   3969 
   3970 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   3971 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
   3972 {
   3973   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
   3974                  (__v16si)
   3975                  _mm512_setzero_si512 (),
   3976                  (__mmask16) __U,
   3977                  _MM_FROUND_CUR_DIRECTION);
   3978 }
   3979 
   3980 #define _mm512_cvt_roundpd_epi32(A, R) \
   3981   ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
   3982                                             (__v8si)_mm256_setzero_si256(), \
   3983                                             (__mmask8)-1, (int)(R)))
   3984 
   3985 #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
   3986   ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
   3987                                             (__v8si)(__m256i)(W), \
   3988                                             (__mmask8)(U), (int)(R)))
   3989 
   3990 #define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
   3991   ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
   3992                                             (__v8si)_mm256_setzero_si256(), \
   3993                                             (__mmask8)(U), (int)(R)))
   3994 
   3995 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   3996 _mm512_cvtpd_epi32 (__m512d __A)
   3997 {
   3998   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
   3999                  (__v8si)
   4000                  _mm256_undefined_si256 (),
   4001                  (__mmask8) -1,
   4002                  _MM_FROUND_CUR_DIRECTION);
   4003 }
   4004 
   4005 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   4006 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
   4007 {
   4008   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
   4009                  (__v8si) __W,
   4010                  (__mmask8) __U,
   4011                  _MM_FROUND_CUR_DIRECTION);
   4012 }
   4013 
   4014 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   4015 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
   4016 {
   4017   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
   4018                  (__v8si)
   4019                  _mm256_setzero_si256 (),
   4020                  (__mmask8) __U,
   4021                  _MM_FROUND_CUR_DIRECTION);
   4022 }
   4023 
   4024 #define _mm512_cvt_roundps_epu32(A, R) \
   4025   ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
   4026                                              (__v16si)_mm512_setzero_si512(), \
   4027                                              (__mmask16)-1, (int)(R)))
   4028 
   4029 #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
   4030   ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
   4031                                              (__v16si)(__m512i)(W), \
   4032                                              (__mmask16)(U), (int)(R)))
   4033 
   4034 #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
   4035   ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
   4036                                              (__v16si)_mm512_setzero_si512(), \
   4037                                              (__mmask16)(U), (int)(R)))
   4038 
   4039 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4040 _mm512_cvtps_epu32 ( __m512 __A)
   4041 {
   4042   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
   4043                   (__v16si)\
   4044                   _mm512_undefined_epi32 (),
   4045                   (__mmask16) -1,\
   4046                   _MM_FROUND_CUR_DIRECTION);
   4047 }
   4048 
   4049 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4050 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
   4051 {
   4052   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
   4053                   (__v16si) __W,
   4054                   (__mmask16) __U,
   4055                   _MM_FROUND_CUR_DIRECTION);
   4056 }
   4057 
   4058 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4059 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
   4060 {
   4061   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
   4062                   (__v16si)
   4063                   _mm512_setzero_si512 (),
   4064                   (__mmask16) __U ,
   4065                   _MM_FROUND_CUR_DIRECTION);
   4066 }
   4067 
   4068 #define _mm512_cvt_roundpd_epu32(A, R) \
   4069   ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
   4070                                              (__v8si)_mm256_setzero_si256(), \
   4071                                              (__mmask8)-1, (int)(R)))
   4072 
   4073 #define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
   4074   ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
   4075                                              (__v8si)(__m256i)(W), \
   4076                                              (__mmask8)(U), (int)(R)))
   4077 
   4078 #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
   4079   ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
   4080                                              (__v8si)_mm256_setzero_si256(), \
   4081                                              (__mmask8)(U), (int)(R)))
   4082 
   4083 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   4084 _mm512_cvtpd_epu32 (__m512d __A)
   4085 {
   4086   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
   4087                   (__v8si)
   4088                   _mm256_undefined_si256 (),
   4089                   (__mmask8) -1,
   4090                   _MM_FROUND_CUR_DIRECTION);
   4091 }
   4092 
   4093 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   4094 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
   4095 {
   4096   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
   4097                   (__v8si) __W,
   4098                   (__mmask8) __U,
   4099                   _MM_FROUND_CUR_DIRECTION);
   4100 }
   4101 
   4102 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   4103 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
   4104 {
   4105   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
   4106                   (__v8si)
   4107                   _mm256_setzero_si256 (),
   4108                   (__mmask8) __U,
   4109                   _MM_FROUND_CUR_DIRECTION);
   4110 }
   4111 
   4112 static __inline__ double __DEFAULT_FN_ATTRS512
   4113 _mm512_cvtsd_f64(__m512d __a)
   4114 {
   4115   return __a[0];
   4116 }
   4117 
   4118 static __inline__ float __DEFAULT_FN_ATTRS512
   4119 _mm512_cvtss_f32(__m512 __a)
   4120 {
   4121   return __a[0];
   4122 }
   4123 
   4124 /* Unpack and Interleave */
   4125 
   4126 static __inline __m512d __DEFAULT_FN_ATTRS512
   4127 _mm512_unpackhi_pd(__m512d __a, __m512d __b)
   4128 {
   4129   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
   4130                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
   4131 }
   4132 
   4133 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   4134 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
   4135 {
   4136   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   4137                                            (__v8df)_mm512_unpackhi_pd(__A, __B),
   4138                                            (__v8df)__W);
   4139 }
   4140 
   4141 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   4142 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
   4143 {
   4144   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   4145                                            (__v8df)_mm512_unpackhi_pd(__A, __B),
   4146                                            (__v8df)_mm512_setzero_pd());
   4147 }
   4148 
   4149 static __inline __m512d __DEFAULT_FN_ATTRS512
   4150 _mm512_unpacklo_pd(__m512d __a, __m512d __b)
   4151 {
   4152   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
   4153                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
   4154 }
   4155 
   4156 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   4157 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
   4158 {
   4159   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   4160                                            (__v8df)_mm512_unpacklo_pd(__A, __B),
   4161                                            (__v8df)__W);
   4162 }
   4163 
   4164 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   4165 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
   4166 {
   4167   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
   4168                                            (__v8df)_mm512_unpacklo_pd(__A, __B),
   4169                                            (__v8df)_mm512_setzero_pd());
   4170 }
   4171 
   4172 static __inline __m512 __DEFAULT_FN_ATTRS512
   4173 _mm512_unpackhi_ps(__m512 __a, __m512 __b)
   4174 {
   4175   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
   4176                                          2,    18,    3,    19,
   4177                                          2+4,  18+4,  3+4,  19+4,
   4178                                          2+8,  18+8,  3+8,  19+8,
   4179                                          2+12, 18+12, 3+12, 19+12);
   4180 }
   4181 
   4182 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   4183 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
   4184 {
   4185   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
   4186                                           (__v16sf)_mm512_unpackhi_ps(__A, __B),
   4187                                           (__v16sf)__W);
   4188 }
   4189 
   4190 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   4191 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
   4192 {
   4193   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
   4194                                           (__v16sf)_mm512_unpackhi_ps(__A, __B),
   4195                                           (__v16sf)_mm512_setzero_ps());
   4196 }
   4197 
   4198 static __inline __m512 __DEFAULT_FN_ATTRS512
   4199 _mm512_unpacklo_ps(__m512 __a, __m512 __b)
   4200 {
   4201   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
   4202                                          0,    16,    1,    17,
   4203                                          0+4,  16+4,  1+4,  17+4,
   4204                                          0+8,  16+8,  1+8,  17+8,
   4205                                          0+12, 16+12, 1+12, 17+12);
   4206 }
   4207 
   4208 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   4209 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
   4210 {
   4211   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
   4212                                           (__v16sf)_mm512_unpacklo_ps(__A, __B),
   4213                                           (__v16sf)__W);
   4214 }
   4215 
   4216 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   4217 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
   4218 {
   4219   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
   4220                                           (__v16sf)_mm512_unpacklo_ps(__A, __B),
   4221                                           (__v16sf)_mm512_setzero_ps());
   4222 }
   4223 
   4224 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4225 _mm512_unpackhi_epi32(__m512i __A, __m512i __B)
   4226 {
   4227   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
   4228                                           2,    18,    3,    19,
   4229                                           2+4,  18+4,  3+4,  19+4,
   4230                                           2+8,  18+8,  3+8,  19+8,
   4231                                           2+12, 18+12, 3+12, 19+12);
   4232 }
   4233 
   4234 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4235 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
   4236 {
   4237   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
   4238                                        (__v16si)_mm512_unpackhi_epi32(__A, __B),
   4239                                        (__v16si)__W);
   4240 }
   4241 
   4242 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4243 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
   4244 {
   4245   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
   4246                                        (__v16si)_mm512_unpackhi_epi32(__A, __B),
   4247                                        (__v16si)_mm512_setzero_si512());
   4248 }
   4249 
   4250 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4251 _mm512_unpacklo_epi32(__m512i __A, __m512i __B)
   4252 {
   4253   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
   4254                                           0,    16,    1,    17,
   4255                                           0+4,  16+4,  1+4,  17+4,
   4256                                           0+8,  16+8,  1+8,  17+8,
   4257                                           0+12, 16+12, 1+12, 17+12);
   4258 }
   4259 
   4260 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4261 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
   4262 {
   4263   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
   4264                                        (__v16si)_mm512_unpacklo_epi32(__A, __B),
   4265                                        (__v16si)__W);
   4266 }
   4267 
   4268 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4269 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
   4270 {
   4271   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
   4272                                        (__v16si)_mm512_unpacklo_epi32(__A, __B),
   4273                                        (__v16si)_mm512_setzero_si512());
   4274 }
   4275 
   4276 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4277 _mm512_unpackhi_epi64(__m512i __A, __m512i __B)
   4278 {
   4279   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
   4280                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
   4281 }
   4282 
   4283 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4284 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
   4285 {
   4286   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
   4287                                         (__v8di)_mm512_unpackhi_epi64(__A, __B),
   4288                                         (__v8di)__W);
   4289 }
   4290 
   4291 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4292 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
   4293 {
   4294   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
   4295                                         (__v8di)_mm512_unpackhi_epi64(__A, __B),
   4296                                         (__v8di)_mm512_setzero_si512());
   4297 }
   4298 
   4299 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4300 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
   4301 {
   4302   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
   4303                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
   4304 }
   4305 
   4306 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4307 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
   4308 {
   4309   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
   4310                                         (__v8di)_mm512_unpacklo_epi64(__A, __B),
   4311                                         (__v8di)__W);
   4312 }
   4313 
   4314 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4315 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
   4316 {
   4317   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
   4318                                         (__v8di)_mm512_unpacklo_epi64(__A, __B),
   4319                                         (__v8di)_mm512_setzero_si512());
   4320 }
   4321 
   4322 
   4323 /* SIMD load ops */
   4324 
   4325 static __inline __m512i __DEFAULT_FN_ATTRS512
   4326 _mm512_loadu_si512 (void const *__P)
   4327 {
   4328   struct __loadu_si512 {
   4329     __m512i_u __v;
   4330   } __attribute__((__packed__, __may_alias__));
   4331   return ((const struct __loadu_si512*)__P)->__v;
   4332 }
   4333 
   4334 static __inline __m512i __DEFAULT_FN_ATTRS512
   4335 _mm512_loadu_epi32 (void const *__P)
   4336 {
   4337   struct __loadu_epi32 {
   4338     __m512i_u __v;
   4339   } __attribute__((__packed__, __may_alias__));
   4340   return ((const struct __loadu_epi32*)__P)->__v;
   4341 }
   4342 
   4343 static __inline __m512i __DEFAULT_FN_ATTRS512
   4344 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
   4345 {
   4346   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
   4347                   (__v16si) __W,
   4348                   (__mmask16) __U);
   4349 }
   4350 
   4351 
   4352 static __inline __m512i __DEFAULT_FN_ATTRS512
   4353 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
   4354 {
   4355   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
   4356                                                      (__v16si)
   4357                                                      _mm512_setzero_si512 (),
   4358                                                      (__mmask16) __U);
   4359 }
   4360 
   4361 static __inline __m512i __DEFAULT_FN_ATTRS512
   4362 _mm512_loadu_epi64 (void const *__P)
   4363 {
   4364   struct __loadu_epi64 {
   4365     __m512i_u __v;
   4366   } __attribute__((__packed__, __may_alias__));
   4367   return ((const struct __loadu_epi64*)__P)->__v;
   4368 }
   4369 
   4370 static __inline __m512i __DEFAULT_FN_ATTRS512
   4371 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
   4372 {
   4373   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
   4374                   (__v8di) __W,
   4375                   (__mmask8) __U);
   4376 }
   4377 
   4378 static __inline __m512i __DEFAULT_FN_ATTRS512
   4379 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
   4380 {
   4381   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
   4382                                                      (__v8di)
   4383                                                      _mm512_setzero_si512 (),
   4384                                                      (__mmask8) __U);
   4385 }
   4386 
   4387 static __inline __m512 __DEFAULT_FN_ATTRS512
   4388 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
   4389 {
   4390   return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
   4391                    (__v16sf) __W,
   4392                    (__mmask16) __U);
   4393 }
   4394 
   4395 static __inline __m512 __DEFAULT_FN_ATTRS512
   4396 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
   4397 {
   4398   return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
   4399                                                   (__v16sf)
   4400                                                   _mm512_setzero_ps (),
   4401                                                   (__mmask16) __U);
   4402 }
   4403 
   4404 static __inline __m512d __DEFAULT_FN_ATTRS512
   4405 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
   4406 {
   4407   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
   4408                 (__v8df) __W,
   4409                 (__mmask8) __U);
   4410 }
   4411 
   4412 static __inline __m512d __DEFAULT_FN_ATTRS512
   4413 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
   4414 {
   4415   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
   4416                                                    (__v8df)
   4417                                                    _mm512_setzero_pd (),
   4418                                                    (__mmask8) __U);
   4419 }
   4420 
   4421 static __inline __m512d __DEFAULT_FN_ATTRS512
   4422 _mm512_loadu_pd(void const *__p)
   4423 {
   4424   struct __loadu_pd {
   4425     __m512d_u __v;
   4426   } __attribute__((__packed__, __may_alias__));
   4427   return ((const struct __loadu_pd*)__p)->__v;
   4428 }
   4429 
   4430 static __inline __m512 __DEFAULT_FN_ATTRS512
   4431 _mm512_loadu_ps(void const *__p)
   4432 {
   4433   struct __loadu_ps {
   4434     __m512_u __v;
   4435   } __attribute__((__packed__, __may_alias__));
   4436   return ((const struct __loadu_ps*)__p)->__v;
   4437 }
   4438 
   4439 static __inline __m512 __DEFAULT_FN_ATTRS512
   4440 _mm512_load_ps(void const *__p)
   4441 {
   4442   return *(const __m512*)__p;
   4443 }
   4444 
   4445 static __inline __m512 __DEFAULT_FN_ATTRS512
   4446 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
   4447 {
   4448   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
   4449                    (__v16sf) __W,
   4450                    (__mmask16) __U);
   4451 }
   4452 
   4453 static __inline __m512 __DEFAULT_FN_ATTRS512
   4454 _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
   4455 {
   4456   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
   4457                                                   (__v16sf)
   4458                                                   _mm512_setzero_ps (),
   4459                                                   (__mmask16) __U);
   4460 }
   4461 
   4462 static __inline __m512d __DEFAULT_FN_ATTRS512
   4463 _mm512_load_pd(void const *__p)
   4464 {
   4465   return *(const __m512d*)__p;
   4466 }
   4467 
   4468 static __inline __m512d __DEFAULT_FN_ATTRS512
   4469 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
   4470 {
   4471   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
   4472                           (__v8df) __W,
   4473                           (__mmask8) __U);
   4474 }
   4475 
   4476 static __inline __m512d __DEFAULT_FN_ATTRS512
   4477 _mm512_maskz_load_pd(__mmask8 __U, void const *__P)
   4478 {
   4479   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
   4480                                                    (__v8df)
   4481                                                    _mm512_setzero_pd (),
   4482                                                    (__mmask8) __U);
   4483 }
   4484 
   4485 static __inline __m512i __DEFAULT_FN_ATTRS512
   4486 _mm512_load_si512 (void const *__P)
   4487 {
   4488   return *(const __m512i *) __P;
   4489 }
   4490 
   4491 static __inline __m512i __DEFAULT_FN_ATTRS512
   4492 _mm512_load_epi32 (void const *__P)
   4493 {
   4494   return *(const __m512i *) __P;
   4495 }
   4496 
   4497 static __inline __m512i __DEFAULT_FN_ATTRS512
   4498 _mm512_load_epi64 (void const *__P)
   4499 {
   4500   return *(const __m512i *) __P;
   4501 }
   4502 
   4503 /* SIMD store ops */
   4504 
   4505 static __inline void __DEFAULT_FN_ATTRS512
   4506 _mm512_storeu_epi64 (void *__P, __m512i __A)
   4507 {
   4508   struct __storeu_epi64 {
   4509     __m512i_u __v;
   4510   } __attribute__((__packed__, __may_alias__));
   4511   ((struct __storeu_epi64*)__P)->__v = __A;
   4512 }
   4513 
   4514 static __inline void __DEFAULT_FN_ATTRS512
   4515 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
   4516 {
   4517   __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
   4518                                      (__mmask8) __U);
   4519 }
   4520 
   4521 static __inline void __DEFAULT_FN_ATTRS512
   4522 _mm512_storeu_si512 (void *__P, __m512i __A)
   4523 {
   4524   struct __storeu_si512 {
   4525     __m512i_u __v;
   4526   } __attribute__((__packed__, __may_alias__));
   4527   ((struct __storeu_si512*)__P)->__v = __A;
   4528 }
   4529 
   4530 static __inline void __DEFAULT_FN_ATTRS512
   4531 _mm512_storeu_epi32 (void *__P, __m512i __A)
   4532 {
   4533   struct __storeu_epi32 {
   4534     __m512i_u __v;
   4535   } __attribute__((__packed__, __may_alias__));
   4536   ((struct __storeu_epi32*)__P)->__v = __A;
   4537 }
   4538 
   4539 static __inline void __DEFAULT_FN_ATTRS512
   4540 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
   4541 {
   4542   __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
   4543                                      (__mmask16) __U);
   4544 }
   4545 
   4546 static __inline void __DEFAULT_FN_ATTRS512
   4547 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
   4548 {
   4549   __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
   4550 }
   4551 
   4552 static __inline void __DEFAULT_FN_ATTRS512
   4553 _mm512_storeu_pd(void *__P, __m512d __A)
   4554 {
   4555   struct __storeu_pd {
   4556     __m512d_u __v;
   4557   } __attribute__((__packed__, __may_alias__));
   4558   ((struct __storeu_pd*)__P)->__v = __A;
   4559 }
   4560 
   4561 static __inline void __DEFAULT_FN_ATTRS512
   4562 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
   4563 {
   4564   __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
   4565                                    (__mmask16) __U);
   4566 }
   4567 
   4568 static __inline void __DEFAULT_FN_ATTRS512
   4569 _mm512_storeu_ps(void *__P, __m512 __A)
   4570 {
   4571   struct __storeu_ps {
   4572     __m512_u __v;
   4573   } __attribute__((__packed__, __may_alias__));
   4574   ((struct __storeu_ps*)__P)->__v = __A;
   4575 }
   4576 
   4577 static __inline void __DEFAULT_FN_ATTRS512
   4578 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
   4579 {
   4580   __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
   4581 }
   4582 
   4583 static __inline void __DEFAULT_FN_ATTRS512
   4584 _mm512_store_pd(void *__P, __m512d __A)
   4585 {
   4586   *(__m512d*)__P = __A;
   4587 }
   4588 
   4589 static __inline void __DEFAULT_FN_ATTRS512
   4590 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
   4591 {
   4592   __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
   4593                                    (__mmask16) __U);
   4594 }
   4595 
   4596 static __inline void __DEFAULT_FN_ATTRS512
   4597 _mm512_store_ps(void *__P, __m512 __A)
   4598 {
   4599   *(__m512*)__P = __A;
   4600 }
   4601 
   4602 static __inline void __DEFAULT_FN_ATTRS512
   4603 _mm512_store_si512 (void *__P, __m512i __A)
   4604 {
   4605   *(__m512i *) __P = __A;
   4606 }
   4607 
   4608 static __inline void __DEFAULT_FN_ATTRS512
   4609 _mm512_store_epi32 (void *__P, __m512i __A)
   4610 {
   4611   *(__m512i *) __P = __A;
   4612 }
   4613 
   4614 static __inline void __DEFAULT_FN_ATTRS512
   4615 _mm512_store_epi64 (void *__P, __m512i __A)
   4616 {
   4617   *(__m512i *) __P = __A;
   4618 }
   4619 
   4620 /* Mask ops */
   4621 
   4622 static __inline __mmask16 __DEFAULT_FN_ATTRS
   4623 _mm512_knot(__mmask16 __M)
   4624 {
   4625   return __builtin_ia32_knothi(__M);
   4626 }
   4627 
   4628 /* Integer compare */
   4629 
   4630 #define _mm512_cmpeq_epi32_mask(A, B) \
   4631     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
   4632 #define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
   4633     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
   4634 #define _mm512_cmpge_epi32_mask(A, B) \
   4635     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
   4636 #define _mm512_mask_cmpge_epi32_mask(k, A, B) \
   4637     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
   4638 #define _mm512_cmpgt_epi32_mask(A, B) \
   4639     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
   4640 #define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
   4641     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
   4642 #define _mm512_cmple_epi32_mask(A, B) \
   4643     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
   4644 #define _mm512_mask_cmple_epi32_mask(k, A, B) \
   4645     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
   4646 #define _mm512_cmplt_epi32_mask(A, B) \
   4647     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
   4648 #define _mm512_mask_cmplt_epi32_mask(k, A, B) \
   4649     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
   4650 #define _mm512_cmpneq_epi32_mask(A, B) \
   4651     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
   4652 #define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
   4653     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
   4654 
   4655 #define _mm512_cmpeq_epu32_mask(A, B) \
   4656     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
   4657 #define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
   4658     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
   4659 #define _mm512_cmpge_epu32_mask(A, B) \
   4660     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
   4661 #define _mm512_mask_cmpge_epu32_mask(k, A, B) \
   4662     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
   4663 #define _mm512_cmpgt_epu32_mask(A, B) \
   4664     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
   4665 #define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
   4666     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
   4667 #define _mm512_cmple_epu32_mask(A, B) \
   4668     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
   4669 #define _mm512_mask_cmple_epu32_mask(k, A, B) \
   4670     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
   4671 #define _mm512_cmplt_epu32_mask(A, B) \
   4672     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
   4673 #define _mm512_mask_cmplt_epu32_mask(k, A, B) \
   4674     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
   4675 #define _mm512_cmpneq_epu32_mask(A, B) \
   4676     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
   4677 #define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
   4678     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
   4679 
   4680 #define _mm512_cmpeq_epi64_mask(A, B) \
   4681     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
   4682 #define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
   4683     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
   4684 #define _mm512_cmpge_epi64_mask(A, B) \
   4685     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
   4686 #define _mm512_mask_cmpge_epi64_mask(k, A, B) \
   4687     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
   4688 #define _mm512_cmpgt_epi64_mask(A, B) \
   4689     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
   4690 #define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
   4691     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
   4692 #define _mm512_cmple_epi64_mask(A, B) \
   4693     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
   4694 #define _mm512_mask_cmple_epi64_mask(k, A, B) \
   4695     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
   4696 #define _mm512_cmplt_epi64_mask(A, B) \
   4697     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
   4698 #define _mm512_mask_cmplt_epi64_mask(k, A, B) \
   4699     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
   4700 #define _mm512_cmpneq_epi64_mask(A, B) \
   4701     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
   4702 #define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
   4703     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
   4704 
   4705 #define _mm512_cmpeq_epu64_mask(A, B) \
   4706     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
   4707 #define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
   4708     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
   4709 #define _mm512_cmpge_epu64_mask(A, B) \
   4710     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
   4711 #define _mm512_mask_cmpge_epu64_mask(k, A, B) \
   4712     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
   4713 #define _mm512_cmpgt_epu64_mask(A, B) \
   4714     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
   4715 #define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
   4716     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
   4717 #define _mm512_cmple_epu64_mask(A, B) \
   4718     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
   4719 #define _mm512_mask_cmple_epu64_mask(k, A, B) \
   4720     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
   4721 #define _mm512_cmplt_epu64_mask(A, B) \
   4722     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
   4723 #define _mm512_mask_cmplt_epu64_mask(k, A, B) \
   4724     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
   4725 #define _mm512_cmpneq_epu64_mask(A, B) \
   4726     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
   4727 #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
   4728     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
   4729 
   4730 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4731 _mm512_cvtepi8_epi32(__m128i __A)
   4732 {
   4733   /* This function always performs a signed extension, but __v16qi is a char
   4734      which may be signed or unsigned, so use __v16qs. */
   4735   return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
   4736 }
   4737 
   4738 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4739 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
   4740 {
   4741   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4742                                              (__v16si)_mm512_cvtepi8_epi32(__A),
   4743                                              (__v16si)__W);
   4744 }
   4745 
   4746 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4747 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
   4748 {
   4749   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4750                                              (__v16si)_mm512_cvtepi8_epi32(__A),
   4751                                              (__v16si)_mm512_setzero_si512());
   4752 }
   4753 
   4754 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4755 _mm512_cvtepi8_epi64(__m128i __A)
   4756 {
   4757   /* This function always performs a signed extension, but __v16qi is a char
   4758      which may be signed or unsigned, so use __v16qs. */
   4759   return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
   4760 }
   4761 
   4762 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4763 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
   4764 {
   4765   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4766                                              (__v8di)_mm512_cvtepi8_epi64(__A),
   4767                                              (__v8di)__W);
   4768 }
   4769 
   4770 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4771 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
   4772 {
   4773   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4774                                              (__v8di)_mm512_cvtepi8_epi64(__A),
   4775                                              (__v8di)_mm512_setzero_si512 ());
   4776 }
   4777 
   4778 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4779 _mm512_cvtepi32_epi64(__m256i __X)
   4780 {
   4781   return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
   4782 }
   4783 
   4784 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4785 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
   4786 {
   4787   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4788                                              (__v8di)_mm512_cvtepi32_epi64(__X),
   4789                                              (__v8di)__W);
   4790 }
   4791 
   4792 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4793 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
   4794 {
   4795   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4796                                              (__v8di)_mm512_cvtepi32_epi64(__X),
   4797                                              (__v8di)_mm512_setzero_si512());
   4798 }
   4799 
   4800 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4801 _mm512_cvtepi16_epi32(__m256i __A)
   4802 {
   4803   return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
   4804 }
   4805 
   4806 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4807 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
   4808 {
   4809   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4810                                             (__v16si)_mm512_cvtepi16_epi32(__A),
   4811                                             (__v16si)__W);
   4812 }
   4813 
   4814 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4815 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
   4816 {
   4817   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4818                                             (__v16si)_mm512_cvtepi16_epi32(__A),
   4819                                             (__v16si)_mm512_setzero_si512 ());
   4820 }
   4821 
   4822 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4823 _mm512_cvtepi16_epi64(__m128i __A)
   4824 {
   4825   return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
   4826 }
   4827 
   4828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4829 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
   4830 {
   4831   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4832                                              (__v8di)_mm512_cvtepi16_epi64(__A),
   4833                                              (__v8di)__W);
   4834 }
   4835 
   4836 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4837 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
   4838 {
   4839   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4840                                              (__v8di)_mm512_cvtepi16_epi64(__A),
   4841                                              (__v8di)_mm512_setzero_si512());
   4842 }
   4843 
   4844 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4845 _mm512_cvtepu8_epi32(__m128i __A)
   4846 {
   4847   return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
   4848 }
   4849 
   4850 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4851 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
   4852 {
   4853   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4854                                              (__v16si)_mm512_cvtepu8_epi32(__A),
   4855                                              (__v16si)__W);
   4856 }
   4857 
   4858 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4859 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
   4860 {
   4861   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4862                                              (__v16si)_mm512_cvtepu8_epi32(__A),
   4863                                              (__v16si)_mm512_setzero_si512());
   4864 }
   4865 
   4866 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4867 _mm512_cvtepu8_epi64(__m128i __A)
   4868 {
   4869   return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
   4870 }
   4871 
   4872 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4873 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
   4874 {
   4875   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4876                                              (__v8di)_mm512_cvtepu8_epi64(__A),
   4877                                              (__v8di)__W);
   4878 }
   4879 
   4880 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4881 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
   4882 {
   4883   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4884                                              (__v8di)_mm512_cvtepu8_epi64(__A),
   4885                                              (__v8di)_mm512_setzero_si512());
   4886 }
   4887 
   4888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4889 _mm512_cvtepu32_epi64(__m256i __X)
   4890 {
   4891   return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
   4892 }
   4893 
   4894 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4895 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
   4896 {
   4897   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4898                                              (__v8di)_mm512_cvtepu32_epi64(__X),
   4899                                              (__v8di)__W);
   4900 }
   4901 
   4902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4903 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
   4904 {
   4905   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4906                                              (__v8di)_mm512_cvtepu32_epi64(__X),
   4907                                              (__v8di)_mm512_setzero_si512());
   4908 }
   4909 
   4910 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4911 _mm512_cvtepu16_epi32(__m256i __A)
   4912 {
   4913   return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
   4914 }
   4915 
   4916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4917 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
   4918 {
   4919   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4920                                             (__v16si)_mm512_cvtepu16_epi32(__A),
   4921                                             (__v16si)__W);
   4922 }
   4923 
   4924 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4925 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
   4926 {
   4927   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   4928                                             (__v16si)_mm512_cvtepu16_epi32(__A),
   4929                                             (__v16si)_mm512_setzero_si512());
   4930 }
   4931 
   4932 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4933 _mm512_cvtepu16_epi64(__m128i __A)
   4934 {
   4935   return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
   4936 }
   4937 
   4938 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4939 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
   4940 {
   4941   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4942                                              (__v8di)_mm512_cvtepu16_epi64(__A),
   4943                                              (__v8di)__W);
   4944 }
   4945 
   4946 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4947 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
   4948 {
   4949   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   4950                                              (__v8di)_mm512_cvtepu16_epi64(__A),
   4951                                              (__v8di)_mm512_setzero_si512());
   4952 }
   4953 
   4954 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4955 _mm512_rorv_epi32 (__m512i __A, __m512i __B)
   4956 {
   4957   return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
   4958 }
   4959 
   4960 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4961 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
   4962 {
   4963   return (__m512i)__builtin_ia32_selectd_512(__U,
   4964                                            (__v16si)_mm512_rorv_epi32(__A, __B),
   4965                                            (__v16si)__W);
   4966 }
   4967 
   4968 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4969 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
   4970 {
   4971   return (__m512i)__builtin_ia32_selectd_512(__U,
   4972                                            (__v16si)_mm512_rorv_epi32(__A, __B),
   4973                                            (__v16si)_mm512_setzero_si512());
   4974 }
   4975 
   4976 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4977 _mm512_rorv_epi64 (__m512i __A, __m512i __B)
   4978 {
   4979   return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
   4980 }
   4981 
   4982 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4983 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
   4984 {
   4985   return (__m512i)__builtin_ia32_selectq_512(__U,
   4986                                             (__v8di)_mm512_rorv_epi64(__A, __B),
   4987                                             (__v8di)__W);
   4988 }
   4989 
   4990 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   4991 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
   4992 {
   4993   return (__m512i)__builtin_ia32_selectq_512(__U,
   4994                                             (__v8di)_mm512_rorv_epi64(__A, __B),
   4995                                             (__v8di)_mm512_setzero_si512());
   4996 }
   4997 
   4998 
   4999 
   5000 #define _mm512_cmp_epi32_mask(a, b, p) \
   5001   ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
   5002                                           (__v16si)(__m512i)(b), (int)(p), \
   5003                                           (__mmask16)-1))
   5004 
   5005 #define _mm512_cmp_epu32_mask(a, b, p) \
   5006   ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
   5007                                            (__v16si)(__m512i)(b), (int)(p), \
   5008                                            (__mmask16)-1))
   5009 
   5010 #define _mm512_cmp_epi64_mask(a, b, p) \
   5011   ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
   5012                                          (__v8di)(__m512i)(b), (int)(p), \
   5013                                          (__mmask8)-1))
   5014 
   5015 #define _mm512_cmp_epu64_mask(a, b, p) \
   5016   ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
   5017                                           (__v8di)(__m512i)(b), (int)(p), \
   5018                                           (__mmask8)-1))
   5019 
   5020 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
   5021   ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
   5022                                           (__v16si)(__m512i)(b), (int)(p), \
   5023                                           (__mmask16)(m)))
   5024 
   5025 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
   5026   ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
   5027                                            (__v16si)(__m512i)(b), (int)(p), \
   5028                                            (__mmask16)(m)))
   5029 
   5030 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
   5031   ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
   5032                                          (__v8di)(__m512i)(b), (int)(p), \
   5033                                          (__mmask8)(m)))
   5034 
   5035 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
   5036   ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
   5037                                           (__v8di)(__m512i)(b), (int)(p), \
   5038                                           (__mmask8)(m)))
   5039 
   5040 #define _mm512_rol_epi32(a, b) \
   5041   ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
   5042 
   5043 #define _mm512_mask_rol_epi32(W, U, a, b) \
   5044   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   5045                                        (__v16si)_mm512_rol_epi32((a), (b)), \
   5046                                        (__v16si)(__m512i)(W)))
   5047 
   5048 #define _mm512_maskz_rol_epi32(U, a, b) \
   5049   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   5050                                        (__v16si)_mm512_rol_epi32((a), (b)), \
   5051                                        (__v16si)_mm512_setzero_si512()))
   5052 
   5053 #define _mm512_rol_epi64(a, b) \
   5054   ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))
   5055 
   5056 #define _mm512_mask_rol_epi64(W, U, a, b) \
   5057   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   5058                                        (__v8di)_mm512_rol_epi64((a), (b)), \
   5059                                        (__v8di)(__m512i)(W)))
   5060 
   5061 #define _mm512_maskz_rol_epi64(U, a, b) \
   5062   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   5063                                        (__v8di)_mm512_rol_epi64((a), (b)), \
   5064                                        (__v8di)_mm512_setzero_si512()))
   5065 
   5066 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5067 _mm512_rolv_epi32 (__m512i __A, __m512i __B)
   5068 {
   5069   return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
   5070 }
   5071 
   5072 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5073 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
   5074 {
   5075   return (__m512i)__builtin_ia32_selectd_512(__U,
   5076                                            (__v16si)_mm512_rolv_epi32(__A, __B),
   5077                                            (__v16si)__W);
   5078 }
   5079 
   5080 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5081 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
   5082 {
   5083   return (__m512i)__builtin_ia32_selectd_512(__U,
   5084                                            (__v16si)_mm512_rolv_epi32(__A, __B),
   5085                                            (__v16si)_mm512_setzero_si512());
   5086 }
   5087 
   5088 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5089 _mm512_rolv_epi64 (__m512i __A, __m512i __B)
   5090 {
   5091   return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
   5092 }
   5093 
   5094 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5095 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
   5096 {
   5097   return (__m512i)__builtin_ia32_selectq_512(__U,
   5098                                             (__v8di)_mm512_rolv_epi64(__A, __B),
   5099                                             (__v8di)__W);
   5100 }
   5101 
   5102 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5103 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
   5104 {
   5105   return (__m512i)__builtin_ia32_selectq_512(__U,
   5106                                             (__v8di)_mm512_rolv_epi64(__A, __B),
   5107                                             (__v8di)_mm512_setzero_si512());
   5108 }
   5109 
   5110 #define _mm512_ror_epi32(A, B) \
   5111   ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))
   5112 
   5113 #define _mm512_mask_ror_epi32(W, U, A, B) \
   5114   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   5115                                        (__v16si)_mm512_ror_epi32((A), (B)), \
   5116                                        (__v16si)(__m512i)(W)))
   5117 
   5118 #define _mm512_maskz_ror_epi32(U, A, B) \
   5119   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   5120                                        (__v16si)_mm512_ror_epi32((A), (B)), \
   5121                                        (__v16si)_mm512_setzero_si512()))
   5122 
   5123 #define _mm512_ror_epi64(A, B) \
   5124   ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))
   5125 
   5126 #define _mm512_mask_ror_epi64(W, U, A, B) \
   5127   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   5128                                        (__v8di)_mm512_ror_epi64((A), (B)), \
   5129                                        (__v8di)(__m512i)(W)))
   5130 
   5131 #define _mm512_maskz_ror_epi64(U, A, B) \
   5132   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   5133                                        (__v8di)_mm512_ror_epi64((A), (B)), \
   5134                                        (__v8di)_mm512_setzero_si512()))
   5135 
   5136 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5137 _mm512_slli_epi32(__m512i __A, unsigned int __B)
   5138 {
   5139   return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
   5140 }
   5141 
   5142 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5143 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
   5144                        unsigned int __B)
   5145 {
   5146   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5147                                          (__v16si)_mm512_slli_epi32(__A, __B),
   5148                                          (__v16si)__W);
   5149 }
   5150 
   5151 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5152 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
   5153   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5154                                          (__v16si)_mm512_slli_epi32(__A, __B),
   5155                                          (__v16si)_mm512_setzero_si512());
   5156 }
   5157 
   5158 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5159 _mm512_slli_epi64(__m512i __A, unsigned int __B)
   5160 {
   5161   return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
   5162 }
   5163 
   5164 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5165 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
   5166 {
   5167   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5168                                           (__v8di)_mm512_slli_epi64(__A, __B),
   5169                                           (__v8di)__W);
   5170 }
   5171 
   5172 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5173 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
   5174 {
   5175   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5176                                           (__v8di)_mm512_slli_epi64(__A, __B),
   5177                                           (__v8di)_mm512_setzero_si512());
   5178 }
   5179 
   5180 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5181 _mm512_srli_epi32(__m512i __A, unsigned int __B)
   5182 {
   5183   return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
   5184 }
   5185 
   5186 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5187 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
   5188                        unsigned int __B)
   5189 {
   5190   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5191                                          (__v16si)_mm512_srli_epi32(__A, __B),
   5192                                          (__v16si)__W);
   5193 }
   5194 
   5195 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5196 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
   5197   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5198                                          (__v16si)_mm512_srli_epi32(__A, __B),
   5199                                          (__v16si)_mm512_setzero_si512());
   5200 }
   5201 
   5202 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5203 _mm512_srli_epi64(__m512i __A, unsigned int __B)
   5204 {
   5205   return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
   5206 }
   5207 
   5208 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5209 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
   5210                        unsigned int __B)
   5211 {
   5212   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5213                                           (__v8di)_mm512_srli_epi64(__A, __B),
   5214                                           (__v8di)__W);
   5215 }
   5216 
   5217 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5218 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
   5219                         unsigned int __B)
   5220 {
   5221   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5222                                           (__v8di)_mm512_srli_epi64(__A, __B),
   5223                                           (__v8di)_mm512_setzero_si512());
   5224 }
   5225 
   5226 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5227 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
   5228 {
   5229   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
   5230               (__v16si) __W,
   5231               (__mmask16) __U);
   5232 }
   5233 
   5234 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5235 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
   5236 {
   5237   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
   5238               (__v16si)
   5239               _mm512_setzero_si512 (),
   5240               (__mmask16) __U);
   5241 }
   5242 
   5243 static __inline__ void __DEFAULT_FN_ATTRS512
   5244 _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
   5245 {
   5246   __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
   5247           (__mmask16) __U);
   5248 }
   5249 
   5250 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5251 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
   5252 {
   5253   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
   5254                  (__v16si) __A,
   5255                  (__v16si) __W);
   5256 }
   5257 
   5258 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5259 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
   5260 {
   5261   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
   5262                  (__v16si) __A,
   5263                  (__v16si) _mm512_setzero_si512 ());
   5264 }
   5265 
   5266 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5267 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
   5268 {
   5269   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
   5270                  (__v8di) __A,
   5271                  (__v8di) __W);
   5272 }
   5273 
   5274 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5275 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
   5276 {
   5277   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
   5278                  (__v8di) __A,
   5279                  (__v8di) _mm512_setzero_si512 ());
   5280 }
   5281 
   5282 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5283 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
   5284 {
   5285   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
   5286               (__v8di) __W,
   5287               (__mmask8) __U);
   5288 }
   5289 
   5290 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5291 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
   5292 {
   5293   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
   5294               (__v8di)
   5295               _mm512_setzero_si512 (),
   5296               (__mmask8) __U);
   5297 }
   5298 
   5299 static __inline__ void __DEFAULT_FN_ATTRS512
   5300 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
   5301 {
   5302   __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
   5303           (__mmask8) __U);
   5304 }
   5305 
   5306 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   5307 _mm512_movedup_pd (__m512d __A)
   5308 {
   5309   return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
   5310                                           0, 0, 2, 2, 4, 4, 6, 6);
   5311 }
   5312 
   5313 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   5314 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
   5315 {
   5316   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   5317                                               (__v8df)_mm512_movedup_pd(__A),
   5318                                               (__v8df)__W);
   5319 }
   5320 
   5321 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   5322 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
   5323 {
   5324   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   5325                                               (__v8df)_mm512_movedup_pd(__A),
   5326                                               (__v8df)_mm512_setzero_pd());
   5327 }
   5328 
   5329 #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
   5330   ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
   5331                                               (__v8df)(__m512d)(B), \
   5332                                               (__v8di)(__m512i)(C), (int)(imm), \
   5333                                               (__mmask8)-1, (int)(R)))
   5334 
   5335 #define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
   5336   ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
   5337                                               (__v8df)(__m512d)(B), \
   5338                                               (__v8di)(__m512i)(C), (int)(imm), \
   5339                                               (__mmask8)(U), (int)(R)))
   5340 
   5341 #define _mm512_fixupimm_pd(A, B, C, imm) \
   5342   ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
   5343                                               (__v8df)(__m512d)(B), \
   5344                                               (__v8di)(__m512i)(C), (int)(imm), \
   5345                                               (__mmask8)-1, \
   5346                                               _MM_FROUND_CUR_DIRECTION))
   5347 
   5348 #define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
   5349   ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
   5350                                               (__v8df)(__m512d)(B), \
   5351                                               (__v8di)(__m512i)(C), (int)(imm), \
   5352                                               (__mmask8)(U), \
   5353                                               _MM_FROUND_CUR_DIRECTION))
   5354 
   5355 #define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
   5356   ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
   5357                                                (__v8df)(__m512d)(B), \
   5358                                                (__v8di)(__m512i)(C), \
   5359                                                (int)(imm), (__mmask8)(U), \
   5360                                                (int)(R)))
   5361 
   5362 #define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
   5363   ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
   5364                                                (__v8df)(__m512d)(B), \
   5365                                                (__v8di)(__m512i)(C), \
   5366                                                (int)(imm), (__mmask8)(U), \
   5367                                                _MM_FROUND_CUR_DIRECTION))
   5368 
   5369 #define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
   5370   ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
   5371                                              (__v16sf)(__m512)(B), \
   5372                                              (__v16si)(__m512i)(C), (int)(imm), \
   5373                                              (__mmask16)-1, (int)(R)))
   5374 
   5375 #define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
   5376   ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
   5377                                              (__v16sf)(__m512)(B), \
   5378                                              (__v16si)(__m512i)(C), (int)(imm), \
   5379                                              (__mmask16)(U), (int)(R)))
   5380 
   5381 #define _mm512_fixupimm_ps(A, B, C, imm) \
   5382   ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
   5383                                              (__v16sf)(__m512)(B), \
   5384                                              (__v16si)(__m512i)(C), (int)(imm), \
   5385                                              (__mmask16)-1, \
   5386                                              _MM_FROUND_CUR_DIRECTION))
   5387 
   5388 #define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
   5389   ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
   5390                                              (__v16sf)(__m512)(B), \
   5391                                              (__v16si)(__m512i)(C), (int)(imm), \
   5392                                              (__mmask16)(U), \
   5393                                              _MM_FROUND_CUR_DIRECTION))
   5394 
   5395 #define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
   5396   ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
   5397                                               (__v16sf)(__m512)(B), \
   5398                                               (__v16si)(__m512i)(C), \
   5399                                               (int)(imm), (__mmask16)(U), \
   5400                                               (int)(R)))
   5401 
   5402 #define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
   5403   ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
   5404                                               (__v16sf)(__m512)(B), \
   5405                                               (__v16si)(__m512i)(C), \
   5406                                               (int)(imm), (__mmask16)(U), \
   5407                                               _MM_FROUND_CUR_DIRECTION))
   5408 
   5409 #define _mm_fixupimm_round_sd(A, B, C, imm, R) \
   5410   ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
   5411                                            (__v2df)(__m128d)(B), \
   5412                                            (__v2di)(__m128i)(C), (int)(imm), \
   5413                                            (__mmask8)-1, (int)(R)))
   5414 
   5415 #define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
   5416   ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
   5417                                            (__v2df)(__m128d)(B), \
   5418                                            (__v2di)(__m128i)(C), (int)(imm), \
   5419                                            (__mmask8)(U), (int)(R)))
   5420 
   5421 #define _mm_fixupimm_sd(A, B, C, imm) \
   5422   ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
   5423                                            (__v2df)(__m128d)(B), \
   5424                                            (__v2di)(__m128i)(C), (int)(imm), \
   5425                                            (__mmask8)-1, \
   5426                                            _MM_FROUND_CUR_DIRECTION))
   5427 
   5428 #define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
   5429   ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
   5430                                            (__v2df)(__m128d)(B), \
   5431                                            (__v2di)(__m128i)(C), (int)(imm), \
   5432                                            (__mmask8)(U), \
   5433                                            _MM_FROUND_CUR_DIRECTION))
   5434 
   5435 #define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
   5436   ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
   5437                                             (__v2df)(__m128d)(B), \
   5438                                             (__v2di)(__m128i)(C), (int)(imm), \
   5439                                             (__mmask8)(U), (int)(R)))
   5440 
   5441 #define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
   5442   ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
   5443                                             (__v2df)(__m128d)(B), \
   5444                                             (__v2di)(__m128i)(C), (int)(imm), \
   5445                                             (__mmask8)(U), \
   5446                                             _MM_FROUND_CUR_DIRECTION))
   5447 
   5448 #define _mm_fixupimm_round_ss(A, B, C, imm, R) \
   5449   ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
   5450                                           (__v4sf)(__m128)(B), \
   5451                                           (__v4si)(__m128i)(C), (int)(imm), \
   5452                                           (__mmask8)-1, (int)(R)))
   5453 
   5454 #define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
   5455   ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
   5456                                           (__v4sf)(__m128)(B), \
   5457                                           (__v4si)(__m128i)(C), (int)(imm), \
   5458                                           (__mmask8)(U), (int)(R)))
   5459 
   5460 #define _mm_fixupimm_ss(A, B, C, imm) \
   5461   ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
   5462                                           (__v4sf)(__m128)(B), \
   5463                                           (__v4si)(__m128i)(C), (int)(imm), \
   5464                                           (__mmask8)-1, \
   5465                                           _MM_FROUND_CUR_DIRECTION))
   5466 
   5467 #define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
   5468   ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
   5469                                           (__v4sf)(__m128)(B), \
   5470                                           (__v4si)(__m128i)(C), (int)(imm), \
   5471                                           (__mmask8)(U), \
   5472                                           _MM_FROUND_CUR_DIRECTION))
   5473 
   5474 #define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
   5475   ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
   5476                                            (__v4sf)(__m128)(B), \
   5477                                            (__v4si)(__m128i)(C), (int)(imm), \
   5478                                            (__mmask8)(U), (int)(R)))
   5479 
   5480 #define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
   5481   ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
   5482                                            (__v4sf)(__m128)(B), \
   5483                                            (__v4si)(__m128i)(C), (int)(imm), \
   5484                                            (__mmask8)(U), \
   5485                                            _MM_FROUND_CUR_DIRECTION))
   5486 
   5487 #define _mm_getexp_round_sd(A, B, R) \
   5488   ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
   5489                                                   (__v2df)(__m128d)(B), \
   5490                                                   (__v2df)_mm_setzero_pd(), \
   5491                                                   (__mmask8)-1, (int)(R)))
   5492 
   5493 
   5494 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   5495 _mm_getexp_sd (__m128d __A, __m128d __B)
   5496 {
   5497   return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
   5498                  (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
   5499 }
   5500 
   5501 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   5502 _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   5503 {
   5504  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
   5505           (__v2df) __B,
   5506           (__v2df) __W,
   5507           (__mmask8) __U,
   5508           _MM_FROUND_CUR_DIRECTION);
   5509 }
   5510 
   5511 #define _mm_mask_getexp_round_sd(W, U, A, B, R) \
   5512   ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
   5513                                                   (__v2df)(__m128d)(B), \
   5514                                                   (__v2df)(__m128d)(W), \
   5515                                                   (__mmask8)(U), (int)(R)))
   5516 
   5517 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   5518 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
   5519 {
   5520  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
   5521           (__v2df) __B,
   5522           (__v2df) _mm_setzero_pd (),
   5523           (__mmask8) __U,
   5524           _MM_FROUND_CUR_DIRECTION);
   5525 }
   5526 
   5527 #define _mm_maskz_getexp_round_sd(U, A, B, R) \
   5528   ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
   5529                                                   (__v2df)(__m128d)(B), \
   5530                                                   (__v2df)_mm_setzero_pd(), \
   5531                                                   (__mmask8)(U), (int)(R)))
   5532 
   5533 #define _mm_getexp_round_ss(A, B, R) \
   5534   ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
   5535                                                  (__v4sf)(__m128)(B), \
   5536                                                  (__v4sf)_mm_setzero_ps(), \
   5537                                                  (__mmask8)-1, (int)(R)))
   5538 
   5539 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   5540 _mm_getexp_ss (__m128 __A, __m128 __B)
   5541 {
   5542   return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
   5543                 (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
   5544 }
   5545 
   5546 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   5547 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   5548 {
   5549  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
   5550           (__v4sf) __B,
   5551           (__v4sf) __W,
   5552           (__mmask8) __U,
   5553           _MM_FROUND_CUR_DIRECTION);
   5554 }
   5555 
   5556 #define _mm_mask_getexp_round_ss(W, U, A, B, R) \
   5557   ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
   5558                                                  (__v4sf)(__m128)(B), \
   5559                                                  (__v4sf)(__m128)(W), \
   5560                                                  (__mmask8)(U), (int)(R)))
   5561 
   5562 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   5563 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
   5564 {
   5565  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
   5566           (__v4sf) __B,
   5567           (__v4sf) _mm_setzero_ps (),
   5568           (__mmask8) __U,
   5569           _MM_FROUND_CUR_DIRECTION);
   5570 }
   5571 
   5572 #define _mm_maskz_getexp_round_ss(U, A, B, R) \
   5573   ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
   5574                                                  (__v4sf)(__m128)(B), \
   5575                                                  (__v4sf)_mm_setzero_ps(), \
   5576                                                  (__mmask8)(U), (int)(R)))
   5577 
   5578 #define _mm_getmant_round_sd(A, B, C, D, R) \
   5579   ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
   5580                                                 (__v2df)(__m128d)(B), \
   5581                                                 (int)(((D)<<2) | (C)), \
   5582                                                 (__v2df)_mm_setzero_pd(), \
   5583                                                 (__mmask8)-1, (int)(R)))
   5584 
   5585 #define _mm_getmant_sd(A, B, C, D)  \
   5586   ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
   5587                                                 (__v2df)(__m128d)(B), \
   5588                                                 (int)(((D)<<2) | (C)), \
   5589                                                 (__v2df)_mm_setzero_pd(), \
   5590                                                 (__mmask8)-1, \
   5591                                                 _MM_FROUND_CUR_DIRECTION))
   5592 
   5593 #define _mm_mask_getmant_sd(W, U, A, B, C, D) \
   5594   ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
   5595                                                 (__v2df)(__m128d)(B), \
   5596                                                 (int)(((D)<<2) | (C)), \
   5597                                                 (__v2df)(__m128d)(W), \
   5598                                                 (__mmask8)(U), \
   5599                                                 _MM_FROUND_CUR_DIRECTION))
   5600 
   5601 #define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
   5602   ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
   5603                                                 (__v2df)(__m128d)(B), \
   5604                                                 (int)(((D)<<2) | (C)), \
   5605                                                 (__v2df)(__m128d)(W), \
   5606                                                 (__mmask8)(U), (int)(R)))
   5607 
   5608 #define _mm_maskz_getmant_sd(U, A, B, C, D) \
   5609   ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
   5610                                                 (__v2df)(__m128d)(B), \
   5611                                                 (int)(((D)<<2) | (C)), \
   5612                                                 (__v2df)_mm_setzero_pd(), \
   5613                                                 (__mmask8)(U), \
   5614                                                 _MM_FROUND_CUR_DIRECTION))
   5615 
   5616 #define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
   5617   ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
   5618                                                 (__v2df)(__m128d)(B), \
   5619                                                 (int)(((D)<<2) | (C)), \
   5620                                                 (__v2df)_mm_setzero_pd(), \
   5621                                                 (__mmask8)(U), (int)(R)))
   5622 
   5623 #define _mm_getmant_round_ss(A, B, C, D, R) \
   5624   ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
   5625                                                (__v4sf)(__m128)(B), \
   5626                                                (int)(((D)<<2) | (C)), \
   5627                                                (__v4sf)_mm_setzero_ps(), \
   5628                                                (__mmask8)-1, (int)(R)))
   5629 
   5630 #define _mm_getmant_ss(A, B, C, D) \
   5631   ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
   5632                                                (__v4sf)(__m128)(B), \
   5633                                                (int)(((D)<<2) | (C)), \
   5634                                                (__v4sf)_mm_setzero_ps(), \
   5635                                                (__mmask8)-1, \
   5636                                                _MM_FROUND_CUR_DIRECTION))
   5637 
   5638 #define _mm_mask_getmant_ss(W, U, A, B, C, D) \
   5639   ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
   5640                                                (__v4sf)(__m128)(B), \
   5641                                                (int)(((D)<<2) | (C)), \
   5642                                                (__v4sf)(__m128)(W), \
   5643                                                (__mmask8)(U), \
   5644                                                _MM_FROUND_CUR_DIRECTION))
   5645 
   5646 #define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
   5647   ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
   5648                                                (__v4sf)(__m128)(B), \
   5649                                                (int)(((D)<<2) | (C)), \
   5650                                                (__v4sf)(__m128)(W), \
   5651                                                (__mmask8)(U), (int)(R)))
   5652 
   5653 #define _mm_maskz_getmant_ss(U, A, B, C, D) \
   5654   ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
   5655                                                (__v4sf)(__m128)(B), \
   5656                                                (int)(((D)<<2) | (C)), \
   5657                                                (__v4sf)_mm_setzero_ps(), \
   5658                                                (__mmask8)(U), \
   5659                                                _MM_FROUND_CUR_DIRECTION))
   5660 
   5661 #define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
   5662   ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
   5663                                                (__v4sf)(__m128)(B), \
   5664                                                (int)(((D)<<2) | (C)), \
   5665                                                (__v4sf)_mm_setzero_ps(), \
   5666                                                (__mmask8)(U), (int)(R)))
   5667 
   5668 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   5669 _mm512_kmov (__mmask16 __A)
   5670 {
   5671   return  __A;
   5672 }
   5673 
   5674 #define _mm_comi_round_sd(A, B, P, R) \
   5675   ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
   5676                                (int)(P), (int)(R)))
   5677 
   5678 #define _mm_comi_round_ss(A, B, P, R) \
   5679   ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
   5680                                (int)(P), (int)(R)))
   5681 
   5682 #ifdef __x86_64__
   5683 #define _mm_cvt_roundsd_si64(A, R) \
   5684   ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
   5685 #endif
   5686 
   5687 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5688 _mm512_sll_epi32(__m512i __A, __m128i __B)
   5689 {
   5690   return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
   5691 }
   5692 
   5693 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5694 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
   5695 {
   5696   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5697                                           (__v16si)_mm512_sll_epi32(__A, __B),
   5698                                           (__v16si)__W);
   5699 }
   5700 
   5701 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5702 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
   5703 {
   5704   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5705                                           (__v16si)_mm512_sll_epi32(__A, __B),
   5706                                           (__v16si)_mm512_setzero_si512());
   5707 }
   5708 
   5709 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5710 _mm512_sll_epi64(__m512i __A, __m128i __B)
   5711 {
   5712   return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
   5713 }
   5714 
   5715 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5716 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
   5717 {
   5718   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5719                                              (__v8di)_mm512_sll_epi64(__A, __B),
   5720                                              (__v8di)__W);
   5721 }
   5722 
   5723 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5724 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
   5725 {
   5726   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5727                                            (__v8di)_mm512_sll_epi64(__A, __B),
   5728                                            (__v8di)_mm512_setzero_si512());
   5729 }
   5730 
   5731 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5732 _mm512_sllv_epi32(__m512i __X, __m512i __Y)
   5733 {
   5734   return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
   5735 }
   5736 
   5737 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5738 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
   5739 {
   5740   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5741                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
   5742                                            (__v16si)__W);
   5743 }
   5744 
   5745 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5746 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
   5747 {
   5748   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5749                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
   5750                                            (__v16si)_mm512_setzero_si512());
   5751 }
   5752 
   5753 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5754 _mm512_sllv_epi64(__m512i __X, __m512i __Y)
   5755 {
   5756   return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
   5757 }
   5758 
   5759 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5760 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
   5761 {
   5762   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5763                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
   5764                                             (__v8di)__W);
   5765 }
   5766 
   5767 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5768 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
   5769 {
   5770   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5771                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
   5772                                             (__v8di)_mm512_setzero_si512());
   5773 }
   5774 
   5775 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5776 _mm512_sra_epi32(__m512i __A, __m128i __B)
   5777 {
   5778   return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
   5779 }
   5780 
   5781 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5782 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
   5783 {
   5784   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5785                                           (__v16si)_mm512_sra_epi32(__A, __B),
   5786                                           (__v16si)__W);
   5787 }
   5788 
   5789 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5790 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
   5791 {
   5792   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5793                                           (__v16si)_mm512_sra_epi32(__A, __B),
   5794                                           (__v16si)_mm512_setzero_si512());
   5795 }
   5796 
   5797 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5798 _mm512_sra_epi64(__m512i __A, __m128i __B)
   5799 {
   5800   return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
   5801 }
   5802 
   5803 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5804 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
   5805 {
   5806   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5807                                            (__v8di)_mm512_sra_epi64(__A, __B),
   5808                                            (__v8di)__W);
   5809 }
   5810 
   5811 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5812 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
   5813 {
   5814   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5815                                            (__v8di)_mm512_sra_epi64(__A, __B),
   5816                                            (__v8di)_mm512_setzero_si512());
   5817 }
   5818 
   5819 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5820 _mm512_srav_epi32(__m512i __X, __m512i __Y)
   5821 {
   5822   return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
   5823 }
   5824 
   5825 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5826 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
   5827 {
   5828   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5829                                            (__v16si)_mm512_srav_epi32(__X, __Y),
   5830                                            (__v16si)__W);
   5831 }
   5832 
   5833 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5834 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
   5835 {
   5836   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5837                                            (__v16si)_mm512_srav_epi32(__X, __Y),
   5838                                            (__v16si)_mm512_setzero_si512());
   5839 }
   5840 
   5841 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5842 _mm512_srav_epi64(__m512i __X, __m512i __Y)
   5843 {
   5844   return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
   5845 }
   5846 
   5847 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5848 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
   5849 {
   5850   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5851                                             (__v8di)_mm512_srav_epi64(__X, __Y),
   5852                                             (__v8di)__W);
   5853 }
   5854 
   5855 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5856 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
   5857 {
   5858   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5859                                             (__v8di)_mm512_srav_epi64(__X, __Y),
   5860                                             (__v8di)_mm512_setzero_si512());
   5861 }
   5862 
   5863 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5864 _mm512_srl_epi32(__m512i __A, __m128i __B)
   5865 {
   5866   return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
   5867 }
   5868 
   5869 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5870 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
   5871 {
   5872   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5873                                           (__v16si)_mm512_srl_epi32(__A, __B),
   5874                                           (__v16si)__W);
   5875 }
   5876 
   5877 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5878 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
   5879 {
   5880   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5881                                           (__v16si)_mm512_srl_epi32(__A, __B),
   5882                                           (__v16si)_mm512_setzero_si512());
   5883 }
   5884 
   5885 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5886 _mm512_srl_epi64(__m512i __A, __m128i __B)
   5887 {
   5888   return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
   5889 }
   5890 
   5891 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5892 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
   5893 {
   5894   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5895                                            (__v8di)_mm512_srl_epi64(__A, __B),
   5896                                            (__v8di)__W);
   5897 }
   5898 
   5899 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5900 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
   5901 {
   5902   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5903                                            (__v8di)_mm512_srl_epi64(__A, __B),
   5904                                            (__v8di)_mm512_setzero_si512());
   5905 }
   5906 
   5907 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5908 _mm512_srlv_epi32(__m512i __X, __m512i __Y)
   5909 {
   5910   return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
   5911 }
   5912 
   5913 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5914 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
   5915 {
   5916   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5917                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
   5918                                            (__v16si)__W);
   5919 }
   5920 
   5921 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5922 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
   5923 {
   5924   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   5925                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
   5926                                            (__v16si)_mm512_setzero_si512());
   5927 }
   5928 
   5929 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5930 _mm512_srlv_epi64 (__m512i __X, __m512i __Y)
   5931 {
   5932   return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
   5933 }
   5934 
   5935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5936 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
   5937 {
   5938   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5939                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
   5940                                             (__v8di)__W);
   5941 }
   5942 
   5943 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   5944 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
   5945 {
   5946   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   5947                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
   5948                                             (__v8di)_mm512_setzero_si512());
   5949 }
   5950 
   5951 /// \enum _MM_TERNLOG_ENUM
   5952 ///    A helper to represent the ternary logic operations among vector \a A,
   5953 ///    \a B and \a C. The representation is passed to \a imm.
   5954 typedef enum {
   5955   _MM_TERNLOG_A = 0xF0,
   5956   _MM_TERNLOG_B = 0xCC,
   5957   _MM_TERNLOG_C = 0xAA
   5958 } _MM_TERNLOG_ENUM;
   5959 
   5960 #define _mm512_ternarylogic_epi32(A, B, C, imm)                                \
   5961   ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
   5962       (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
   5963       (unsigned char)(imm), (__mmask16)-1))
   5964 
   5965 #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm)                        \
   5966   ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
   5967       (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
   5968       (unsigned char)(imm), (__mmask16)(U)))
   5969 
   5970 #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm)                       \
   5971   ((__m512i)__builtin_ia32_pternlogd512_maskz(                                 \
   5972       (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
   5973       (unsigned char)(imm), (__mmask16)(U)))
   5974 
   5975 #define _mm512_ternarylogic_epi64(A, B, C, imm)                                \
   5976   ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
   5977       (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
   5978       (unsigned char)(imm), (__mmask8)-1))
   5979 
   5980 #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm)                        \
   5981   ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
   5982       (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
   5983       (unsigned char)(imm), (__mmask8)(U)))
   5984 
   5985 #define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm)                       \
   5986   ((__m512i)__builtin_ia32_pternlogq512_maskz(                                 \
   5987       (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
   5988       (unsigned char)(imm), (__mmask8)(U)))
   5989 
   5990 #ifdef __x86_64__
   5991 #define _mm_cvt_roundsd_i64(A, R) \
   5992   ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
   5993 #endif
   5994 
   5995 #define _mm_cvt_roundsd_si32(A, R) \
   5996   ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
   5997 
   5998 #define _mm_cvt_roundsd_i32(A, R) \
   5999   ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
   6000 
   6001 #define _mm_cvt_roundsd_u32(A, R) \
   6002   ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))
   6003 
   6004 static __inline__ unsigned __DEFAULT_FN_ATTRS128
   6005 _mm_cvtsd_u32 (__m128d __A)
   6006 {
   6007   return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
   6008              _MM_FROUND_CUR_DIRECTION);
   6009 }
   6010 
   6011 #ifdef __x86_64__
   6012 #define _mm_cvt_roundsd_u64(A, R) \
   6013   ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
   6014                                                    (int)(R)))
   6015 
   6016 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
   6017 _mm_cvtsd_u64 (__m128d __A)
   6018 {
   6019   return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
   6020                  __A,
   6021                  _MM_FROUND_CUR_DIRECTION);
   6022 }
   6023 #endif
   6024 
   6025 #define _mm_cvt_roundss_si32(A, R) \
   6026   ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
   6027 
   6028 #define _mm_cvt_roundss_i32(A, R) \
   6029   ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
   6030 
   6031 #ifdef __x86_64__
   6032 #define _mm_cvt_roundss_si64(A, R) \
   6033   ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
   6034 
   6035 #define _mm_cvt_roundss_i64(A, R) \
   6036   ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
   6037 #endif
   6038 
   6039 #define _mm_cvt_roundss_u32(A, R) \
   6040   ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))
   6041 
   6042 static __inline__ unsigned __DEFAULT_FN_ATTRS128
   6043 _mm_cvtss_u32 (__m128 __A)
   6044 {
   6045   return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
   6046              _MM_FROUND_CUR_DIRECTION);
   6047 }
   6048 
   6049 #ifdef __x86_64__
   6050 #define _mm_cvt_roundss_u64(A, R) \
   6051   ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
   6052                                                    (int)(R)))
   6053 
   6054 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
   6055 _mm_cvtss_u64 (__m128 __A)
   6056 {
   6057   return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
   6058                  __A,
   6059                  _MM_FROUND_CUR_DIRECTION);
   6060 }
   6061 #endif
   6062 
   6063 #define _mm_cvtt_roundsd_i32(A, R) \
   6064   ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
   6065 
   6066 #define _mm_cvtt_roundsd_si32(A, R) \
   6067   ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
   6068 
   6069 static __inline__ int __DEFAULT_FN_ATTRS128
   6070 _mm_cvttsd_i32 (__m128d __A)
   6071 {
   6072   return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
   6073               _MM_FROUND_CUR_DIRECTION);
   6074 }
   6075 
   6076 #ifdef __x86_64__
   6077 #define _mm_cvtt_roundsd_si64(A, R) \
   6078   ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
   6079 
   6080 #define _mm_cvtt_roundsd_i64(A, R) \
   6081   ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
   6082 
   6083 static __inline__ long long __DEFAULT_FN_ATTRS128
   6084 _mm_cvttsd_i64 (__m128d __A)
   6085 {
   6086   return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
   6087               _MM_FROUND_CUR_DIRECTION);
   6088 }
   6089 #endif
   6090 
   6091 #define _mm_cvtt_roundsd_u32(A, R) \
   6092   ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))
   6093 
   6094 static __inline__ unsigned __DEFAULT_FN_ATTRS128
   6095 _mm_cvttsd_u32 (__m128d __A)
   6096 {
   6097   return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
   6098               _MM_FROUND_CUR_DIRECTION);
   6099 }
   6100 
   6101 #ifdef __x86_64__
   6102 #define _mm_cvtt_roundsd_u64(A, R) \
   6103   ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
   6104                                                     (int)(R)))
   6105 
   6106 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
   6107 _mm_cvttsd_u64 (__m128d __A)
   6108 {
   6109   return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
   6110                   __A,
   6111                   _MM_FROUND_CUR_DIRECTION);
   6112 }
   6113 #endif
   6114 
   6115 #define _mm_cvtt_roundss_i32(A, R) \
   6116   ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
   6117 
   6118 #define _mm_cvtt_roundss_si32(A, R) \
   6119   ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
   6120 
   6121 static __inline__ int __DEFAULT_FN_ATTRS128
   6122 _mm_cvttss_i32 (__m128 __A)
   6123 {
   6124   return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
   6125               _MM_FROUND_CUR_DIRECTION);
   6126 }
   6127 
   6128 #ifdef __x86_64__
   6129 #define _mm_cvtt_roundss_i64(A, R) \
   6130   ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
   6131 
   6132 #define _mm_cvtt_roundss_si64(A, R) \
   6133   ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
   6134 
   6135 static __inline__ long long __DEFAULT_FN_ATTRS128
   6136 _mm_cvttss_i64 (__m128 __A)
   6137 {
   6138   return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
   6139               _MM_FROUND_CUR_DIRECTION);
   6140 }
   6141 #endif
   6142 
   6143 #define _mm_cvtt_roundss_u32(A, R) \
   6144   ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))
   6145 
   6146 static __inline__ unsigned __DEFAULT_FN_ATTRS128
   6147 _mm_cvttss_u32 (__m128 __A)
   6148 {
   6149   return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
   6150               _MM_FROUND_CUR_DIRECTION);
   6151 }
   6152 
   6153 #ifdef __x86_64__
   6154 #define _mm_cvtt_roundss_u64(A, R) \
   6155   ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
   6156                                                     (int)(R)))
   6157 
   6158 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
   6159 _mm_cvttss_u64 (__m128 __A)
   6160 {
   6161   return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
   6162                   __A,
   6163                   _MM_FROUND_CUR_DIRECTION);
   6164 }
   6165 #endif
   6166 
   6167 #define _mm512_permute_pd(X, C) \
   6168   ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))
   6169 
   6170 #define _mm512_mask_permute_pd(W, U, X, C) \
   6171   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   6172                                         (__v8df)_mm512_permute_pd((X), (C)), \
   6173                                         (__v8df)(__m512d)(W)))
   6174 
   6175 #define _mm512_maskz_permute_pd(U, X, C) \
   6176   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   6177                                         (__v8df)_mm512_permute_pd((X), (C)), \
   6178                                         (__v8df)_mm512_setzero_pd()))
   6179 
   6180 #define _mm512_permute_ps(X, C) \
   6181   ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))
   6182 
   6183 #define _mm512_mask_permute_ps(W, U, X, C) \
   6184   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   6185                                        (__v16sf)_mm512_permute_ps((X), (C)), \
   6186                                        (__v16sf)(__m512)(W)))
   6187 
   6188 #define _mm512_maskz_permute_ps(U, X, C) \
   6189   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   6190                                        (__v16sf)_mm512_permute_ps((X), (C)), \
   6191                                        (__v16sf)_mm512_setzero_ps()))
   6192 
   6193 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6194 _mm512_permutevar_pd(__m512d __A, __m512i __C)
   6195 {
   6196   return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
   6197 }
   6198 
   6199 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6200 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
   6201 {
   6202   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   6203                                          (__v8df)_mm512_permutevar_pd(__A, __C),
   6204                                          (__v8df)__W);
   6205 }
   6206 
   6207 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6208 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
   6209 {
   6210   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   6211                                          (__v8df)_mm512_permutevar_pd(__A, __C),
   6212                                          (__v8df)_mm512_setzero_pd());
   6213 }
   6214 
   6215 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6216 _mm512_permutevar_ps(__m512 __A, __m512i __C)
   6217 {
   6218   return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
   6219 }
   6220 
   6221 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6222 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
   6223 {
   6224   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   6225                                         (__v16sf)_mm512_permutevar_ps(__A, __C),
   6226                                         (__v16sf)__W);
   6227 }
   6228 
   6229 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6230 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
   6231 {
   6232   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   6233                                         (__v16sf)_mm512_permutevar_ps(__A, __C),
   6234                                         (__v16sf)_mm512_setzero_ps());
   6235 }
   6236 
   6237 static __inline __m512d __DEFAULT_FN_ATTRS512
   6238 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
   6239 {
   6240   return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
   6241                                                  (__v8df)__B);
   6242 }
   6243 
   6244 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6245 _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
   6246 {
   6247   return (__m512d)__builtin_ia32_selectpd_512(__U,
   6248                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
   6249                                   (__v8df)__A);
   6250 }
   6251 
   6252 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6253 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
   6254                              __m512d __B)
   6255 {
   6256   return (__m512d)__builtin_ia32_selectpd_512(__U,
   6257                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
   6258                                   (__v8df)(__m512d)__I);
   6259 }
   6260 
   6261 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6262 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
   6263                              __m512d __B)
   6264 {
   6265   return (__m512d)__builtin_ia32_selectpd_512(__U,
   6266                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
   6267                                   (__v8df)_mm512_setzero_pd());
   6268 }
   6269 
   6270 static __inline __m512 __DEFAULT_FN_ATTRS512
   6271 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
   6272 {
   6273   return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
   6274                                                 (__v16sf) __B);
   6275 }
   6276 
   6277 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6278 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
   6279 {
   6280   return (__m512)__builtin_ia32_selectps_512(__U,
   6281                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
   6282                                  (__v16sf)__A);
   6283 }
   6284 
   6285 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6286 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
   6287 {
   6288   return (__m512)__builtin_ia32_selectps_512(__U,
   6289                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
   6290                                  (__v16sf)(__m512)__I);
   6291 }
   6292 
   6293 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6294 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
   6295 {
   6296   return (__m512)__builtin_ia32_selectps_512(__U,
   6297                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
   6298                                  (__v16sf)_mm512_setzero_ps());
   6299 }
   6300 
   6301 
   6302 #define _mm512_cvtt_roundpd_epu32(A, R) \
   6303   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
   6304                                               (__v8si)_mm256_undefined_si256(), \
   6305                                               (__mmask8)-1, (int)(R)))
   6306 
   6307 #define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
   6308   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
   6309                                               (__v8si)(__m256i)(W), \
   6310                                               (__mmask8)(U), (int)(R)))
   6311 
   6312 #define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
   6313   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
   6314                                               (__v8si)_mm256_setzero_si256(), \
   6315                                               (__mmask8)(U), (int)(R)))
   6316 
   6317 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   6318 _mm512_cvttpd_epu32 (__m512d __A)
   6319 {
   6320   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
   6321                   (__v8si)
   6322                   _mm256_undefined_si256 (),
   6323                   (__mmask8) -1,
   6324                   _MM_FROUND_CUR_DIRECTION);
   6325 }
   6326 
   6327 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   6328 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
   6329 {
   6330   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
   6331                   (__v8si) __W,
   6332                   (__mmask8) __U,
   6333                   _MM_FROUND_CUR_DIRECTION);
   6334 }
   6335 
   6336 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   6337 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
   6338 {
   6339   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
   6340                   (__v8si)
   6341                   _mm256_setzero_si256 (),
   6342                   (__mmask8) __U,
   6343                   _MM_FROUND_CUR_DIRECTION);
   6344 }
   6345 
   6346 #define _mm_roundscale_round_sd(A, B, imm, R) \
   6347   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
   6348                                                  (__v2df)(__m128d)(B), \
   6349                                                  (__v2df)_mm_setzero_pd(), \
   6350                                                  (__mmask8)-1, (int)(imm), \
   6351                                                  (int)(R)))
   6352 
   6353 #define _mm_roundscale_sd(A, B, imm) \
   6354   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
   6355                                                  (__v2df)(__m128d)(B), \
   6356                                                  (__v2df)_mm_setzero_pd(), \
   6357                                                  (__mmask8)-1, (int)(imm), \
   6358                                                  _MM_FROUND_CUR_DIRECTION))
   6359 
   6360 #define _mm_mask_roundscale_sd(W, U, A, B, imm) \
   6361   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
   6362                                                  (__v2df)(__m128d)(B), \
   6363                                                  (__v2df)(__m128d)(W), \
   6364                                                  (__mmask8)(U), (int)(imm), \
   6365                                                  _MM_FROUND_CUR_DIRECTION))
   6366 
   6367 #define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
   6368   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
   6369                                                  (__v2df)(__m128d)(B), \
   6370                                                  (__v2df)(__m128d)(W), \
   6371                                                  (__mmask8)(U), (int)(I), \
   6372                                                  (int)(R)))
   6373 
   6374 #define _mm_maskz_roundscale_sd(U, A, B, I) \
   6375   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
   6376                                                  (__v2df)(__m128d)(B), \
   6377                                                  (__v2df)_mm_setzero_pd(), \
   6378                                                  (__mmask8)(U), (int)(I), \
   6379                                                  _MM_FROUND_CUR_DIRECTION))
   6380 
   6381 #define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
   6382   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
   6383                                                  (__v2df)(__m128d)(B), \
   6384                                                  (__v2df)_mm_setzero_pd(), \
   6385                                                  (__mmask8)(U), (int)(I), \
   6386                                                  (int)(R)))
   6387 
   6388 #define _mm_roundscale_round_ss(A, B, imm, R) \
   6389   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
   6390                                                 (__v4sf)(__m128)(B), \
   6391                                                 (__v4sf)_mm_setzero_ps(), \
   6392                                                 (__mmask8)-1, (int)(imm), \
   6393                                                 (int)(R)))
   6394 
   6395 #define _mm_roundscale_ss(A, B, imm) \
   6396   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
   6397                                                 (__v4sf)(__m128)(B), \
   6398                                                 (__v4sf)_mm_setzero_ps(), \
   6399                                                 (__mmask8)-1, (int)(imm), \
   6400                                                 _MM_FROUND_CUR_DIRECTION))
   6401 
   6402 #define _mm_mask_roundscale_ss(W, U, A, B, I) \
   6403   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
   6404                                                 (__v4sf)(__m128)(B), \
   6405                                                 (__v4sf)(__m128)(W), \
   6406                                                 (__mmask8)(U), (int)(I), \
   6407                                                 _MM_FROUND_CUR_DIRECTION))
   6408 
   6409 #define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
   6410   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
   6411                                                 (__v4sf)(__m128)(B), \
   6412                                                 (__v4sf)(__m128)(W), \
   6413                                                 (__mmask8)(U), (int)(I), \
   6414                                                 (int)(R)))
   6415 
   6416 #define _mm_maskz_roundscale_ss(U, A, B, I) \
   6417   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
   6418                                                 (__v4sf)(__m128)(B), \
   6419                                                 (__v4sf)_mm_setzero_ps(), \
   6420                                                 (__mmask8)(U), (int)(I), \
   6421                                                 _MM_FROUND_CUR_DIRECTION))
   6422 
   6423 #define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
   6424   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
   6425                                                 (__v4sf)(__m128)(B), \
   6426                                                 (__v4sf)_mm_setzero_ps(), \
   6427                                                 (__mmask8)(U), (int)(I), \
   6428                                                 (int)(R)))
   6429 
   6430 #define _mm512_scalef_round_pd(A, B, R) \
   6431   ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
   6432                                             (__v8df)(__m512d)(B), \
   6433                                             (__v8df)_mm512_undefined_pd(), \
   6434                                             (__mmask8)-1, (int)(R)))
   6435 
   6436 #define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
   6437   ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
   6438                                             (__v8df)(__m512d)(B), \
   6439                                             (__v8df)(__m512d)(W), \
   6440                                             (__mmask8)(U), (int)(R)))
   6441 
   6442 #define _mm512_maskz_scalef_round_pd(U, A, B, R) \
   6443   ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
   6444                                             (__v8df)(__m512d)(B), \
   6445                                             (__v8df)_mm512_setzero_pd(), \
   6446                                             (__mmask8)(U), (int)(R)))
   6447 
   6448 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6449 _mm512_scalef_pd (__m512d __A, __m512d __B)
   6450 {
   6451   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
   6452                 (__v8df) __B,
   6453                 (__v8df)
   6454                 _mm512_undefined_pd (),
   6455                 (__mmask8) -1,
   6456                 _MM_FROUND_CUR_DIRECTION);
   6457 }
   6458 
   6459 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6460 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
   6461 {
   6462   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
   6463                 (__v8df) __B,
   6464                 (__v8df) __W,
   6465                 (__mmask8) __U,
   6466                 _MM_FROUND_CUR_DIRECTION);
   6467 }
   6468 
   6469 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6470 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
   6471 {
   6472   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
   6473                 (__v8df) __B,
   6474                 (__v8df)
   6475                 _mm512_setzero_pd (),
   6476                 (__mmask8) __U,
   6477                 _MM_FROUND_CUR_DIRECTION);
   6478 }
   6479 
   6480 #define _mm512_scalef_round_ps(A, B, R) \
   6481   ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
   6482                                            (__v16sf)(__m512)(B), \
   6483                                            (__v16sf)_mm512_undefined_ps(), \
   6484                                            (__mmask16)-1, (int)(R)))
   6485 
   6486 #define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
   6487   ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
   6488                                            (__v16sf)(__m512)(B), \
   6489                                            (__v16sf)(__m512)(W), \
   6490                                            (__mmask16)(U), (int)(R)))
   6491 
   6492 #define _mm512_maskz_scalef_round_ps(U, A, B, R) \
   6493   ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
   6494                                            (__v16sf)(__m512)(B), \
   6495                                            (__v16sf)_mm512_setzero_ps(), \
   6496                                            (__mmask16)(U), (int)(R)))
   6497 
   6498 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6499 _mm512_scalef_ps (__m512 __A, __m512 __B)
   6500 {
   6501   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
   6502                (__v16sf) __B,
   6503                (__v16sf)
   6504                _mm512_undefined_ps (),
   6505                (__mmask16) -1,
   6506                _MM_FROUND_CUR_DIRECTION);
   6507 }
   6508 
   6509 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6510 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
   6511 {
   6512   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
   6513                (__v16sf) __B,
   6514                (__v16sf) __W,
   6515                (__mmask16) __U,
   6516                _MM_FROUND_CUR_DIRECTION);
   6517 }
   6518 
   6519 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6520 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
   6521 {
   6522   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
   6523                (__v16sf) __B,
   6524                (__v16sf)
   6525                _mm512_setzero_ps (),
   6526                (__mmask16) __U,
   6527                _MM_FROUND_CUR_DIRECTION);
   6528 }
   6529 
   6530 #define _mm_scalef_round_sd(A, B, R) \
   6531   ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
   6532                                                (__v2df)(__m128d)(B), \
   6533                                                (__v2df)_mm_setzero_pd(), \
   6534                                                (__mmask8)-1, (int)(R)))
   6535 
   6536 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   6537 _mm_scalef_sd (__m128d __A, __m128d __B)
   6538 {
   6539   return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
   6540               (__v2df)( __B), (__v2df) _mm_setzero_pd(),
   6541               (__mmask8) -1,
   6542               _MM_FROUND_CUR_DIRECTION);
   6543 }
   6544 
   6545 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   6546 _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   6547 {
   6548  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
   6549                  (__v2df) __B,
   6550                 (__v2df) __W,
   6551                 (__mmask8) __U,
   6552                 _MM_FROUND_CUR_DIRECTION);
   6553 }
   6554 
   6555 #define _mm_mask_scalef_round_sd(W, U, A, B, R) \
   6556   ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
   6557                                                (__v2df)(__m128d)(B), \
   6558                                                (__v2df)(__m128d)(W), \
   6559                                                (__mmask8)(U), (int)(R)))
   6560 
   6561 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   6562 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
   6563 {
   6564  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
   6565                  (__v2df) __B,
   6566                 (__v2df) _mm_setzero_pd (),
   6567                 (__mmask8) __U,
   6568                 _MM_FROUND_CUR_DIRECTION);
   6569 }
   6570 
   6571 #define _mm_maskz_scalef_round_sd(U, A, B, R) \
   6572   ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
   6573                                                (__v2df)(__m128d)(B), \
   6574                                                (__v2df)_mm_setzero_pd(), \
   6575                                                (__mmask8)(U), (int)(R)))
   6576 
   6577 #define _mm_scalef_round_ss(A, B, R) \
   6578   ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
   6579                                               (__v4sf)(__m128)(B), \
   6580                                               (__v4sf)_mm_setzero_ps(), \
   6581                                               (__mmask8)-1, (int)(R)))
   6582 
   6583 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   6584 _mm_scalef_ss (__m128 __A, __m128 __B)
   6585 {
   6586   return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
   6587              (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
   6588              (__mmask8) -1,
   6589              _MM_FROUND_CUR_DIRECTION);
   6590 }
   6591 
   6592 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   6593 _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   6594 {
   6595  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
   6596                 (__v4sf) __B,
   6597                 (__v4sf) __W,
   6598                 (__mmask8) __U,
   6599                 _MM_FROUND_CUR_DIRECTION);
   6600 }
   6601 
   6602 #define _mm_mask_scalef_round_ss(W, U, A, B, R) \
   6603   ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
   6604                                               (__v4sf)(__m128)(B), \
   6605                                               (__v4sf)(__m128)(W), \
   6606                                               (__mmask8)(U), (int)(R)))
   6607 
   6608 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   6609 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
   6610 {
   6611  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
   6612                  (__v4sf) __B,
   6613                 (__v4sf) _mm_setzero_ps (),
   6614                 (__mmask8) __U,
   6615                 _MM_FROUND_CUR_DIRECTION);
   6616 }
   6617 
   6618 #define _mm_maskz_scalef_round_ss(U, A, B, R) \
   6619   ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
   6620                                               (__v4sf)(__m128)(B), \
   6621                                               (__v4sf)_mm_setzero_ps(), \
   6622                                               (__mmask8)(U), \
   6623                                               (int)(R)))
   6624 
   6625 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6626 _mm512_srai_epi32(__m512i __A, unsigned int __B)
   6627 {
   6628   return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
   6629 }
   6630 
   6631 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6632 _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
   6633                        unsigned int __B)
   6634 {
   6635   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   6636                                          (__v16si)_mm512_srai_epi32(__A, __B),
   6637                                          (__v16si)__W);
   6638 }
   6639 
   6640 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6641 _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
   6642                         unsigned int __B) {
   6643   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
   6644                                          (__v16si)_mm512_srai_epi32(__A, __B),
   6645                                          (__v16si)_mm512_setzero_si512());
   6646 }
   6647 
   6648 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6649 _mm512_srai_epi64(__m512i __A, unsigned int __B)
   6650 {
   6651   return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
   6652 }
   6653 
   6654 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6655 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
   6656 {
   6657   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   6658                                           (__v8di)_mm512_srai_epi64(__A, __B),
   6659                                           (__v8di)__W);
   6660 }
   6661 
   6662 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6663 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
   6664 {
   6665   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
   6666                                           (__v8di)_mm512_srai_epi64(__A, __B),
   6667                                           (__v8di)_mm512_setzero_si512());
   6668 }
   6669 
   6670 #define _mm512_shuffle_f32x4(A, B, imm) \
   6671   ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
   6672                                      (__v16sf)(__m512)(B), (int)(imm)))
   6673 
   6674 #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
   6675   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   6676                                        (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
   6677                                        (__v16sf)(__m512)(W)))
   6678 
   6679 #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
   6680   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   6681                                        (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
   6682                                        (__v16sf)_mm512_setzero_ps()))
   6683 
   6684 #define _mm512_shuffle_f64x2(A, B, imm) \
   6685   ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
   6686                                       (__v8df)(__m512d)(B), (int)(imm)))
   6687 
   6688 #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
   6689   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   6690                                         (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
   6691                                         (__v8df)(__m512d)(W)))
   6692 
   6693 #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
   6694   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   6695                                         (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
   6696                                         (__v8df)_mm512_setzero_pd()))
   6697 
   6698 #define _mm512_shuffle_i32x4(A, B, imm) \
   6699   ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
   6700                                       (__v16si)(__m512i)(B), (int)(imm)))
   6701 
   6702 #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
   6703   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   6704                                        (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
   6705                                        (__v16si)(__m512i)(W)))
   6706 
   6707 #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
   6708   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   6709                                        (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
   6710                                        (__v16si)_mm512_setzero_si512()))
   6711 
   6712 #define _mm512_shuffle_i64x2(A, B, imm) \
   6713   ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
   6714                                       (__v8di)(__m512i)(B), (int)(imm)))
   6715 
   6716 #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
   6717   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   6718                                        (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
   6719                                        (__v8di)(__m512i)(W)))
   6720 
   6721 #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
   6722   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
   6723                                        (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
   6724                                        (__v8di)_mm512_setzero_si512()))
   6725 
   6726 #define _mm512_shuffle_pd(A, B, M) \
   6727   ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
   6728                                      (__v8df)(__m512d)(B), (int)(M)))
   6729 
   6730 #define _mm512_mask_shuffle_pd(W, U, A, B, M) \
   6731   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   6732                                         (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
   6733                                         (__v8df)(__m512d)(W)))
   6734 
   6735 #define _mm512_maskz_shuffle_pd(U, A, B, M) \
   6736   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
   6737                                         (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
   6738                                         (__v8df)_mm512_setzero_pd()))
   6739 
   6740 #define _mm512_shuffle_ps(A, B, M) \
   6741   ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
   6742                                     (__v16sf)(__m512)(B), (int)(M)))
   6743 
   6744 #define _mm512_mask_shuffle_ps(W, U, A, B, M) \
   6745   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   6746                                        (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
   6747                                        (__v16sf)(__m512)(W)))
   6748 
   6749 #define _mm512_maskz_shuffle_ps(U, A, B, M) \
   6750   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
   6751                                        (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
   6752                                        (__v16sf)_mm512_setzero_ps()))
   6753 
   6754 #define _mm_sqrt_round_sd(A, B, R) \
   6755   ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
   6756                                              (__v2df)(__m128d)(B), \
   6757                                              (__v2df)_mm_setzero_pd(), \
   6758                                              (__mmask8)-1, (int)(R)))
   6759 
   6760 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   6761 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   6762 {
   6763  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
   6764                  (__v2df) __B,
   6765                 (__v2df) __W,
   6766                 (__mmask8) __U,
   6767                 _MM_FROUND_CUR_DIRECTION);
   6768 }
   6769 
   6770 #define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
   6771   ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
   6772                                              (__v2df)(__m128d)(B), \
   6773                                              (__v2df)(__m128d)(W), \
   6774                                              (__mmask8)(U), (int)(R)))
   6775 
   6776 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   6777 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
   6778 {
   6779  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
   6780                  (__v2df) __B,
   6781                 (__v2df) _mm_setzero_pd (),
   6782                 (__mmask8) __U,
   6783                 _MM_FROUND_CUR_DIRECTION);
   6784 }
   6785 
   6786 #define _mm_maskz_sqrt_round_sd(U, A, B, R) \
   6787   ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
   6788                                              (__v2df)(__m128d)(B), \
   6789                                              (__v2df)_mm_setzero_pd(), \
   6790                                              (__mmask8)(U), (int)(R)))
   6791 
   6792 #define _mm_sqrt_round_ss(A, B, R) \
   6793   ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
   6794                                             (__v4sf)(__m128)(B), \
   6795                                             (__v4sf)_mm_setzero_ps(), \
   6796                                             (__mmask8)-1, (int)(R)))
   6797 
   6798 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   6799 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   6800 {
   6801  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
   6802                  (__v4sf) __B,
   6803                 (__v4sf) __W,
   6804                 (__mmask8) __U,
   6805                 _MM_FROUND_CUR_DIRECTION);
   6806 }
   6807 
   6808 #define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
   6809   ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
   6810                                             (__v4sf)(__m128)(B), \
   6811                                             (__v4sf)(__m128)(W), (__mmask8)(U), \
   6812                                             (int)(R)))
   6813 
   6814 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   6815 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
   6816 {
   6817  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
   6818                  (__v4sf) __B,
   6819                 (__v4sf) _mm_setzero_ps (),
   6820                 (__mmask8) __U,
   6821                 _MM_FROUND_CUR_DIRECTION);
   6822 }
   6823 
   6824 #define _mm_maskz_sqrt_round_ss(U, A, B, R) \
   6825   ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
   6826                                             (__v4sf)(__m128)(B), \
   6827                                             (__v4sf)_mm_setzero_ps(), \
   6828                                             (__mmask8)(U), (int)(R)))
   6829 
   6830 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6831 _mm512_broadcast_f32x4(__m128 __A)
   6832 {
   6833   return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
   6834                                          0, 1, 2, 3, 0, 1, 2, 3,
   6835                                          0, 1, 2, 3, 0, 1, 2, 3);
   6836 }
   6837 
   6838 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6839 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
   6840 {
   6841   return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
   6842                                            (__v16sf)_mm512_broadcast_f32x4(__A),
   6843                                            (__v16sf)__O);
   6844 }
   6845 
   6846 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6847 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
   6848 {
   6849   return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
   6850                                            (__v16sf)_mm512_broadcast_f32x4(__A),
   6851                                            (__v16sf)_mm512_setzero_ps());
   6852 }
   6853 
   6854 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6855 _mm512_broadcast_f64x4(__m256d __A)
   6856 {
   6857   return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
   6858                                           0, 1, 2, 3, 0, 1, 2, 3);
   6859 }
   6860 
   6861 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6862 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
   6863 {
   6864   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
   6865                                             (__v8df)_mm512_broadcast_f64x4(__A),
   6866                                             (__v8df)__O);
   6867 }
   6868 
   6869 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6870 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
   6871 {
   6872   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
   6873                                             (__v8df)_mm512_broadcast_f64x4(__A),
   6874                                             (__v8df)_mm512_setzero_pd());
   6875 }
   6876 
   6877 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6878 _mm512_broadcast_i32x4(__m128i __A)
   6879 {
   6880   return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
   6881                                           0, 1, 2, 3, 0, 1, 2, 3,
   6882                                           0, 1, 2, 3, 0, 1, 2, 3);
   6883 }
   6884 
   6885 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6886 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
   6887 {
   6888   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   6889                                            (__v16si)_mm512_broadcast_i32x4(__A),
   6890                                            (__v16si)__O);
   6891 }
   6892 
   6893 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6894 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
   6895 {
   6896   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   6897                                            (__v16si)_mm512_broadcast_i32x4(__A),
   6898                                            (__v16si)_mm512_setzero_si512());
   6899 }
   6900 
   6901 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6902 _mm512_broadcast_i64x4(__m256i __A)
   6903 {
   6904   return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
   6905                                           0, 1, 2, 3, 0, 1, 2, 3);
   6906 }
   6907 
   6908 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6909 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
   6910 {
   6911   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   6912                                             (__v8di)_mm512_broadcast_i64x4(__A),
   6913                                             (__v8di)__O);
   6914 }
   6915 
   6916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   6917 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
   6918 {
   6919   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   6920                                             (__v8di)_mm512_broadcast_i64x4(__A),
   6921                                             (__v8di)_mm512_setzero_si512());
   6922 }
   6923 
   6924 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6925 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
   6926 {
   6927   return (__m512d)__builtin_ia32_selectpd_512(__M,
   6928                                               (__v8df) _mm512_broadcastsd_pd(__A),
   6929                                               (__v8df) __O);
   6930 }
   6931 
   6932 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   6933 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
   6934 {
   6935   return (__m512d)__builtin_ia32_selectpd_512(__M,
   6936                                               (__v8df) _mm512_broadcastsd_pd(__A),
   6937                                               (__v8df) _mm512_setzero_pd());
   6938 }
   6939 
   6940 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6941 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
   6942 {
   6943   return (__m512)__builtin_ia32_selectps_512(__M,
   6944                                              (__v16sf) _mm512_broadcastss_ps(__A),
   6945                                              (__v16sf) __O);
   6946 }
   6947 
   6948 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   6949 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
   6950 {
   6951   return (__m512)__builtin_ia32_selectps_512(__M,
   6952                                              (__v16sf) _mm512_broadcastss_ps(__A),
   6953                                              (__v16sf) _mm512_setzero_ps());
   6954 }
   6955 
   6956 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   6957 _mm512_cvtsepi32_epi8 (__m512i __A)
   6958 {
   6959   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
   6960                (__v16qi) _mm_undefined_si128 (),
   6961                (__mmask16) -1);
   6962 }
   6963 
   6964 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   6965 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
   6966 {
   6967   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
   6968                (__v16qi) __O, __M);
   6969 }
   6970 
   6971 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   6972 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
   6973 {
   6974   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
   6975                (__v16qi) _mm_setzero_si128 (),
   6976                __M);
   6977 }
   6978 
   6979 static __inline__ void __DEFAULT_FN_ATTRS512
   6980 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
   6981 {
   6982   __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
   6983 }
   6984 
   6985 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   6986 _mm512_cvtsepi32_epi16 (__m512i __A)
   6987 {
   6988   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
   6989                (__v16hi) _mm256_undefined_si256 (),
   6990                (__mmask16) -1);
   6991 }
   6992 
   6993 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   6994 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
   6995 {
   6996   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
   6997                (__v16hi) __O, __M);
   6998 }
   6999 
   7000 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7001 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
   7002 {
   7003   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
   7004                (__v16hi) _mm256_setzero_si256 (),
   7005                __M);
   7006 }
   7007 
   7008 static __inline__ void __DEFAULT_FN_ATTRS512
   7009 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
   7010 {
   7011   __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
   7012 }
   7013 
   7014 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7015 _mm512_cvtsepi64_epi8 (__m512i __A)
   7016 {
   7017   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
   7018                (__v16qi) _mm_undefined_si128 (),
   7019                (__mmask8) -1);
   7020 }
   7021 
   7022 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7023 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
   7024 {
   7025   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
   7026                (__v16qi) __O, __M);
   7027 }
   7028 
   7029 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7030 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
   7031 {
   7032   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
   7033                (__v16qi) _mm_setzero_si128 (),
   7034                __M);
   7035 }
   7036 
   7037 static __inline__ void __DEFAULT_FN_ATTRS512
   7038 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
   7039 {
   7040   __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
   7041 }
   7042 
   7043 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7044 _mm512_cvtsepi64_epi32 (__m512i __A)
   7045 {
   7046   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
   7047                (__v8si) _mm256_undefined_si256 (),
   7048                (__mmask8) -1);
   7049 }
   7050 
   7051 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7052 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
   7053 {
   7054   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
   7055                (__v8si) __O, __M);
   7056 }
   7057 
   7058 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7059 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
   7060 {
   7061   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
   7062                (__v8si) _mm256_setzero_si256 (),
   7063                __M);
   7064 }
   7065 
   7066 static __inline__ void __DEFAULT_FN_ATTRS512
   7067 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
   7068 {
   7069   __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
   7070 }
   7071 
   7072 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7073 _mm512_cvtsepi64_epi16 (__m512i __A)
   7074 {
   7075   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
   7076                (__v8hi) _mm_undefined_si128 (),
   7077                (__mmask8) -1);
   7078 }
   7079 
   7080 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7081 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
   7082 {
   7083   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
   7084                (__v8hi) __O, __M);
   7085 }
   7086 
   7087 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7088 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
   7089 {
   7090   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
   7091                (__v8hi) _mm_setzero_si128 (),
   7092                __M);
   7093 }
   7094 
   7095 static __inline__ void __DEFAULT_FN_ATTRS512
   7096 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
   7097 {
   7098   __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
   7099 }
   7100 
   7101 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7102 _mm512_cvtusepi32_epi8 (__m512i __A)
   7103 {
   7104   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
   7105                 (__v16qi) _mm_undefined_si128 (),
   7106                 (__mmask16) -1);
   7107 }
   7108 
   7109 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7110 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
   7111 {
   7112   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
   7113                 (__v16qi) __O,
   7114                 __M);
   7115 }
   7116 
   7117 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7118 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
   7119 {
   7120   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
   7121                 (__v16qi) _mm_setzero_si128 (),
   7122                 __M);
   7123 }
   7124 
   7125 static __inline__ void __DEFAULT_FN_ATTRS512
   7126 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
   7127 {
   7128   __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
   7129 }
   7130 
   7131 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7132 _mm512_cvtusepi32_epi16 (__m512i __A)
   7133 {
   7134   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
   7135                 (__v16hi) _mm256_undefined_si256 (),
   7136                 (__mmask16) -1);
   7137 }
   7138 
   7139 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7140 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
   7141 {
   7142   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
   7143                 (__v16hi) __O,
   7144                 __M);
   7145 }
   7146 
   7147 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7148 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
   7149 {
   7150   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
   7151                 (__v16hi) _mm256_setzero_si256 (),
   7152                 __M);
   7153 }
   7154 
   7155 static __inline__ void __DEFAULT_FN_ATTRS512
   7156 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
   7157 {
   7158   __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
   7159 }
   7160 
   7161 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7162 _mm512_cvtusepi64_epi8 (__m512i __A)
   7163 {
   7164   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
   7165                 (__v16qi) _mm_undefined_si128 (),
   7166                 (__mmask8) -1);
   7167 }
   7168 
   7169 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7170 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
   7171 {
   7172   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
   7173                 (__v16qi) __O,
   7174                 __M);
   7175 }
   7176 
   7177 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7178 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
   7179 {
   7180   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
   7181                 (__v16qi) _mm_setzero_si128 (),
   7182                 __M);
   7183 }
   7184 
   7185 static __inline__ void __DEFAULT_FN_ATTRS512
   7186 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
   7187 {
   7188   __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
   7189 }
   7190 
   7191 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7192 _mm512_cvtusepi64_epi32 (__m512i __A)
   7193 {
   7194   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
   7195                 (__v8si) _mm256_undefined_si256 (),
   7196                 (__mmask8) -1);
   7197 }
   7198 
   7199 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7200 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
   7201 {
   7202   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
   7203                 (__v8si) __O, __M);
   7204 }
   7205 
   7206 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7207 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
   7208 {
   7209   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
   7210                 (__v8si) _mm256_setzero_si256 (),
   7211                 __M);
   7212 }
   7213 
   7214 static __inline__ void __DEFAULT_FN_ATTRS512
   7215 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
   7216 {
   7217   __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
   7218 }
   7219 
   7220 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7221 _mm512_cvtusepi64_epi16 (__m512i __A)
   7222 {
   7223   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
   7224                 (__v8hi) _mm_undefined_si128 (),
   7225                 (__mmask8) -1);
   7226 }
   7227 
   7228 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7229 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
   7230 {
   7231   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
   7232                 (__v8hi) __O, __M);
   7233 }
   7234 
   7235 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7236 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
   7237 {
   7238   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
   7239                 (__v8hi) _mm_setzero_si128 (),
   7240                 __M);
   7241 }
   7242 
   7243 static __inline__ void __DEFAULT_FN_ATTRS512
   7244 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
   7245 {
   7246   __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
   7247 }
   7248 
   7249 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7250 _mm512_cvtepi32_epi8 (__m512i __A)
   7251 {
   7252   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
   7253               (__v16qi) _mm_undefined_si128 (),
   7254               (__mmask16) -1);
   7255 }
   7256 
   7257 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7258 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
   7259 {
   7260   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
   7261               (__v16qi) __O, __M);
   7262 }
   7263 
   7264 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7265 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
   7266 {
   7267   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
   7268               (__v16qi) _mm_setzero_si128 (),
   7269               __M);
   7270 }
   7271 
   7272 static __inline__ void __DEFAULT_FN_ATTRS512
   7273 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
   7274 {
   7275   __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
   7276 }
   7277 
   7278 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7279 _mm512_cvtepi32_epi16 (__m512i __A)
   7280 {
   7281   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
   7282               (__v16hi) _mm256_undefined_si256 (),
   7283               (__mmask16) -1);
   7284 }
   7285 
   7286 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7287 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
   7288 {
   7289   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
   7290               (__v16hi) __O, __M);
   7291 }
   7292 
   7293 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7294 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
   7295 {
   7296   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
   7297               (__v16hi) _mm256_setzero_si256 (),
   7298               __M);
   7299 }
   7300 
   7301 static __inline__ void __DEFAULT_FN_ATTRS512
   7302 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
   7303 {
   7304   __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
   7305 }
   7306 
   7307 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7308 _mm512_cvtepi64_epi8 (__m512i __A)
   7309 {
   7310   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
   7311               (__v16qi) _mm_undefined_si128 (),
   7312               (__mmask8) -1);
   7313 }
   7314 
   7315 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7316 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
   7317 {
   7318   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
   7319               (__v16qi) __O, __M);
   7320 }
   7321 
   7322 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7323 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
   7324 {
   7325   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
   7326               (__v16qi) _mm_setzero_si128 (),
   7327               __M);
   7328 }
   7329 
   7330 static __inline__ void __DEFAULT_FN_ATTRS512
   7331 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
   7332 {
   7333   __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
   7334 }
   7335 
   7336 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7337 _mm512_cvtepi64_epi32 (__m512i __A)
   7338 {
   7339   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
   7340               (__v8si) _mm256_undefined_si256 (),
   7341               (__mmask8) -1);
   7342 }
   7343 
   7344 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7345 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
   7346 {
   7347   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
   7348               (__v8si) __O, __M);
   7349 }
   7350 
   7351 static __inline__ __m256i __DEFAULT_FN_ATTRS512
   7352 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
   7353 {
   7354   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
   7355               (__v8si) _mm256_setzero_si256 (),
   7356               __M);
   7357 }
   7358 
   7359 static __inline__ void __DEFAULT_FN_ATTRS512
   7360 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
   7361 {
   7362   __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
   7363 }
   7364 
   7365 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7366 _mm512_cvtepi64_epi16 (__m512i __A)
   7367 {
   7368   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
   7369               (__v8hi) _mm_undefined_si128 (),
   7370               (__mmask8) -1);
   7371 }
   7372 
   7373 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7374 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
   7375 {
   7376   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
   7377               (__v8hi) __O, __M);
   7378 }
   7379 
   7380 static __inline__ __m128i __DEFAULT_FN_ATTRS512
   7381 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
   7382 {
   7383   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
   7384               (__v8hi) _mm_setzero_si128 (),
   7385               __M);
   7386 }
   7387 
   7388 static __inline__ void __DEFAULT_FN_ATTRS512
   7389 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
   7390 {
   7391   __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
   7392 }
   7393 
/* 128-bit integer extract macros over __builtin_ia32_extracti32x4_mask.
 * A is viewed as __v16si and imm is forwarded as an int; the result is cast
 * to __m128i.  The three forms differ only in the last two builtin operands:
 *   plain : undefined 128-bit vector, all-ones __mmask8
 *   mask  : W (as __v4si), mask U
 *   maskz : all-zero 128-bit vector, mask U
 * These are macros rather than functions so that imm reaches the builtin
 * unchanged.  NOTE(review): imm is presumably required to be a compile-time
 * constant -- confirm. */
#define _mm512_extracti32x4_epi32(A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_undefined_si128(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)(__m128i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_setzero_si128(), \
                                             (__mmask8)(U)))
   7408 
/* 256-bit integer extract macros over __builtin_ia32_extracti64x4_mask.
 * A is viewed as __v8di and imm is forwarded as an int; the result is cast
 * to __m256i.  The three forms differ only in the last two builtin operands:
 *   plain : undefined 256-bit vector, all-ones __mmask8
 *   mask  : W (as __v4di), mask U
 *   maskz : all-zero 256-bit vector, mask U */
#define _mm512_extracti64x4_epi64(A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_undefined_si256(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)(__m256i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_setzero_si256(), \
                                             (__mmask8)(U)))
   7423 
/* Insert a 256-bit double vector B into the 512-bit double vector A.
 *   _mm512_insertf64x4       : calls __builtin_ia32_insertf64x4 with A (as
 *                              __v8df), B (as __v4df) and imm as an int.
 *   _mm512_mask_insertf64x4  : feeds mask U, the unmasked insert result and
 *                              W to __builtin_ia32_selectpd_512.
 *   _mm512_maskz_insertf64x4 : same, with an all-zero vector in place of W.
 * All three yield __m512d. */
#define _mm512_insertf64x4(A, B, imm) \
  ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
                                       (__v4df)(__m256d)(B), (int)(imm)))

#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                   (__v8df)(__m512d)(W)))

#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                   (__v8df)_mm512_setzero_pd()))
   7437 
/* Insert a 256-bit integer vector B into the 512-bit integer vector A.
 *   _mm512_inserti64x4       : calls __builtin_ia32_inserti64x4 with A (as
 *                              __v8di), B (as __v4di) and imm as an int.
 *   _mm512_mask_inserti64x4  : feeds mask U, the unmasked insert result and
 *                              W to __builtin_ia32_selectq_512.
 *   _mm512_maskz_inserti64x4 : same, with an all-zero vector in place of W.
 * All three yield __m512i. */
#define _mm512_inserti64x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
                                       (__v4di)(__m256i)(B), (int)(imm)))

#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                   (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                   (__v8di)(__m512i)(W)))

#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                   (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                   (__v8di)_mm512_setzero_si512()))
   7451 
/* Insert a 128-bit float vector B into the 512-bit float vector A.
 *   _mm512_insertf32x4       : calls __builtin_ia32_insertf32x4 with A (as
 *                              __v16sf), B (as __v4sf) and imm as an int.
 *   _mm512_mask_insertf32x4  : feeds mask U (__mmask16), the unmasked insert
 *                              result and W to __builtin_ia32_selectps_512.
 *   _mm512_maskz_insertf32x4 : same, with an all-zero vector in place of W.
 * All three yield __m512. */
#define _mm512_insertf32x4(A, B, imm) \
  ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
                                      (__v4sf)(__m128)(B), (int)(imm)))

#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                  (__v16sf)(__m512)(W)))

#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                  (__v16sf)_mm512_setzero_ps()))
   7465 
/* Insert a 128-bit integer vector B into the 512-bit integer vector A.
 *   _mm512_inserti32x4       : calls __builtin_ia32_inserti32x4 with A (as
 *                              __v16si), B (as __v4si) and imm as an int.
 *   _mm512_mask_inserti32x4  : feeds mask U (__mmask16), the unmasked insert
 *                              result and W to __builtin_ia32_selectd_512.
 *   _mm512_maskz_inserti32x4 : same, with an all-zero vector in place of W.
 * All three yield __m512i. */
#define _mm512_inserti32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
                                       (__v4si)(__m128i)(B), (int)(imm)))

#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                  (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                  (__v16si)(__m512i)(W)))

#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                  (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                  (__v16si)_mm512_setzero_si512()))
   7479 
/* getmant on packed doubles with an explicit rounding argument R, via
 * __builtin_ia32_getmantpd512_mask.  B and C are packed into one int
 * operand as ((C) << 2) | (B); R is forwarded as an int.  The forms differ
 * in the third and fourth builtin operands:
 *   plain : undefined vector, all-ones __mmask8
 *   mask  : W, mask U
 *   maskz : all-zero vector, mask U */
#define _mm512_getmant_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_undefined_pd(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))
   7497 
   7498 #define _mm512_getmant_pd(A, B, C) \
   7499   ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
   7500                                              (int)(((C)<<2) | (B)), \
   7501                                              (__v8df)_mm512_setzero_pd(), \
   7502                                              (__mmask8)-1, \
   7503                                              _MM_FROUND_CUR_DIRECTION))
   7504 
/* Masked getmant on packed doubles using _MM_FROUND_CUR_DIRECTION as the
 * rounding operand.  B and C are packed as ((C) << 2) | (B).
 *   mask  : W as the third builtin operand, U as the mask
 *   maskz : all-zero vector as the third builtin operand, U as the mask */
#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))
   7518 
/* getmant on packed floats with an explicit rounding argument R, via
 * __builtin_ia32_getmantps512_mask.  B and C are packed into one int
 * operand as ((C) << 2) | (B); R is forwarded as an int.  The forms differ
 * in the third and fourth builtin operands:
 *   plain : undefined vector, all-ones __mmask16
 *   mask  : W, mask U
 *   maskz : all-zero vector, mask U */
#define _mm512_getmant_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))
   7536 
/* getmant on packed floats using _MM_FROUND_CUR_DIRECTION as the rounding
 * operand.  B and C are packed into one int operand as ((C) << 2) | (B).
 *   plain : undefined vector, all-ones __mmask16
 *   mask  : W, mask U
 *   maskz : all-zero vector, mask U */
#define _mm512_getmant_ps(A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))
   7557 
/* getexp on packed doubles with an explicit rounding argument R, via
 * __builtin_ia32_getexppd512_mask.  R is forwarded as an int.  The forms
 * differ in the second and third builtin operands:
 *   plain : undefined vector, all-ones __mmask8
 *   mask  : W, mask U
 *   maskz : all-zero vector, mask U */
#define _mm512_getexp_round_pd(A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getexp_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
   7572 
   7573 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   7574 _mm512_getexp_pd (__m512d __A)
   7575 {
   7576   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
   7577                 (__v8df) _mm512_undefined_pd (),
   7578                 (__mmask8) -1,
   7579                 _MM_FROUND_CUR_DIRECTION);
   7580 }
   7581 
   7582 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   7583 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
   7584 {
   7585   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
   7586                 (__v8df) __W,
   7587                 (__mmask8) __U,
   7588                 _MM_FROUND_CUR_DIRECTION);
   7589 }
   7590 
   7591 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   7592 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
   7593 {
   7594   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
   7595                 (__v8df) _mm512_setzero_pd (),
   7596                 (__mmask8) __U,
   7597                 _MM_FROUND_CUR_DIRECTION);
   7598 }
   7599 
/* getexp on packed floats with an explicit rounding argument R, via
 * __builtin_ia32_getexpps512_mask.  R is forwarded as an int.  The forms
 * differ in the second and third builtin operands:
 *   plain : undefined vector, all-ones __mmask16
 *   mask  : W, mask U
 *   maskz : all-zero vector, mask U */
#define _mm512_getexp_round_ps(A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))
   7614 
   7615 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   7616 _mm512_getexp_ps (__m512 __A)
   7617 {
   7618   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
   7619                (__v16sf) _mm512_undefined_ps (),
   7620                (__mmask16) -1,
   7621                _MM_FROUND_CUR_DIRECTION);
   7622 }
   7623 
   7624 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   7625 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
   7626 {
   7627   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
   7628                (__v16sf) __W,
   7629                (__mmask16) __U,
   7630                _MM_FROUND_CUR_DIRECTION);
   7631 }
   7632 
   7633 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   7634 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
   7635 {
   7636   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
   7637                (__v16sf) _mm512_setzero_ps (),
   7638                (__mmask16) __U,
   7639                _MM_FROUND_CUR_DIRECTION);
   7640 }
   7641 
/* Gather macros indexed by a 512-bit vector viewed as __v8di (eight 64-bit
 * indices).  Each forwards, in order: a first vector operand, addr as
 * void const *, the index vector, an __mmask8 and scale as an int.
 *   unmasked forms : first operand is an undefined vector, mask is all ones
 *   mask forms     : first operand is v1_old, mask is the caller's mask
 * Result types: __m256 (ps), __m256i (epi32), __m512d (pd), __m512i (epi64).
 * NOTE(review): v1_old presumably supplies lanes whose mask bit is clear --
 * confirm against the Intel intrinsics guide. */
#define _mm512_i64gather_ps(index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi32(index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))
   7689 
   7690 #define _mm512_i32gather_ps(index, addr, scale) \
   7691   ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
   7692                                         (void const *)(addr), \
   7693                                         (__v16si)(__m512)(index), \
   7694                                         (__mmask16)-1, (int)(scale)))
   7695 
   7696 #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
   7697   ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
   7698                                         (void const *)(addr), \
   7699                                         (__v16si)(__m512)(index), \
   7700                                         (__mmask16)(mask), (int)(scale)))
   7701 
/* Gather macros indexed by 32-bit indices.  Each forwards, in order: a
 * first vector operand, addr as void const *, the index vector, a mask and
 * scale as an int.
 *   epi32      : index is __m512i viewed as __v16si, mask is __mmask16
 *   pd / epi64 : index is __m256i viewed as __v8si, mask is __mmask8
 *   unmasked forms : first operand is an undefined vector, mask is all ones
 *   mask forms     : first operand is v1_old, mask is the caller's mask
 * Result types: __m512i (epi32, epi64), __m512d (pd). */
#define _mm512_i32gather_epi32(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)(mask), (int)(scale)))

#define _mm512_i32gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i32gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))
   7737 
/* Scatter macros indexed by a 512-bit vector viewed as __v8di (eight 64-bit
 * indices).  Each forwards, in order: addr as void *, an __mmask8, the index
 * vector, the data vector v1 and scale as an int.  The unmasked forms pass
 * an all-ones mask; the mask forms pass the caller's mask.
 * Data types: __m256 (ps), __m256i (epi32), __m512d (pd), __m512i (epi64).
 * The expansions are bare builtin calls with no value cast applied. */
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))
   7777 
/* Scatter macros indexed by 32-bit indices.  Each forwards, in order: addr
 * as void *, a mask, the index vector, the data vector v1 and scale as an
 * int.  The unmasked forms pass an all-ones mask; the mask forms pass the
 * caller's mask.
 *   ps / epi32 : index is __m512i viewed as __v16si, mask is __mmask16
 *   pd / epi64 : index is __m256i viewed as __v8si, mask is __mmask8
 * Data types: __m512 (ps), __m512i (epi32, epi64), __m512d (pd). */
#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))
   7817 
   7818 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7819 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   7820 {
   7821   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
   7822                                        (__v4sf)__A,
   7823                                        (__v4sf)__B,
   7824                                        (__mmask8)__U,
   7825                                        _MM_FROUND_CUR_DIRECTION);
   7826 }
   7827 
/* Scalar-float fused multiply-add with an explicit rounding argument R, via
 * __builtin_ia32_vfmaddss3_mask.  R is forwarded as an int.
 *   _mm_fmadd_round_ss      : operands A, B, C with an all-ones __mmask8
 *   _mm_mask_fmadd_round_ss : operands W, A, B with mask U */
#define _mm_fmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))
   7839 
   7840 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7841 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
   7842 {
   7843   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
   7844                                         (__v4sf)__B,
   7845                                         (__v4sf)__C,
   7846                                         (__mmask8)__U,
   7847                                         _MM_FROUND_CUR_DIRECTION);
   7848 }
   7849 
/* Zero-masking scalar-float fused multiply-add with an explicit rounding
 * argument: forwards A, B, C, the mask U and R (as an int) to
 * __builtin_ia32_vfmaddss3_maskz. */
#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))
   7855 
   7856 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7857 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
   7858 {
   7859   return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
   7860                                         (__v4sf)__X,
   7861                                         (__v4sf)__Y,
   7862                                         (__mmask8)__U,
   7863                                         _MM_FROUND_CUR_DIRECTION);
   7864 }
   7865 
/* mask3 scalar-float fused multiply-add with an explicit rounding argument:
 * forwards W, X, Y, the mask U and R (as an int) to
 * __builtin_ia32_vfmaddss3_mask3. */
#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))
   7871 
   7872 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7873 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   7874 {
   7875   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
   7876                                        (__v4sf)__A,
   7877                                        -(__v4sf)__B,
   7878                                        (__mmask8)__U,
   7879                                        _MM_FROUND_CUR_DIRECTION);
   7880 }
   7881 
/* Scalar-float fused multiply-subtract with an explicit rounding argument R,
 * built on __builtin_ia32_vfmaddss3_mask by negating the third operand.
 *   _mm_fmsub_round_ss      : operands A, B, -C with an all-ones __mmask8
 *   _mm_mask_fmsub_round_ss : operands W, A, -B with mask U */
#define _mm_fmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))
   7893 
   7894 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7895 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
   7896 {
   7897   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
   7898                                         (__v4sf)__B,
   7899                                         -(__v4sf)__C,
   7900                                         (__mmask8)__U,
   7901                                         _MM_FROUND_CUR_DIRECTION);
   7902 }
   7903 
/* Zero-masking scalar-float fused multiply-subtract with an explicit
 * rounding argument: forwards A, B, the negated C, the mask U and R (as an
 * int) to __builtin_ia32_vfmaddss3_maskz. */
#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))
   7909 
   7910 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7911 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
   7912 {
   7913   return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
   7914                                         (__v4sf)__X,
   7915                                         (__v4sf)__Y,
   7916                                         (__mmask8)__U,
   7917                                         _MM_FROUND_CUR_DIRECTION);
   7918 }
   7919 
   /* _mask3 form of scalar float multiply-subtract with explicit rounding
      argument R: operands (W, X, Y) go to the vfmsub mask3 builtin. */
   #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
     ((__m128)__builtin_ia32_vfmsubss3_mask3( \
         (__v4sf)(__m128)(W), \
         (__v4sf)(__m128)(X), \
         (__v4sf)(__m128)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   7925 
   7926 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7927 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   7928 {
   7929   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
   7930                                        -(__v4sf)__A,
   7931                                        (__v4sf)__B,
   7932                                        (__mmask8)__U,
   7933                                        _MM_FROUND_CUR_DIRECTION);
   7934 }
   7935 
   /* Unmasked (all-ones mask) scalar float negated multiply-add with explicit
      rounding argument R: operands (A, -B, C). */
   #define _mm_fnmadd_round_ss(A, B, C, R) \
     ((__m128)__builtin_ia32_vfmaddss3_mask( \
         (__v4sf)(__m128)(A), \
         -(__v4sf)(__m128)(B), \
         (__v4sf)(__m128)(C), \
         (__mmask8)-1, \
         (int)(R)))
   7941 
   /* _mask form of scalar float negated multiply-add with explicit rounding
      argument R: operands (W, -A, B) under mask U. */
   #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
     ((__m128)__builtin_ia32_vfmaddss3_mask( \
         (__v4sf)(__m128)(W), \
         -(__v4sf)(__m128)(A), \
         (__v4sf)(__m128)(B), \
         (__mmask8)(U), \
         (int)(R)))
   7947 
   7948 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7949 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
   7950 {
   7951   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
   7952                                         -(__v4sf)__B,
   7953                                         (__v4sf)__C,
   7954                                         (__mmask8)__U,
   7955                                         _MM_FROUND_CUR_DIRECTION);
   7956 }
   7957 
   /* _maskz form of scalar float negated multiply-add with explicit rounding
      argument R: operands (A, -B, C) under mask U. */
   #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
     ((__m128)__builtin_ia32_vfmaddss3_maskz( \
         (__v4sf)(__m128)(A), \
         -(__v4sf)(__m128)(B), \
         (__v4sf)(__m128)(C), \
         (__mmask8)(U), \
         (int)(R)))
   7963 
   7964 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7965 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
   7966 {
   7967   return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
   7968                                         -(__v4sf)__X,
   7969                                         (__v4sf)__Y,
   7970                                         (__mmask8)__U,
   7971                                         _MM_FROUND_CUR_DIRECTION);
   7972 }
   7973 
   /* _mask3 form of scalar float negated multiply-add with explicit rounding
      argument R: operands (W, -X, Y) under mask U. */
   #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
     ((__m128)__builtin_ia32_vfmaddss3_mask3( \
         (__v4sf)(__m128)(W), \
         -(__v4sf)(__m128)(X), \
         (__v4sf)(__m128)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   7979 
   7980 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   7981 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   7982 {
   7983   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
   7984                                        -(__v4sf)__A,
   7985                                        -(__v4sf)__B,
   7986                                        (__mmask8)__U,
   7987                                        _MM_FROUND_CUR_DIRECTION);
   7988 }
   7989 
   /* Unmasked (all-ones mask) scalar float negated multiply-subtract with
      explicit rounding argument R: operands (A, -B, -C). */
   #define _mm_fnmsub_round_ss(A, B, C, R) \
     ((__m128)__builtin_ia32_vfmaddss3_mask( \
         (__v4sf)(__m128)(A), \
         -(__v4sf)(__m128)(B), \
         -(__v4sf)(__m128)(C), \
         (__mmask8)-1, \
         (int)(R)))
   7995 
   /* _mask form of scalar float negated multiply-subtract with explicit
      rounding argument R: operands (W, -A, -B) under mask U. */
   #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
     ((__m128)__builtin_ia32_vfmaddss3_mask( \
         (__v4sf)(__m128)(W), \
         -(__v4sf)(__m128)(A), \
         -(__v4sf)(__m128)(B), \
         (__mmask8)(U), \
         (int)(R)))
   8001 
   8002 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   8003 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
   8004 {
   8005   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
   8006                                         -(__v4sf)__B,
   8007                                         -(__v4sf)__C,
   8008                                         (__mmask8)__U,
   8009                                         _MM_FROUND_CUR_DIRECTION);
   8010 }
   8011 
   /* _maskz form of scalar float negated multiply-subtract with explicit
      rounding argument R: operands (A, -B, -C) under mask U. */
   #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
     ((__m128)__builtin_ia32_vfmaddss3_maskz( \
         (__v4sf)(__m128)(A), \
         -(__v4sf)(__m128)(B), \
         -(__v4sf)(__m128)(C), \
         (__mmask8)(U), \
         (int)(R)))
   8017 
   8018 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   8019 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
   8020 {
   8021   return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
   8022                                         -(__v4sf)__X,
   8023                                         (__v4sf)__Y,
   8024                                         (__mmask8)__U,
   8025                                         _MM_FROUND_CUR_DIRECTION);
   8026 }
   8027 
   /* _mask3 form of scalar float negated multiply-subtract with explicit
      rounding argument R: operands (W, -X, Y) go to the vfmsub mask3 builtin. */
   #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
     ((__m128)__builtin_ia32_vfmsubss3_mask3( \
         (__v4sf)(__m128)(W), \
         -(__v4sf)(__m128)(X), \
         (__v4sf)(__m128)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   8033 
   8034 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8035 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   8036 {
   8037   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
   8038                                        (__v2df)__A,
   8039                                        (__v2df)__B,
   8040                                        (__mmask8)__U,
   8041                                        _MM_FROUND_CUR_DIRECTION);
   8042 }
   8043 
   /* Unmasked (all-ones mask) scalar double multiply-add with explicit
      rounding argument R: operands (A, B, C). */
   #define _mm_fmadd_round_sd(A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(A), \
         (__v2df)(__m128d)(B), \
         (__v2df)(__m128d)(C), \
         (__mmask8)-1, \
         (int)(R)))
   8049 
   /* _mask form of scalar double multiply-add with explicit rounding
      argument R: operands (W, A, B) under mask U. */
   #define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(W), \
         (__v2df)(__m128d)(A), \
         (__v2df)(__m128d)(B), \
         (__mmask8)(U), \
         (int)(R)))
   8055 
   8056 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8057 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
   8058 {
   8059   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
   8060                                         (__v2df)__B,
   8061                                         (__v2df)__C,
   8062                                         (__mmask8)__U,
   8063                                         _MM_FROUND_CUR_DIRECTION);
   8064 }
   8065 
   /* _maskz form of scalar double multiply-add with explicit rounding
      argument R: operands (A, B, C) under mask U. */
   #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
         (__v2df)(__m128d)(A), \
         (__v2df)(__m128d)(B), \
         (__v2df)(__m128d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   8071 
   8072 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8073 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
   8074 {
   8075   return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
   8076                                         (__v2df)__X,
   8077                                         (__v2df)__Y,
   8078                                         (__mmask8)__U,
   8079                                         _MM_FROUND_CUR_DIRECTION);
   8080 }
   8081 
   /* _mask3 form of scalar double multiply-add with explicit rounding
      argument R: operands (W, X, Y) under mask U. */
   #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask3( \
         (__v2df)(__m128d)(W), \
         (__v2df)(__m128d)(X), \
         (__v2df)(__m128d)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   8087 
   8088 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8089 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   8090 {
   8091   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
   8092                                        (__v2df)__A,
   8093                                        -(__v2df)__B,
   8094                                        (__mmask8)__U,
   8095                                        _MM_FROUND_CUR_DIRECTION);
   8096 }
   8097 
   /* Unmasked (all-ones mask) scalar double multiply-subtract with explicit
      rounding argument R: operands (A, B, -C). */
   #define _mm_fmsub_round_sd(A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(A), \
         (__v2df)(__m128d)(B), \
         -(__v2df)(__m128d)(C), \
         (__mmask8)-1, \
         (int)(R)))
   8103 
   /* _mask form of scalar double multiply-subtract with explicit rounding
      argument R: operands (W, A, -B) under mask U. */
   #define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(W), \
         (__v2df)(__m128d)(A), \
         -(__v2df)(__m128d)(B), \
         (__mmask8)(U), \
         (int)(R)))
   8109 
   8110 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8111 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
   8112 {
   8113   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
   8114                                         (__v2df)__B,
   8115                                         -(__v2df)__C,
   8116                                         (__mmask8)__U,
   8117                                         _MM_FROUND_CUR_DIRECTION);
   8118 }
   8119 
   /* _maskz form of scalar double multiply-subtract with explicit rounding
      argument R: operands (A, B, -C) under mask U. */
   #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
         (__v2df)(__m128d)(A), \
         (__v2df)(__m128d)(B), \
         -(__v2df)(__m128d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   8125 
   8126 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8127 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
   8128 {
   8129   return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
   8130                                         (__v2df)__X,
   8131                                         (__v2df)__Y,
   8132                                         (__mmask8)__U,
   8133                                         _MM_FROUND_CUR_DIRECTION);
   8134 }
   8135 
   /* _mask3 form of scalar double multiply-subtract with explicit rounding
      argument R: operands (W, X, Y) go to the vfmsub mask3 builtin. */
   #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
     ((__m128d)__builtin_ia32_vfmsubsd3_mask3( \
         (__v2df)(__m128d)(W), \
         (__v2df)(__m128d)(X), \
         (__v2df)(__m128d)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   8141 
   8142 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8143 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   8144 {
   8145   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
   8146                                        -(__v2df)__A,
   8147                                        (__v2df)__B,
   8148                                        (__mmask8)__U,
   8149                                        _MM_FROUND_CUR_DIRECTION);
   8150 }
   8151 
   /* Unmasked (all-ones mask) scalar double negated multiply-add with explicit
      rounding argument R: operands (A, -B, C). */
   #define _mm_fnmadd_round_sd(A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(A), \
         -(__v2df)(__m128d)(B), \
         (__v2df)(__m128d)(C), \
         (__mmask8)-1, \
         (int)(R)))
   8157 
   /* _mask form of scalar double negated multiply-add with explicit rounding
      argument R: operands (W, -A, B) under mask U. */
   #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(W), \
         -(__v2df)(__m128d)(A), \
         (__v2df)(__m128d)(B), \
         (__mmask8)(U), \
         (int)(R)))
   8163 
   8164 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8165 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
   8166 {
   8167   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
   8168                                         -(__v2df)__B,
   8169                                         (__v2df)__C,
   8170                                         (__mmask8)__U,
   8171                                         _MM_FROUND_CUR_DIRECTION);
   8172 }
   8173 
   /* _maskz form of scalar double negated multiply-add with explicit rounding
      argument R: operands (A, -B, C) under mask U. */
   #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
         (__v2df)(__m128d)(A), \
         -(__v2df)(__m128d)(B), \
         (__v2df)(__m128d)(C), \
         (__mmask8)(U), \
         (int)(R)))
   8179 
   8180 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8181 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
   8182 {
   8183   return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
   8184                                         -(__v2df)__X,
   8185                                         (__v2df)__Y,
   8186                                         (__mmask8)__U,
   8187                                         _MM_FROUND_CUR_DIRECTION);
   8188 }
   8189 
   /* _mask3 form of scalar double negated multiply-add with explicit rounding
      argument R: operands (W, -X, Y) under mask U. */
   #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask3( \
         (__v2df)(__m128d)(W), \
         -(__v2df)(__m128d)(X), \
         (__v2df)(__m128d)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   8195 
   8196 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8197 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   8198 {
   8199   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
   8200                                        -(__v2df)__A,
   8201                                        -(__v2df)__B,
   8202                                        (__mmask8)__U,
   8203                                        _MM_FROUND_CUR_DIRECTION);
   8204 }
   8205 
   /* Unmasked (all-ones mask) scalar double negated multiply-subtract with
      explicit rounding argument R: operands (A, -B, -C). */
   #define _mm_fnmsub_round_sd(A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(A), \
         -(__v2df)(__m128d)(B), \
         -(__v2df)(__m128d)(C), \
         (__mmask8)-1, \
         (int)(R)))
   8211 
   /* _mask form of scalar double negated multiply-subtract with explicit
      rounding argument R: operands (W, -A, -B) under mask U. */
   #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
         (__v2df)(__m128d)(W), \
         -(__v2df)(__m128d)(A), \
         -(__v2df)(__m128d)(B), \
         (__mmask8)(U), \
         (int)(R)))
   8217 
   8218 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8219 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
   8220 {
   8221   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
   8222                                         -(__v2df)__B,
   8223                                         -(__v2df)__C,
   8224                                         (__mmask8)__U,
   8225                                         _MM_FROUND_CUR_DIRECTION);
   8226 }
   8227 
   /* _maskz form of scalar double negated multiply-subtract with explicit
      rounding argument R: operands (A, -B, -C) under mask U. */
   #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
     ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
         (__v2df)(__m128d)(A), \
         -(__v2df)(__m128d)(B), \
         -(__v2df)(__m128d)(C), \
         (__mmask8)(U), (int)(R)))
   8234 
   8235 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8236 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
   8237 {
   8238   return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
   8239                                         -(__v2df)__X,
   8240                                         (__v2df)__Y,
   8241                                         (__mmask8)__U,
   8242                                         _MM_FROUND_CUR_DIRECTION);
   8243 }
   8244 
   /* _mask3 form of scalar double negated multiply-subtract with explicit
      rounding argument R: operands (W, -X, Y) go to the vfmsub mask3 builtin. */
   #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
     ((__m128d)__builtin_ia32_vfmsubsd3_mask3( \
         (__v2df)(__m128d)(W), \
         -(__v2df)(__m128d)(X), \
         (__v2df)(__m128d)(Y), \
         (__mmask8)(U), \
         (int)(R)))
   8250 
   /* Permutes the 8 doubles of X with the immediate control C via the
      permdf512 builtin. */
   #define _mm512_permutex_pd(X, C) \
     ((__m512d)__builtin_ia32_permdf512( \
         (__v8df)(__m512d)(X), \
         (int)(C)))
   8253 
   /* Mask-selects between _mm512_permutex_pd(X, C) and W using mask U. */
   #define _mm512_mask_permutex_pd(W, U, X, C) \
     ((__m512d)__builtin_ia32_selectpd_512( \
         (__mmask8)(U), \
         (__v8df)_mm512_permutex_pd((X), (C)), \
         (__v8df)(__m512d)(W)))
   8258 
   /* Mask-selects between _mm512_permutex_pd(X, C) and an all-zero vector
      using mask U. */
   #define _mm512_maskz_permutex_pd(U, X, C) \
     ((__m512d)__builtin_ia32_selectpd_512( \
         (__mmask8)(U), \
         (__v8df)_mm512_permutex_pd((X), (C)), \
         (__v8df)_mm512_setzero_pd()))
   8263 
   /* Permutes the 8 64-bit integers of X with the immediate control C via the
      permdi512 builtin. */
   #define _mm512_permutex_epi64(X, C) \
     ((__m512i)__builtin_ia32_permdi512( \
         (__v8di)(__m512i)(X), \
         (int)(C)))
   8266 
   /* Mask-selects between _mm512_permutex_epi64(X, C) and W using mask U. */
   #define _mm512_mask_permutex_epi64(W, U, X, C) \
     ((__m512i)__builtin_ia32_selectq_512( \
         (__mmask8)(U), \
         (__v8di)_mm512_permutex_epi64((X), (C)), \
         (__v8di)(__m512i)(W)))
   8271 
   /* Mask-selects between _mm512_permutex_epi64(X, C) and an all-zero vector
      using mask U. */
   #define _mm512_maskz_permutex_epi64(U, X, C) \
     ((__m512i)__builtin_ia32_selectq_512( \
         (__mmask8)(U), \
         (__v8di)_mm512_permutex_epi64((X), (C)), \
         (__v8di)_mm512_setzero_si512()))
   8276 
   8277 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8278 _mm512_permutexvar_pd (__m512i __X, __m512d __Y)
   8279 {
   8280   return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
   8281 }
   8282 
   8283 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8284 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
   8285 {
   8286   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   8287                                         (__v8df)_mm512_permutexvar_pd(__X, __Y),
   8288                                         (__v8df)__W);
   8289 }
   8290 
   8291 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8292 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
   8293 {
   8294   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   8295                                         (__v8df)_mm512_permutexvar_pd(__X, __Y),
   8296                                         (__v8df)_mm512_setzero_pd());
   8297 }
   8298 
   8299 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8300 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
   8301 {
   8302   return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
   8303 }
   8304 
   8305 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8306 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
   8307 {
   8308   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   8309                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
   8310                                      (__v8di)_mm512_setzero_si512());
   8311 }
   8312 
   8313 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8314 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
   8315              __m512i __Y)
   8316 {
   8317   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
   8318                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
   8319                                      (__v8di)__W);
   8320 }
   8321 
   8322 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8323 _mm512_permutexvar_ps (__m512i __X, __m512 __Y)
   8324 {
   8325   return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
   8326 }
   8327 
   8328 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8329 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
   8330 {
   8331   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   8332                                        (__v16sf)_mm512_permutexvar_ps(__X, __Y),
   8333                                        (__v16sf)__W);
   8334 }
   8335 
   8336 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8337 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
   8338 {
   8339   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   8340                                        (__v16sf)_mm512_permutexvar_ps(__X, __Y),
   8341                                        (__v16sf)_mm512_setzero_ps());
   8342 }
   8343 
   8344 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8345 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
   8346 {
   8347   return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
   8348 }
   8349 
   /* Alternate spelling that expands to _mm512_permutexvar_epi32. */
   #define _mm512_permutevar_epi32  _mm512_permutexvar_epi32
   8351 
   8352 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8353 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
   8354 {
   8355   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   8356                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
   8357                                     (__v16si)_mm512_setzero_si512());
   8358 }
   8359 
   8360 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8361 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
   8362              __m512i __Y)
   8363 {
   8364   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
   8365                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
   8366                                     (__v16si)__W);
   8367 }
   8368 
   /* Alternate spelling that expands to _mm512_mask_permutexvar_epi32. */
   #define _mm512_mask_permutevar_epi32  _mm512_mask_permutexvar_epi32
   8370 
   8371 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8372 _mm512_kand (__mmask16 __A, __mmask16 __B)
   8373 {
   8374   return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
   8375 }
   8376 
   8377 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8378 _mm512_kandn (__mmask16 __A, __mmask16 __B)
   8379 {
   8380   return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
   8381 }
   8382 
   8383 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8384 _mm512_kor (__mmask16 __A, __mmask16 __B)
   8385 {
   8386   return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
   8387 }
   8388 
   8389 static __inline__ int __DEFAULT_FN_ATTRS
   8390 _mm512_kortestc (__mmask16 __A, __mmask16 __B)
   8391 {
   8392   return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
   8393 }
   8394 
   8395 static __inline__ int __DEFAULT_FN_ATTRS
   8396 _mm512_kortestz (__mmask16 __A, __mmask16 __B)
   8397 {
   8398   return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
   8399 }
   8400 
   8401 static __inline__ unsigned char __DEFAULT_FN_ATTRS
   8402 _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
   8403 {
   8404   return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
   8405 }
   8406 
   8407 static __inline__ unsigned char __DEFAULT_FN_ATTRS
   8408 _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
   8409 {
   8410   return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
   8411 }
   8412 
   8413 static __inline__ unsigned char __DEFAULT_FN_ATTRS
   8414 _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
   8415   *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
   8416   return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
   8417 }
   8418 
   8419 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8420 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
   8421 {
   8422   return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
   8423 }
   8424 
   8425 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8426 _mm512_kxnor (__mmask16 __A, __mmask16 __B)
   8427 {
   8428   return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
   8429 }
   8430 
   8431 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8432 _mm512_kxor (__mmask16 __A, __mmask16 __B)
   8433 {
   8434   return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
   8435 }
   8436 
   /* _k*_mask16 spellings that expand to the corresponding _mm512_k* names. */
   #define _kand_mask16   _mm512_kand
   #define _kandn_mask16  _mm512_kandn
   #define _knot_mask16   _mm512_knot
   #define _kor_mask16    _mm512_kor
   #define _kxnor_mask16  _mm512_kxnor
   #define _kxor_mask16   _mm512_kxor
   8443 
   /* Left shift of 16-bit mask A by I via the kshiftlihi builtin. */
   #define _kshiftli_mask16(A, I) \
     ((__mmask16)__builtin_ia32_kshiftlihi( \
         (__mmask16)(A), \
         (unsigned int)(I)))
   8446 
   /* Right shift of 16-bit mask A by I via the kshiftrihi builtin. */
   #define _kshiftri_mask16(A, I) \
     ((__mmask16)__builtin_ia32_kshiftrihi( \
         (__mmask16)(A), \
         (unsigned int)(I)))
   8449 
   8450 static __inline__ unsigned int __DEFAULT_FN_ATTRS
   8451 _cvtmask16_u32(__mmask16 __A) {
   8452   return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
   8453 }
   8454 
   8455 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8456 _cvtu32_mask16(unsigned int __A) {
   8457   return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
   8458 }
   8459 
   8460 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
   8461 _load_mask16(__mmask16 *__A) {
   8462   return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
   8463 }
   8464 
   8465 static __inline__ void __DEFAULT_FN_ATTRS
   8466 _store_mask16(__mmask16 *__A, __mmask16 __B) {
   8467   *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
   8468 }
   8469 
   8470 static __inline__ void __DEFAULT_FN_ATTRS512
   8471 _mm512_stream_si512 (void * __P, __m512i __A)
   8472 {
   8473   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
   8474   __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
   8475 }
   8476 
   8477 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8478 _mm512_stream_load_si512 (void const *__P)
   8479 {
   8480   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
   8481   return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
   8482 }
   8483 
   8484 static __inline__ void __DEFAULT_FN_ATTRS512
   8485 _mm512_stream_pd (void *__P, __m512d __A)
   8486 {
   8487   typedef __v8df __v8df_aligned __attribute__((aligned(64)));
   8488   __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
   8489 }
   8490 
   8491 static __inline__ void __DEFAULT_FN_ATTRS512
   8492 _mm512_stream_ps (void *__P, __m512 __A)
   8493 {
   8494   typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
   8495   __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
   8496 }
   8497 
   8498 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8499 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
   8500 {
   8501   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
   8502                   (__v8df) __W,
   8503                   (__mmask8) __U);
   8504 }
   8505 
   8506 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8507 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
   8508 {
   8509   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
   8510                   (__v8df)
   8511                   _mm512_setzero_pd (),
   8512                   (__mmask8) __U);
   8513 }
   8514 
   8515 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8516 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
   8517 {
   8518   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
   8519                   (__v8di) __W,
   8520                   (__mmask8) __U);
   8521 }
   8522 
   8523 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8524 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
   8525 {
   8526   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
   8527                   (__v8di)
   8528                   _mm512_setzero_si512 (),
   8529                   (__mmask8) __U);
   8530 }
   8531 
   8532 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8533 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
   8534 {
   8535   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
   8536                  (__v16sf) __W,
   8537                  (__mmask16) __U);
   8538 }
   8539 
   8540 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8541 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
   8542 {
   8543   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
   8544                  (__v16sf)
   8545                  _mm512_setzero_ps (),
   8546                  (__mmask16) __U);
   8547 }
   8548 
   8549 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8550 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
   8551 {
   8552   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
   8553                   (__v16si) __W,
   8554                   (__mmask16) __U);
   8555 }
   8556 
   8557 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8558 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
   8559 {
   8560   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
   8561                   (__v16si)
   8562                   _mm512_setzero_si512 (),
   8563                   (__mmask16) __U);
   8564 }
   8565 
   /* Scalar float compare of X and Y with predicate P and an explicit fifth
      argument R; the input mask is all ones. Yields an __mmask8. */
   #define _mm_cmp_round_ss_mask(X, Y, P, R) \
     ((__mmask8)__builtin_ia32_cmpss_mask( \
         (__v4sf)(__m128)(X), \
         (__v4sf)(__m128)(Y), \
         (int)(P), \
         (__mmask8)-1, \
         (int)(R)))
   8570 
   8571 #define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
   8572   ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
   8573                                        (__v4sf)(__m128)(Y), (int)(P), \
   8574                                        (__mmask8)(M), (int)(R)))
   8575 
   8576 #define _mm_cmp_ss_mask(X, Y, P) \
   8577   ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
   8578                                        (__v4sf)(__m128)(Y), (int)(P), \
   8579                                        (__mmask8)-1, \
   8580                                        _MM_FROUND_CUR_DIRECTION))
   8581 
   8582 #define _mm_mask_cmp_ss_mask(M, X, Y, P) \
   8583   ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
   8584                                        (__v4sf)(__m128)(Y), (int)(P), \
   8585                                        (__mmask8)(M), \
   8586                                        _MM_FROUND_CUR_DIRECTION))
   8587 
   8588 #define _mm_cmp_round_sd_mask(X, Y, P, R) \
   8589   ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
   8590                                        (__v2df)(__m128d)(Y), (int)(P), \
   8591                                        (__mmask8)-1, (int)(R)))
   8592 
   8593 #define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
   8594   ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
   8595                                        (__v2df)(__m128d)(Y), (int)(P), \
   8596                                        (__mmask8)(M), (int)(R)))
   8597 
   8598 #define _mm_cmp_sd_mask(X, Y, P) \
   8599   ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
   8600                                        (__v2df)(__m128d)(Y), (int)(P), \
   8601                                        (__mmask8)-1, \
   8602                                        _MM_FROUND_CUR_DIRECTION))
   8603 
   8604 #define _mm_mask_cmp_sd_mask(M, X, Y, P) \
   8605   ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
   8606                                        (__v2df)(__m128d)(Y), (int)(P), \
   8607                                        (__mmask8)(M), \
   8608                                        _MM_FROUND_CUR_DIRECTION))
   8609 
   8610 /* Bit Test */
   8611 
   8612 static __inline __mmask16 __DEFAULT_FN_ATTRS512
   8613 _mm512_test_epi32_mask (__m512i __A, __m512i __B)
   8614 {
   8615   return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
   8616                                    _mm512_setzero_si512());
   8617 }
   8618 
   8619 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
   8620 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
   8621 {
   8622   return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
   8623                                         _mm512_setzero_si512());
   8624 }
   8625 
   8626 static __inline __mmask8 __DEFAULT_FN_ATTRS512
   8627 _mm512_test_epi64_mask (__m512i __A, __m512i __B)
   8628 {
   8629   return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
   8630                                    _mm512_setzero_si512());
   8631 }
   8632 
   8633 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
   8634 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
   8635 {
   8636   return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
   8637                                         _mm512_setzero_si512());
   8638 }
   8639 
   8640 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
   8641 _mm512_testn_epi32_mask (__m512i __A, __m512i __B)
   8642 {
   8643   return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
   8644                                   _mm512_setzero_si512());
   8645 }
   8646 
   8647 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
   8648 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
   8649 {
   8650   return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
   8651                                        _mm512_setzero_si512());
   8652 }
   8653 
   8654 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
   8655 _mm512_testn_epi64_mask (__m512i __A, __m512i __B)
   8656 {
   8657   return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
   8658                                   _mm512_setzero_si512());
   8659 }
   8660 
   8661 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
   8662 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
   8663 {
   8664   return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
   8665                                        _mm512_setzero_si512());
   8666 }
   8667 
   8668 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8669 _mm512_movehdup_ps (__m512 __A)
   8670 {
   8671   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
   8672                          1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
   8673 }
   8674 
   8675 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8676 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
   8677 {
   8678   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   8679                                              (__v16sf)_mm512_movehdup_ps(__A),
   8680                                              (__v16sf)__W);
   8681 }
   8682 
   8683 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8684 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
   8685 {
   8686   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   8687                                              (__v16sf)_mm512_movehdup_ps(__A),
   8688                                              (__v16sf)_mm512_setzero_ps());
   8689 }
   8690 
   8691 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8692 _mm512_moveldup_ps (__m512 __A)
   8693 {
   8694   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
   8695                          0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
   8696 }
   8697 
   8698 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8699 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
   8700 {
   8701   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   8702                                              (__v16sf)_mm512_moveldup_ps(__A),
   8703                                              (__v16sf)__W);
   8704 }
   8705 
   8706 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8707 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
   8708 {
   8709   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
   8710                                              (__v16sf)_mm512_moveldup_ps(__A),
   8711                                              (__v16sf)_mm512_setzero_ps());
   8712 }
   8713 
   8714 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   8715 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
   8716 {
   8717   return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
   8718 }
   8719 
   8720 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   8721 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
   8722 {
   8723   return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
   8724                                      _mm_setzero_ps());
   8725 }
   8726 
   8727 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8728 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   8729 {
   8730   return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
   8731 }
   8732 
   8733 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8734 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
   8735 {
   8736   return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
   8737                                      _mm_setzero_pd());
   8738 }
   8739 
   8740 static __inline__ void __DEFAULT_FN_ATTRS128
   8741 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
   8742 {
   8743   __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
   8744 }
   8745 
   8746 static __inline__ void __DEFAULT_FN_ATTRS128
   8747 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
   8748 {
   8749   __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
   8750 }
   8751 
   8752 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   8753 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
   8754 {
   8755   __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
   8756                                                 (__v4sf)_mm_setzero_ps(),
   8757                                                 0, 4, 4, 4);
   8758 
   8759   return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
   8760 }
   8761 
   8762 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   8763 _mm_maskz_load_ss (__mmask8 __U, const float* __A)
   8764 {
   8765   return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
   8766                                                 (__v4sf) _mm_setzero_ps(),
   8767                                                 __U & 1);
   8768 }
   8769 
   8770 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8771 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
   8772 {
   8773   __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
   8774                                                  (__v2df)_mm_setzero_pd(),
   8775                                                  0, 2);
   8776 
   8777   return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
   8778 }
   8779 
   8780 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   8781 _mm_maskz_load_sd (__mmask8 __U, const double* __A)
   8782 {
   8783   return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
   8784                                                   (__v2df) _mm_setzero_pd(),
   8785                                                   __U & 1);
   8786 }
   8787 
   8788 #define _mm512_shuffle_epi32(A, I) \
   8789   ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))
   8790 
   8791 #define _mm512_mask_shuffle_epi32(W, U, A, I) \
   8792   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   8793                                        (__v16si)_mm512_shuffle_epi32((A), (I)), \
   8794                                        (__v16si)(__m512i)(W)))
   8795 
   8796 #define _mm512_maskz_shuffle_epi32(U, A, I) \
   8797   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
   8798                                        (__v16si)_mm512_shuffle_epi32((A), (I)), \
   8799                                        (__v16si)_mm512_setzero_si512()))
   8800 
   8801 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8802 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
   8803 {
   8804   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
   8805                 (__v8df) __W,
   8806                 (__mmask8) __U);
   8807 }
   8808 
   8809 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8810 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
   8811 {
   8812   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
   8813                 (__v8df) _mm512_setzero_pd (),
   8814                 (__mmask8) __U);
   8815 }
   8816 
   8817 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8818 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
   8819 {
   8820   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
   8821                 (__v8di) __W,
   8822                 (__mmask8) __U);
   8823 }
   8824 
   8825 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8826 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
   8827 {
   8828   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
   8829                 (__v8di) _mm512_setzero_si512 (),
   8830                 (__mmask8) __U);
   8831 }
   8832 
   8833 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8834 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
   8835 {
   8836   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
   8837               (__v8df) __W,
   8838               (__mmask8) __U);
   8839 }
   8840 
   8841 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8842 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
   8843 {
   8844   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
   8845               (__v8df) _mm512_setzero_pd(),
   8846               (__mmask8) __U);
   8847 }
   8848 
   8849 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8850 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
   8851 {
   8852   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
   8853               (__v8di) __W,
   8854               (__mmask8) __U);
   8855 }
   8856 
   8857 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8858 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
   8859 {
   8860   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
   8861               (__v8di) _mm512_setzero_si512(),
   8862               (__mmask8) __U);
   8863 }
   8864 
   8865 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8866 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
   8867 {
   8868   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
   8869                    (__v16sf) __W,
   8870                    (__mmask16) __U);
   8871 }
   8872 
   8873 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8874 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
   8875 {
   8876   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
   8877                    (__v16sf) _mm512_setzero_ps(),
   8878                    (__mmask16) __U);
   8879 }
   8880 
   8881 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8882 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
   8883 {
   8884   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
   8885               (__v16si) __W,
   8886               (__mmask16) __U);
   8887 }
   8888 
   8889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8890 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
   8891 {
   8892   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
   8893               (__v16si) _mm512_setzero_si512(),
   8894               (__mmask16) __U);
   8895 }
   8896 
   8897 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8898 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
   8899 {
   8900   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
   8901                (__v16sf) __W,
   8902                (__mmask16) __U);
   8903 }
   8904 
   8905 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8906 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
   8907 {
   8908   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
   8909                (__v16sf) _mm512_setzero_ps(),
   8910                (__mmask16) __U);
   8911 }
   8912 
   8913 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8914 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
   8915 {
   8916   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
   8917                 (__v16si) __W,
   8918                 (__mmask16) __U);
   8919 }
   8920 
   8921 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   8922 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
   8923 {
   8924   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
   8925                 (__v16si) _mm512_setzero_si512(),
   8926                 (__mmask16) __U);
   8927 }
   8928 
   8929 #define _mm512_cvt_roundps_pd(A, R) \
   8930   ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
   8931                                             (__v8df)_mm512_undefined_pd(), \
   8932                                             (__mmask8)-1, (int)(R)))
   8933 
   8934 #define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
   8935   ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
   8936                                             (__v8df)(__m512d)(W), \
   8937                                             (__mmask8)(U), (int)(R)))
   8938 
   8939 #define _mm512_maskz_cvt_roundps_pd(U, A, R) \
   8940   ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
   8941                                             (__v8df)_mm512_setzero_pd(), \
   8942                                             (__mmask8)(U), (int)(R)))
   8943 
   8944 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8945 _mm512_cvtps_pd (__m256 __A)
   8946 {
   8947   return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
   8948 }
   8949 
   8950 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8951 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
   8952 {
   8953   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   8954                                               (__v8df)_mm512_cvtps_pd(__A),
   8955                                               (__v8df)__W);
   8956 }
   8957 
   8958 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8959 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
   8960 {
   8961   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
   8962                                               (__v8df)_mm512_cvtps_pd(__A),
   8963                                               (__v8df)_mm512_setzero_pd());
   8964 }
   8965 
   8966 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8967 _mm512_cvtpslo_pd (__m512 __A)
   8968 {
   8969   return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
   8970 }
   8971 
   8972 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8973 _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
   8974 {
   8975   return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
   8976 }
   8977 
   8978 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8979 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
   8980 {
   8981   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
   8982               (__v8df) __A,
   8983               (__v8df) __W);
   8984 }
   8985 
   8986 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   8987 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
   8988 {
   8989   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
   8990               (__v8df) __A,
   8991               (__v8df) _mm512_setzero_pd ());
   8992 }
   8993 
   8994 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   8995 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
   8996 {
   8997   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
   8998              (__v16sf) __A,
   8999              (__v16sf) __W);
   9000 }
   9001 
   9002 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   9003 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
   9004 {
   9005   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
   9006              (__v16sf) __A,
   9007              (__v16sf) _mm512_setzero_ps ());
   9008 }
   9009 
   9010 static __inline__ void __DEFAULT_FN_ATTRS512
   9011 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
   9012 {
   9013   __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
   9014             (__mmask8) __U);
   9015 }
   9016 
   9017 static __inline__ void __DEFAULT_FN_ATTRS512
   9018 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
   9019 {
   9020   __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
   9021             (__mmask8) __U);
   9022 }
   9023 
   9024 static __inline__ void __DEFAULT_FN_ATTRS512
   9025 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
   9026 {
   9027   __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
   9028             (__mmask16) __U);
   9029 }
   9030 
   9031 static __inline__ void __DEFAULT_FN_ATTRS512
   9032 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
   9033 {
   9034   __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
   9035             (__mmask16) __U);
   9036 }
   9037 
   9038 #define _mm_cvt_roundsd_ss(A, B, R) \
   9039   ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
   9040                                               (__v2df)(__m128d)(B), \
   9041                                               (__v4sf)_mm_undefined_ps(), \
   9042                                               (__mmask8)-1, (int)(R)))
   9043 
   9044 #define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
   9045   ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
   9046                                               (__v2df)(__m128d)(B), \
   9047                                               (__v4sf)(__m128)(W), \
   9048                                               (__mmask8)(U), (int)(R)))
   9049 
   9050 #define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
   9051   ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
   9052                                               (__v2df)(__m128d)(B), \
   9053                                               (__v4sf)_mm_setzero_ps(), \
   9054                                               (__mmask8)(U), (int)(R)))
   9055 
   9056 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   9057 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
   9058 {
   9059   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
   9060                                              (__v2df)__B,
   9061                                              (__v4sf)__W,
   9062                                              (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   9063 }
   9064 
   9065 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   9066 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
   9067 {
   9068   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
   9069                                              (__v2df)__B,
   9070                                              (__v4sf)_mm_setzero_ps(),
   9071                                              (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   9072 }
   9073 
   9074 #define _mm_cvtss_i32 _mm_cvtss_si32
   9075 #define _mm_cvtsd_i32 _mm_cvtsd_si32
   9076 #define _mm_cvti32_sd _mm_cvtsi32_sd
   9077 #define _mm_cvti32_ss _mm_cvtsi32_ss
   9078 #ifdef __x86_64__
   9079 #define _mm_cvtss_i64 _mm_cvtss_si64
   9080 #define _mm_cvtsd_i64 _mm_cvtsd_si64
   9081 #define _mm_cvti64_sd _mm_cvtsi64_sd
   9082 #define _mm_cvti64_ss _mm_cvtsi64_ss
   9083 #endif
   9084 
   9085 #ifdef __x86_64__
   9086 #define _mm_cvt_roundi64_sd(A, B, R) \
   9087   ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
   9088                                       (int)(R)))
   9089 
   9090 #define _mm_cvt_roundsi64_sd(A, B, R) \
   9091   ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
   9092                                       (int)(R)))
   9093 #endif
   9094 
   9095 #define _mm_cvt_roundsi32_ss(A, B, R) \
   9096   ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
   9097 
   9098 #define _mm_cvt_roundi32_ss(A, B, R) \
   9099   ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
   9100 
   9101 #ifdef __x86_64__
   9102 #define _mm_cvt_roundsi64_ss(A, B, R) \
   9103   ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
   9104                                      (int)(R)))
   9105 
   9106 #define _mm_cvt_roundi64_ss(A, B, R) \
   9107   ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
   9108                                      (int)(R)))
   9109 #endif
   9110 
   9111 #define _mm_cvt_roundss_sd(A, B, R) \
   9112   ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
   9113                                                (__v4sf)(__m128)(B), \
   9114                                                (__v2df)_mm_undefined_pd(), \
   9115                                                (__mmask8)-1, (int)(R)))
   9116 
   9117 #define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
   9118   ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
   9119                                                (__v4sf)(__m128)(B), \
   9120                                                (__v2df)(__m128d)(W), \
   9121                                                (__mmask8)(U), (int)(R)))
   9122 
   9123 #define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
   9124   ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
   9125                                                (__v4sf)(__m128)(B), \
   9126                                                (__v2df)_mm_setzero_pd(), \
   9127                                                (__mmask8)(U), (int)(R)))
   9128 
   9129 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   9130 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
   9131 {
   9132   return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
   9133                                             (__v4sf)__B,
   9134                                             (__v2df)__W,
   9135                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   9136 }
   9137 
   9138 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   9139 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
   9140 {
   9141   return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
   9142                                             (__v4sf)__B,
   9143                                             (__v2df)_mm_setzero_pd(),
   9144                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
   9145 }
   9146 
   9147 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   9148 _mm_cvtu32_sd (__m128d __A, unsigned __B)
   9149 {
   9150   __A[0] = __B;
   9151   return __A;
   9152 }
   9153 
   9154 #ifdef __x86_64__
   9155 #define _mm_cvt_roundu64_sd(A, B, R) \
   9156   ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
   9157                                        (unsigned long long)(B), (int)(R)))
   9158 
   9159 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   9160 _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
   9161 {
   9162   __A[0] = __B;
   9163   return __A;
   9164 }
   9165 #endif
   9166 
   9167 #define _mm_cvt_roundu32_ss(A, B, R) \
   9168   ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
   9169                                       (int)(R)))
   9170 
   9171 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   9172 _mm_cvtu32_ss (__m128 __A, unsigned __B)
   9173 {
   9174   __A[0] = __B;
   9175   return __A;
   9176 }
   9177 
   9178 #ifdef __x86_64__
   9179 #define _mm_cvt_roundu64_ss(A, B, R) \
   9180   ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
   9181                                       (unsigned long long)(B), (int)(R)))
   9182 
   9183 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   9184 _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
   9185 {
   9186   __A[0] = __B;
   9187   return __A;
   9188 }
   9189 #endif
   9190 
   9191 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   9192 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
   9193 {
   9194   return (__m512i) __builtin_ia32_selectd_512(__M,
   9195                                               (__v16si) _mm512_set1_epi32(__A),
   9196                                               (__v16si) __O);
   9197 }
   9198 
   9199 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   9200 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
   9201 {
   9202   return (__m512i) __builtin_ia32_selectq_512(__M,
   9203                                               (__v8di) _mm512_set1_epi64(__A),
   9204                                               (__v8di) __O);
   9205 }
   9206 
   9207 static  __inline __m512i __DEFAULT_FN_ATTRS512
   9208 _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
   9209     char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
   9210     char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
   9211     char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
   9212     char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
   9213     char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
   9214     char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
   9215     char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
   9216     char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
   9217     char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
   9218     char __e4, char __e3, char __e2, char __e1, char __e0) {
   9219 
   9220   return __extension__ (__m512i)(__v64qi)
   9221     {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
   9222      __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
   9223      __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
   9224      __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
   9225      __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
   9226      __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
   9227      __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
   9228      __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
   9229 }
   9230 
   9231 static  __inline __m512i __DEFAULT_FN_ATTRS512
   9232 _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
   9233     short __e27, short __e26, short __e25, short __e24, short __e23,
   9234     short __e22, short __e21, short __e20, short __e19, short __e18,
   9235     short __e17, short __e16, short __e15, short __e14, short __e13,
   9236     short __e12, short __e11, short __e10, short __e9, short __e8,
   9237     short __e7, short __e6, short __e5, short __e4, short __e3,
   9238     short __e2, short __e1, short __e0) {
   9239   return __extension__ (__m512i)(__v32hi)
   9240     {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
   9241      __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
   9242      __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
   9243      __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
   9244 }
   9245 
   9246 static __inline __m512i __DEFAULT_FN_ATTRS512
   9247 _mm512_set_epi32 (int __A, int __B, int __C, int __D,
   9248      int __E, int __F, int __G, int __H,
   9249      int __I, int __J, int __K, int __L,
   9250      int __M, int __N, int __O, int __P)
   9251 {
   9252   return __extension__ (__m512i)(__v16si)
   9253   { __P, __O, __N, __M, __L, __K, __J, __I,
   9254     __H, __G, __F, __E, __D, __C, __B, __A };
   9255 }
   9256 
   9257 #define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,           \
   9258        e8,e9,e10,e11,e12,e13,e14,e15)          \
   9259   _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
   9260                    (e5),(e4),(e3),(e2),(e1),(e0))
   9261 
   9262 static __inline__ __m512i __DEFAULT_FN_ATTRS512
   9263 _mm512_set_epi64 (long long __A, long long __B, long long __C,
   9264      long long __D, long long __E, long long __F,
   9265      long long __G, long long __H)
   9266 {
   9267   return __extension__ (__m512i) (__v8di)
   9268   { __H, __G, __F, __E, __D, __C, __B, __A };
   9269 }
   9270 
   9271 #define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
   9272   _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
   9273 
   9274 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   9275 _mm512_set_pd (double __A, double __B, double __C, double __D,
   9276         double __E, double __F, double __G, double __H)
   9277 {
   9278   return __extension__ (__m512d)
   9279   { __H, __G, __F, __E, __D, __C, __B, __A };
   9280 }
   9281 
   9282 #define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
   9283   _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
   9284 
   9285 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   9286 _mm512_set_ps (float __A, float __B, float __C, float __D,
   9287         float __E, float __F, float __G, float __H,
   9288         float __I, float __J, float __K, float __L,
   9289         float __M, float __N, float __O, float __P)
   9290 {
   9291   return __extension__ (__m512)
   9292   { __P, __O, __N, __M, __L, __K, __J, __I,
   9293     __H, __G, __F, __E, __D, __C, __B, __A };
   9294 }
   9295 
   9296 #define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
   9297   _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
   9298                 (e4),(e3),(e2),(e1),(e0))
   9299 
   9300 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   9301 _mm512_abs_ps(__m512 __A)
   9302 {
   9303   return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
   9304 }
   9305 
   9306 static __inline__ __m512 __DEFAULT_FN_ATTRS512
   9307 _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
   9308 {
   9309   return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
   9310 }
   9311 
   9312 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   9313 _mm512_abs_pd(__m512d __A)
   9314 {
   9315   return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
   9316 }
   9317 
   9318 static __inline__ __m512d __DEFAULT_FN_ATTRS512
   9319 _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
   9320 {
   9321   return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
   9322 }
   9323 
/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
 * outputs. This class of vector operation forms the basis of many scientific
 * computations. In vector-reduction arithmetic, the evaluation order is
 * independent of the order of the input elements of V.
 *
 * For floating-point intrinsics:
 * 1. When using fadd/fmul intrinsics, the order of operations within the
 *    vector is unspecified (associative math).
 * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
 *    produce unspecified results.
 *
 * The bisection method is used: at each step, the vector from the previous
 * step is split in half and the operation is performed on its two halves.
 * This takes log2(n) steps, where n is the number of elements in the vector.
 */
   9339 
   9340 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
   9341   return __builtin_reduce_add((__v8di)__W);
   9342 }
   9343 
   9344 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
   9345   return __builtin_reduce_mul((__v8di)__W);
   9346 }
   9347 
   9348 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
   9349   return __builtin_reduce_and((__v8di)__W);
   9350 }
   9351 
   9352 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
   9353   return __builtin_reduce_or((__v8di)__W);
   9354 }
   9355 
   9356 static __inline__ long long __DEFAULT_FN_ATTRS512
   9357 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
   9358   __W = _mm512_maskz_mov_epi64(__M, __W);
   9359   return __builtin_reduce_add((__v8di)__W);
   9360 }
   9361 
   9362 static __inline__ long long __DEFAULT_FN_ATTRS512
   9363 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
   9364   __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
   9365   return __builtin_reduce_mul((__v8di)__W);
   9366 }
   9367 
   9368 static __inline__ long long __DEFAULT_FN_ATTRS512
   9369 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
   9370   __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
   9371   return __builtin_reduce_and((__v8di)__W);
   9372 }
   9373 
   9374 static __inline__ long long __DEFAULT_FN_ATTRS512
   9375 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
   9376   __W = _mm512_maskz_mov_epi64(__M, __W);
   9377   return __builtin_reduce_or((__v8di)__W);
   9378 }
   9379 
   9380 // -0.0 is used to ignore the start value since it is the neutral value of
   9381 // floating point addition. For more information, please refer to
   9382 // https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
   9383 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
   9384   return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
   9385 }
   9386 
   9387 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
   9388   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
   9389 }
   9390 
   9391 static __inline__ double __DEFAULT_FN_ATTRS512
   9392 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
   9393   __W = _mm512_maskz_mov_pd(__M, __W);
   9394   return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
   9395 }
   9396 
   9397 static __inline__ double __DEFAULT_FN_ATTRS512
   9398 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
   9399   __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
   9400   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
   9401 }
   9402 
   9403 static __inline__ int __DEFAULT_FN_ATTRS512
   9404 _mm512_reduce_add_epi32(__m512i __W) {
   9405   return __builtin_reduce_add((__v16si)__W);
   9406 }
   9407 
   9408 static __inline__ int __DEFAULT_FN_ATTRS512
   9409 _mm512_reduce_mul_epi32(__m512i __W) {
   9410   return __builtin_reduce_mul((__v16si)__W);
   9411 }
   9412 
   9413 static __inline__ int __DEFAULT_FN_ATTRS512
   9414 _mm512_reduce_and_epi32(__m512i __W) {
   9415   return __builtin_reduce_and((__v16si)__W);
   9416 }
   9417 
   9418 static __inline__ int __DEFAULT_FN_ATTRS512
   9419 _mm512_reduce_or_epi32(__m512i __W) {
   9420   return __builtin_reduce_or((__v16si)__W);
   9421 }
   9422 
   9423 static __inline__ int __DEFAULT_FN_ATTRS512
   9424 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
   9425   __W = _mm512_maskz_mov_epi32(__M, __W);
   9426   return __builtin_reduce_add((__v16si)__W);
   9427 }
   9428 
   9429 static __inline__ int __DEFAULT_FN_ATTRS512
   9430 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
   9431   __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
   9432   return __builtin_reduce_mul((__v16si)__W);
   9433 }
   9434 
   9435 static __inline__ int __DEFAULT_FN_ATTRS512
   9436 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
   9437   __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
   9438   return __builtin_reduce_and((__v16si)__W);
   9439 }
   9440 
   9441 static __inline__ int __DEFAULT_FN_ATTRS512
   9442 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
   9443   __W = _mm512_maskz_mov_epi32(__M, __W);
   9444   return __builtin_reduce_or((__v16si)__W);
   9445 }
   9446 
   9447 static __inline__ float __DEFAULT_FN_ATTRS512
   9448 _mm512_reduce_add_ps(__m512 __W) {
   9449   return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
   9450 }
   9451 
   9452 static __inline__ float __DEFAULT_FN_ATTRS512
   9453 _mm512_reduce_mul_ps(__m512 __W) {
   9454   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
   9455 }
   9456 
   9457 static __inline__ float __DEFAULT_FN_ATTRS512
   9458 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
   9459   __W = _mm512_maskz_mov_ps(__M, __W);
   9460   return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
   9461 }
   9462 
   9463 static __inline__ float __DEFAULT_FN_ATTRS512
   9464 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
   9465   __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
   9466   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
   9467 }
   9468 
   9469 static __inline__ long long __DEFAULT_FN_ATTRS512
   9470 _mm512_reduce_max_epi64(__m512i __V) {
   9471   return __builtin_reduce_max((__v8di)__V);
   9472 }
   9473 
   9474 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
   9475 _mm512_reduce_max_epu64(__m512i __V) {
   9476   return __builtin_reduce_max((__v8du)__V);
   9477 }
   9478 
   9479 static __inline__ long long __DEFAULT_FN_ATTRS512
   9480 _mm512_reduce_min_epi64(__m512i __V) {
   9481   return __builtin_reduce_min((__v8di)__V);
   9482 }
   9483 
   9484 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
   9485 _mm512_reduce_min_epu64(__m512i __V) {
   9486   return __builtin_reduce_min((__v8du)__V);
   9487 }
   9488 
   9489 static __inline__ long long __DEFAULT_FN_ATTRS512
   9490 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
   9491   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
   9492   return __builtin_reduce_max((__v8di)__V);
   9493 }
   9494 
   9495 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
   9496 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
   9497   __V = _mm512_maskz_mov_epi64(__M, __V);
   9498   return __builtin_reduce_max((__v8du)__V);
   9499 }
   9500 
   9501 static __inline__ long long __DEFAULT_FN_ATTRS512
   9502 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
   9503   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
   9504   return __builtin_reduce_min((__v8di)__V);
   9505 }
   9506 
   9507 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
   9508 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
   9509   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
   9510   return __builtin_reduce_min((__v8du)__V);
   9511 }
   9512 static __inline__ int __DEFAULT_FN_ATTRS512
   9513 _mm512_reduce_max_epi32(__m512i __V) {
   9514   return __builtin_reduce_max((__v16si)__V);
   9515 }
   9516 
   9517 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
   9518 _mm512_reduce_max_epu32(__m512i __V) {
   9519   return __builtin_reduce_max((__v16su)__V);
   9520 }
   9521 
   9522 static __inline__ int __DEFAULT_FN_ATTRS512
   9523 _mm512_reduce_min_epi32(__m512i __V) {
   9524   return __builtin_reduce_min((__v16si)__V);
   9525 }
   9526 
   9527 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
   9528 _mm512_reduce_min_epu32(__m512i __V) {
   9529   return __builtin_reduce_min((__v16su)__V);
   9530 }
   9531 
   9532 static __inline__ int __DEFAULT_FN_ATTRS512
   9533 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
   9534   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
   9535   return __builtin_reduce_max((__v16si)__V);
   9536 }
   9537 
   9538 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
   9539 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
   9540   __V = _mm512_maskz_mov_epi32(__M, __V);
   9541   return __builtin_reduce_max((__v16su)__V);
   9542 }
   9543 
   9544 static __inline__ int __DEFAULT_FN_ATTRS512
   9545 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
   9546   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
   9547   return __builtin_reduce_min((__v16si)__V);
   9548 }
   9549 
   9550 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
   9551 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
   9552   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
   9553   return __builtin_reduce_min((__v16su)__V);
   9554 }
   9555 
   9556 static __inline__ double __DEFAULT_FN_ATTRS512
   9557 _mm512_reduce_max_pd(__m512d __V) {
   9558   return __builtin_ia32_reduce_fmax_pd512(__V);
   9559 }
   9560 
   9561 static __inline__ double __DEFAULT_FN_ATTRS512
   9562 _mm512_reduce_min_pd(__m512d __V) {
   9563   return __builtin_ia32_reduce_fmin_pd512(__V);
   9564 }
   9565 
   9566 static __inline__ double __DEFAULT_FN_ATTRS512
   9567 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
   9568   __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
   9569   return __builtin_ia32_reduce_fmax_pd512(__V);
   9570 }
   9571 
   9572 static __inline__ double __DEFAULT_FN_ATTRS512
   9573 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
   9574   __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
   9575   return __builtin_ia32_reduce_fmin_pd512(__V);
   9576 }
   9577 
   9578 static __inline__ float __DEFAULT_FN_ATTRS512
   9579 _mm512_reduce_max_ps(__m512 __V) {
   9580   return __builtin_ia32_reduce_fmax_ps512(__V);
   9581 }
   9582 
   9583 static __inline__ float __DEFAULT_FN_ATTRS512
   9584 _mm512_reduce_min_ps(__m512 __V) {
   9585   return __builtin_ia32_reduce_fmin_ps512(__V);
   9586 }
   9587 
   9588 static __inline__ float __DEFAULT_FN_ATTRS512
   9589 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
   9590   __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
   9591   return __builtin_ia32_reduce_fmax_ps512(__V);
   9592 }
   9593 
   9594 static __inline__ float __DEFAULT_FN_ATTRS512
   9595 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
   9596   __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
   9597   return __builtin_ia32_reduce_fmin_ps512(__V);
   9598 }
   9599 
   9600 /// Moves the least significant 32 bits of a vector of [16 x i32] to a
   9601 ///    32-bit signed integer value.
   9602 ///
   9603 /// \headerfile <x86intrin.h>
   9604 ///
   9605 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
   9606 ///
   9607 /// \param __A
   9608 ///    A vector of [16 x i32]. The least significant 32 bits are moved to the
   9609 ///    destination.
   9610 /// \returns A 32-bit signed integer containing the moved value.
   9611 static __inline__ int __DEFAULT_FN_ATTRS512
   9612 _mm512_cvtsi512_si32(__m512i __A) {
   9613   __v16si __b = (__v16si)__A;
   9614   return __b[0];
   9615 }
   9616 
   9617 /// Loads 8 double-precision (64-bit) floating-point elements stored at memory
   9618 /// locations starting at location \a base_addr at packed 32-bit integer indices
   9619 /// stored in the lower half of \a vindex scaled by \a scale them in dst.
   9620 ///
   9621 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
   9622 ///
   9623 /// \code{.operation}
   9624 /// FOR j := 0 to 7
   9625 ///   i := j*64
   9626 ///   m := j*32
   9627 ///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9628 ///   dst[i+63:i] := MEM[addr+63:addr]
   9629 /// ENDFOR
   9630 /// dst[MAX:512] := 0
   9631 /// \endcode
   9632 #define _mm512_i32logather_pd(vindex, base_addr, scale)                        \
   9633   _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
   9634 
   9635 /// Loads 8 double-precision (64-bit) floating-point elements from memory
   9636 /// starting at location \a base_addr at packed 32-bit integer indices stored in
   9637 /// the lower half of \a vindex scaled by \a scale into dst using writemask
   9638 /// \a mask (elements are copied from \a src when the corresponding mask bit is
   9639 /// not set).
   9640 ///
   9641 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
   9642 ///
   9643 /// \code{.operation}
   9644 /// FOR j := 0 to 7
   9645 ///   i := j*64
   9646 ///   m := j*32
   9647 ///   IF mask[j]
   9648 ///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9649 ///     dst[i+63:i] := MEM[addr+63:addr]
   9650 ///   ELSE
   9651 ///     dst[i+63:i] := src[i+63:i]
   9652 ///   FI
   9653 /// ENDFOR
   9654 /// dst[MAX:512] := 0
   9655 /// \endcode
   9656 #define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale)        \
   9657   _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex),      \
   9658                            (base_addr), (scale))
   9659 
   9660 /// Loads 8 64-bit integer elements from memory starting at location \a base_addr
   9661 /// at packed 32-bit integer indices stored in the lower half of \a vindex
   9662 /// scaled by \a scale and stores them in dst.
   9663 ///
   9664 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
   9665 ///
   9666 /// \code{.operation}
   9667 /// FOR j := 0 to 7
   9668 ///   i := j*64
   9669 ///   m := j*32
   9670 ///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9671 ///   dst[i+63:i] := MEM[addr+63:addr]
   9672 /// ENDFOR
   9673 /// dst[MAX:512] := 0
   9674 /// \endcode
   9675 #define _mm512_i32logather_epi64(vindex, base_addr, scale)                     \
   9676   _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
   9677 
   9678 /// Loads 8 64-bit integer elements from memory starting at location \a base_addr
   9679 /// at packed 32-bit integer indices stored in the lower half of \a vindex
   9680 /// scaled by \a scale and stores them in dst using writemask \a mask (elements
   9681 /// are copied from \a src when the corresponding mask bit is not set).
   9682 ///
   9683 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
   9684 ///
   9685 /// \code{.operation}
   9686 /// FOR j := 0 to 7
   9687 ///   i := j*64
   9688 ///   m := j*32
   9689 ///   IF mask[j]
   9690 ///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9691 ///     dst[i+63:i] := MEM[addr+63:addr]
   9692 ///   ELSE
   9693 ///     dst[i+63:i] := src[i+63:i]
   9694 ///   FI
   9695 /// ENDFOR
   9696 /// dst[MAX:512] := 0
   9697 /// \endcode
   9698 #define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale)     \
   9699   _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex),   \
   9700                               (base_addr), (scale))
   9701 
   9702 /// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
   9703 /// and to memory locations starting at location \a base_addr at packed 32-bit
   9704 /// integer indices stored in \a vindex scaled by \a scale.
   9705 ///
   9706 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
   9707 ///
   9708 /// \code{.operation}
   9709 /// FOR j := 0 to 7
   9710 ///   i := j*64
   9711 ///   m := j*32
   9712 ///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9713 ///   MEM[addr+63:addr] := v1[i+63:i]
   9714 /// ENDFOR
   9715 /// \endcode
   9716 #define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale)                   \
   9717   _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
   9718 
   9719 /// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
   9720 /// to memory locations starting at location \a base_addr at packed 32-bit
   9721 /// integer indices stored in \a vindex scaled by \a scale. Only those elements
   9722 /// whose corresponding mask bit is set in writemask \a mask are written to
   9723 /// memory.
   9724 ///
   9725 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
   9726 ///
   9727 /// \code{.operation}
   9728 /// FOR j := 0 to 7
   9729 ///   i := j*64
   9730 ///   m := j*32
   9731 ///   IF mask[j]
   9732 ///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9733 ///     MEM[addr+63:addr] := a[i+63:i]
   9734 ///   FI
   9735 /// ENDFOR
   9736 /// \endcode
   9737 #define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale)        \
   9738   _mm512_mask_i32scatter_pd((base_addr), (mask),                               \
   9739                             _mm512_castsi512_si256(vindex), (v1), (scale))
   9740 
   9741 /// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in
   9742 /// memory locations starting at location \a base_addr at packed 32-bit integer
   9743 /// indices stored in \a vindex scaled by \a scale.
   9744 ///
   9745 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
   9746 ///
   9747 /// \code{.operation}
   9748 /// FOR j := 0 to 7
   9749 ///   i := j*64
   9750 ///   m := j*32
   9751 ///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9752 ///   MEM[addr+63:addr] := a[i+63:i]
   9753 /// ENDFOR
   9754 /// \endcode
   9755 #define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale)                \
   9756   _mm512_i32scatter_epi64((base_addr),                                         \
   9757                           _mm512_castsi512_si256(vindex), (v1), (scale))
   9758 
   9759 /// Stores 8 packed 64-bit integer elements located in a and stores them in
   9760 /// memory locations starting at location \a base_addr at packed 32-bit integer
   9761 /// indices stored in \a vindex scaled by scale using writemask \a mask (elements
   9762 /// whose corresponding mask bit is not set are not written to memory).
   9763 ///
   9764 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
   9765 ///
   9766 /// \code{.operation}
   9767 /// FOR j := 0 to 7
   9768 ///   i := j*64
   9769 ///   m := j*32
   9770 ///   IF mask[j]
   9771 ///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
   9772 ///     MEM[addr+63:addr] := a[i+63:i]
   9773 ///   FI
   9774 /// ENDFOR
   9775 /// \endcode
   9776 #define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale)     \
   9777   _mm512_mask_i32scatter_epi64((base_addr), (mask),                            \
   9778                                _mm512_castsi512_si256(vindex), (v1), (scale))
   9779 
   9780 #undef __DEFAULT_FN_ATTRS512
   9781 #undef __DEFAULT_FN_ATTRS128
   9782 #undef __DEFAULT_FN_ATTRS
   9783 #undef __DEFAULT_FN_ATTRS512_CONSTEXPR
   9784 #undef __DEFAULT_FN_ATTRS128_CONSTEXPR
   9785 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
   9786 
   9787 #endif /* __AVX512FINTRIN_H */