zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

emmintrin.h (71273B) - Raw


      1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 /* Implemented from the specification included in the Intel C++ Compiler
     11    User Guide and Reference, version 9.0.  */
     12 
     13 #ifndef NO_WARN_X86_INTRINSICS
     14 /* This header file is to help porting code using Intel intrinsics
     15    explicitly from x86_64 to powerpc64/powerpc64le.
     16 
     17    Since X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
     18    PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
     19    However scalar float operations in vector (XMM) registers require
     20    the POWER8 VSX ISA (2.07) level. There are differences for data
     21    format and placement of float scalars in the vector register, which
     22    require extra steps to match SSE2 scalar float semantics on POWER.
     23 
     24    It should be noted that there's much difference between X86_64's
     25    MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
     26    portable <fenv.h> instead of accessing MXCSR directly.
     27 
     28    Most SSE2 scalar float intrinsic operations can be performed more
     29    efficiently as C language float scalar operations or optimized to
     30    use vector SIMD operations. We recommend this for new applications.
     31 */
     32 #error                                                                         \
     33     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
     34 #endif
     35 
     36 #ifndef EMMINTRIN_H_
     37 #define EMMINTRIN_H_
     38 
     39 #if defined(__powerpc64__) &&                                                  \
     40     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
     41 
     42 #include <altivec.h>
     43 
     44 /* We need definitions from the SSE header files.  */
     45 #include <xmmintrin.h>
     46 
     47 /* SSE2: element-typed vector types used internally by the wrappers below.  */
     48 typedef __vector double __v2df;
     49 typedef __vector float __v4f;
     50 typedef __vector long long __v2di;
     51 typedef __vector unsigned long long __v2du;
     52 typedef __vector int __v4si;
     53 typedef __vector unsigned int __v4su;
     54 typedef __vector short __v8hi;
     55 typedef __vector unsigned short __v8hu;
     56 typedef __vector signed char __v16qi;
     57 typedef __vector unsigned char __v16qu;
     58 
     59 /* The Intel API is flexible enough that we must allow aliasing with other
     60    vector types and their scalar components, hence __may_alias__.  */
     61 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
     62 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
     63 
     64 /* Unaligned (__aligned__(1)) versions of the same types.  */
     65 typedef long long __m128i_u
     66     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
     67 typedef double __m128d_u
     68     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
     69 
     70 /* Build a two-value permute mask: x supplies bit 1, y supplies bit 0.  */
     71 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
     72 
     73 /* Create a vector with element 0 as F and the rest zero.  */
     74 extern __inline __m128d
     75     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     76     _mm_set_sd(double __F) {
     77   return __extension__(__m128d){__F, 0.0};
     78 }
     79 
     80 /* Create a vector with both elements equal to F.  */
     81 extern __inline __m128d
     82     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     83     _mm_set1_pd(double __F) {
     84   return __extension__(__m128d){__F, __F};
     85 }
     86 
     87 extern __inline __m128d
     88     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     89     _mm_set_pd1(double __F) {
     90   return _mm_set1_pd(__F);
     91 }
     92 
     93 /* Create a vector with the lower value X and upper value W.  */
     94 extern __inline __m128d
     95     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     96     _mm_set_pd(double __W, double __X) {
     97   return __extension__(__m128d){__X, __W};
     98 }
     99 
    100 /* Create a vector with the lower value W and upper value X.  */
    101 extern __inline __m128d
    102     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    103     _mm_setr_pd(double __W, double __X) {
    104   return __extension__(__m128d){__W, __X};
    105 }
    106 
    107 /* Create an undefined vector.  */
    108 extern __inline __m128d
    109     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    110     _mm_undefined_pd(void) {
    111   __m128d __Y = __Y;
    112   return __Y;
    113 }
    114 
    115 /* Create a vector of zeros.  */
    116 extern __inline __m128d
    117     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    118     _mm_setzero_pd(void) {
    119   return (__m128d)vec_splats(0);
    120 }
    121 
    122 /* Sets the low DPFP value of A from the low value of B.  */
    123 extern __inline __m128d
    124     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    125     _mm_move_sd(__m128d __A, __m128d __B) {
    126   __v2df __result = (__v2df)__A;
    127   __result[0] = ((__v2df)__B)[0];
    128   return (__m128d)__result;
    129 }
    130 
    131 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
    132 extern __inline __m128d
    133     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    134     _mm_load_pd(double const *__P) {
    135   return ((__m128d)vec_ld(0, (__v16qu *)__P));
    136 }
    137 
    138 /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
    139 extern __inline __m128d
    140     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    141     _mm_loadu_pd(double const *__P) {
    142   return (vec_vsx_ld(0, __P));
    143 }
    144 
    145 /* Create a vector with all two elements equal to *P.  */
    146 extern __inline __m128d
    147     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    148     _mm_load1_pd(double const *__P) {
    149   return (vec_splats(*__P));
    150 }
    151 
    152 /* Create a vector with element 0 as *P and the rest zero.  */
    153 extern __inline __m128d
    154     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    155     _mm_load_sd(double const *__P) {
    156   return _mm_set_sd(*__P);
    157 }
    158 
    159 extern __inline __m128d
    160     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    161     _mm_load_pd1(double const *__P) {
    162   return _mm_load1_pd(__P);
    163 }
    164 
    165 /* Load two DPFP values in reverse order.  The address must be aligned.  */
    166 extern __inline __m128d
    167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    168     _mm_loadr_pd(double const *__P) {
    169   __v2df __tmp = _mm_load_pd(__P);
    170   return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
    171 }
    172 
    173 /* Store two DPFP values.  The address must be 16-byte aligned.  */
    174 extern __inline void
    175     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    176     _mm_store_pd(double *__P, __m128d __A) {
    177   vec_st((__v16qu)__A, 0, (__v16qu *)__P);
    178 }
    179 
    180 /* Store two DPFP values.  The address need not be 16-byte aligned.  */
    181 extern __inline void
    182     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    183     _mm_storeu_pd(double *__P, __m128d __A) {
    184   *(__m128d_u *)__P = __A;
    185 }
    186 
    187 /* Stores the lower DPFP value.  */
    188 extern __inline void
    189     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    190     _mm_store_sd(double *__P, __m128d __A) {
    191   *__P = ((__v2df)__A)[0];
    192 }
    193 
    194 extern __inline double
    195     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    196     _mm_cvtsd_f64(__m128d __A) {
    197   return ((__v2df)__A)[0];
    198 }
    199 
    200 extern __inline void
    201     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    202     _mm_storel_pd(double *__P, __m128d __A) {
    203   _mm_store_sd(__P, __A);
    204 }
    205 
    206 /* Stores the upper DPFP value.  */
    207 extern __inline void
    208     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    209     _mm_storeh_pd(double *__P, __m128d __A) {
    210   *__P = ((__v2df)__A)[1];
    211 }
    212 /* Store the lower DPFP value across two words.
    213    The address must be 16-byte aligned.  */
    214 extern __inline void
    215     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    216     _mm_store1_pd(double *__P, __m128d __A) {
    217   _mm_store_pd(__P, vec_splat(__A, 0));
    218 }
    219 
    220 extern __inline void
    221     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    222     _mm_store_pd1(double *__P, __m128d __A) {
    223   _mm_store1_pd(__P, __A);
    224 }
    225 
    226 /* Store two DPFP values in reverse order.  The address must be aligned.  */
    227 extern __inline void
    228     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    229     _mm_storer_pd(double *__P, __m128d __A) {
    230   _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
    231 }
    232 
    233 /* Intel intrinsic.  */
    234 extern __inline long long
    235     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    236     _mm_cvtsi128_si64(__m128i __A) {
    237   return ((__v2di)__A)[0];
    238 }
    239 
    240 /* Microsoft intrinsic.  */
    241 extern __inline long long
    242     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    243     _mm_cvtsi128_si64x(__m128i __A) {
    244   return ((__v2di)__A)[0];
    245 }
    246 
    247 extern __inline __m128d
    248     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    249     _mm_add_pd(__m128d __A, __m128d __B) {
    250   return (__m128d)((__v2df)__A + (__v2df)__B);
    251 }
    252 
    253 /* Add the lower double-precision (64-bit) floating-point element in
    254    a and b, store the result in the lower element of dst, and copy
    255    the upper element from a to the upper element of dst. */
    256 extern __inline __m128d
    257     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    258     _mm_add_sd(__m128d __A, __m128d __B) {
    259   __A[0] = __A[0] + __B[0];
    260   return (__A);
    261 }
    262 
    263 extern __inline __m128d
    264     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    265     _mm_sub_pd(__m128d __A, __m128d __B) {
    266   return (__m128d)((__v2df)__A - (__v2df)__B);
    267 }
    268 
    269 extern __inline __m128d
    270     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    271     _mm_sub_sd(__m128d __A, __m128d __B) {
    272   __A[0] = __A[0] - __B[0];
    273   return (__A);
    274 }
    275 
    276 extern __inline __m128d
    277     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    278     _mm_mul_pd(__m128d __A, __m128d __B) {
    279   return (__m128d)((__v2df)__A * (__v2df)__B);
    280 }
    281 
    282 extern __inline __m128d
    283     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    284     _mm_mul_sd(__m128d __A, __m128d __B) {
    285   __A[0] = __A[0] * __B[0];
    286   return (__A);
    287 }
    288 
    289 extern __inline __m128d
    290     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    291     _mm_div_pd(__m128d __A, __m128d __B) {
    292   return (__m128d)((__v2df)__A / (__v2df)__B);
    293 }
    294 
    295 extern __inline __m128d
    296     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    297     _mm_div_sd(__m128d __A, __m128d __B) {
    298   __A[0] = __A[0] / __B[0];
    299   return (__A);
    300 }
    301 
    302 extern __inline __m128d
    303     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    304     _mm_sqrt_pd(__m128d __A) {
    305   return (vec_sqrt(__A));
    306 }
    307 
    308 /* Return pair {sqrt (B[0]), A[1]}.  */
    309 extern __inline __m128d
    310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    311     _mm_sqrt_sd(__m128d __A, __m128d __B) {
    312   __v2df __c;
    313   __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
    314   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    315 }
    316 
    317 extern __inline __m128d
    318     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    319     _mm_min_pd(__m128d __A, __m128d __B) {
    320   return (vec_min(__A, __B));
    321 }
    322 
    323 extern __inline __m128d
    324     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    325     _mm_min_sd(__m128d __A, __m128d __B) {
    326   __v2df __a, __b, __c;
    327   __a = vec_splats(__A[0]);
    328   __b = vec_splats(__B[0]);
    329   __c = vec_min(__a, __b);
    330   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    331 }
    332 
    333 extern __inline __m128d
    334     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    335     _mm_max_pd(__m128d __A, __m128d __B) {
    336   return (vec_max(__A, __B));
    337 }
    338 
    339 extern __inline __m128d
    340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    341     _mm_max_sd(__m128d __A, __m128d __B) {
    342   __v2df __a, __b, __c;
    343   __a = vec_splats(__A[0]);
    344   __b = vec_splats(__B[0]);
    345   __c = vec_max(__a, __b);
    346   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    347 }
    348 
    349 extern __inline __m128d
    350     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    351     _mm_cmpeq_pd(__m128d __A, __m128d __B) {
    352   return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
    353 }
    354 
    355 extern __inline __m128d
    356     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    357     _mm_cmplt_pd(__m128d __A, __m128d __B) {
    358   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
    359 }
    360 
    361 extern __inline __m128d
    362     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    363     _mm_cmple_pd(__m128d __A, __m128d __B) {
    364   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
    365 }
    366 
    367 extern __inline __m128d
    368     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    369     _mm_cmpgt_pd(__m128d __A, __m128d __B) {
    370   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
    371 }
    372 
    373 extern __inline __m128d
    374     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    375     _mm_cmpge_pd(__m128d __A, __m128d __B) {
    376   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
    377 }
    378 
    379 extern __inline __m128d
    380     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    381     _mm_cmpneq_pd(__m128d __A, __m128d __B) {
    382   __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
    383   return ((__m128d)vec_nor(__temp, __temp));
    384 }
    385 
    386 extern __inline __m128d
    387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    388     _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
    389   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
    390 }
    391 
    392 extern __inline __m128d
    393     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    394     _mm_cmpnle_pd(__m128d __A, __m128d __B) {
    395   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
    396 }
    397 
    398 extern __inline __m128d
    399     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    400     _mm_cmpngt_pd(__m128d __A, __m128d __B) {
    401   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
    402 }
    403 
    404 extern __inline __m128d
    405     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    406     _mm_cmpnge_pd(__m128d __A, __m128d __B) {
    407   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
    408 }
    409 
    410 extern __inline __m128d
    411     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    412     _mm_cmpord_pd(__m128d __A, __m128d __B) {
    413   __v2du __c, __d;
    414   /* Compare against self will return false (0's) if NAN.  */
    415   __c = (__v2du)vec_cmpeq(__A, __A);
    416   __d = (__v2du)vec_cmpeq(__B, __B);
    417   /* A != NAN and B != NAN.  */
    418   return ((__m128d)vec_and(__c, __d));
    419 }
    420 
    421 extern __inline __m128d
    422     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    423     _mm_cmpunord_pd(__m128d __A, __m128d __B) {
    424 #if _ARCH_PWR8
    425   __v2du __c, __d;
    426   /* Compare against self will return false (0's) if NAN.  */
    427   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
    428   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
    429   /* A == NAN OR B == NAN converts too:
    430      NOT(A != NAN) OR NOT(B != NAN).  */
    431   __c = vec_nor(__c, __c);
    432   return ((__m128d)vec_orc(__c, __d));
    433 #else
    434   __v2du __c, __d;
    435   /* Compare against self will return false (0's) if NAN.  */
    436   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
    437   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
    438   /* Convert the true ('1's) is NAN.  */
    439   __c = vec_nor(__c, __c);
    440   __d = vec_nor(__d, __d);
    441   return ((__m128d)vec_or(__c, __d));
    442 #endif
    443 }
    444 
    445 extern __inline __m128d
    446     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    447     _mm_cmpeq_sd(__m128d __A, __m128d __B) {
    448   __v2df __a, __b, __c;
    449   /* PowerISA VSX does not allow partial (for just lower double)
    450      results. So to insure we don't generate spurious exceptions
    451      (from the upper double values) we splat the lower double
    452      before we do the operation. */
    453   __a = vec_splats(__A[0]);
    454   __b = vec_splats(__B[0]);
    455   __c = (__v2df)vec_cmpeq(__a, __b);
    456   /* Then we merge the lower double result with the original upper
    457      double from __A.  */
    458   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    459 }
    460 
    461 extern __inline __m128d
    462     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    463     _mm_cmplt_sd(__m128d __A, __m128d __B) {
    464   __v2df __a, __b, __c;
    465   __a = vec_splats(__A[0]);
    466   __b = vec_splats(__B[0]);
    467   __c = (__v2df)vec_cmplt(__a, __b);
    468   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    469 }
    470 
    471 extern __inline __m128d
    472     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    473     _mm_cmple_sd(__m128d __A, __m128d __B) {
    474   __v2df __a, __b, __c;
    475   __a = vec_splats(__A[0]);
    476   __b = vec_splats(__B[0]);
    477   __c = (__v2df)vec_cmple(__a, __b);
    478   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    479 }
    480 
    481 extern __inline __m128d
    482     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    483     _mm_cmpgt_sd(__m128d __A, __m128d __B) {
    484   __v2df __a, __b, __c;
    485   __a = vec_splats(__A[0]);
    486   __b = vec_splats(__B[0]);
    487   __c = (__v2df)vec_cmpgt(__a, __b);
    488   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    489 }
    490 
    491 extern __inline __m128d
    492     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    493     _mm_cmpge_sd(__m128d __A, __m128d __B) {
    494   __v2df __a, __b, __c;
    495   __a = vec_splats(__A[0]);
    496   __b = vec_splats(__B[0]);
    497   __c = (__v2df)vec_cmpge(__a, __b);
    498   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    499 }
    500 
    501 extern __inline __m128d
    502     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    503     _mm_cmpneq_sd(__m128d __A, __m128d __B) {
    504   __v2df __a, __b, __c;
    505   __a = vec_splats(__A[0]);
    506   __b = vec_splats(__B[0]);
    507   __c = (__v2df)vec_cmpeq(__a, __b);
    508   __c = vec_nor(__c, __c);
    509   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    510 }
    511 
    512 extern __inline __m128d
    513     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    514     _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
    515   __v2df __a, __b, __c;
    516   __a = vec_splats(__A[0]);
    517   __b = vec_splats(__B[0]);
    518   /* Not less than is just greater than or equal.  */
    519   __c = (__v2df)vec_cmpge(__a, __b);
    520   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    521 }
    522 
    523 extern __inline __m128d
    524     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    525     _mm_cmpnle_sd(__m128d __A, __m128d __B) {
    526   __v2df __a, __b, __c;
    527   __a = vec_splats(__A[0]);
    528   __b = vec_splats(__B[0]);
    529   /* Not less than or equal is just greater than.  */
    530   __c = (__v2df)vec_cmpge(__a, __b);
    531   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    532 }
    533 
    534 extern __inline __m128d
    535     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    536     _mm_cmpngt_sd(__m128d __A, __m128d __B) {
    537   __v2df __a, __b, __c;
    538   __a = vec_splats(__A[0]);
    539   __b = vec_splats(__B[0]);
    540   /* Not greater than is just less than or equal.  */
    541   __c = (__v2df)vec_cmple(__a, __b);
    542   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    543 }
    544 
    545 extern __inline __m128d
    546     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    547     _mm_cmpnge_sd(__m128d __A, __m128d __B) {
    548   __v2df __a, __b, __c;
    549   __a = vec_splats(__A[0]);
    550   __b = vec_splats(__B[0]);
    551   /* Not greater than or equal is just less than.  */
    552   __c = (__v2df)vec_cmplt(__a, __b);
    553   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
    554 }
    555 
    556 extern __inline __m128d
    557     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    558     _mm_cmpord_sd(__m128d __A, __m128d __B) {
    559   __v2df __r;
    560   __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
    561   return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
    562 }
    563 
    564 extern __inline __m128d
    565     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    566     _mm_cmpunord_sd(__m128d __A, __m128d __B) {
    567   __v2df __r;
    568   __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
    569   return (__m128d)_mm_setr_pd(__r[0], __A[1]);
    570 }
    571 
    572 /* FIXME
    573    The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
    574    exactly the same because GCC for PowerPC only generates unordered
    575    compares (scalar and vector).
    576    Technically _mm_comieq_sd et al. should be using the ordered
    577    compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
    578    be OK.   */
    579 extern __inline int
    580     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    581     _mm_comieq_sd(__m128d __A, __m128d __B) {
    582   return (__A[0] == __B[0]);
    583 }
    584 
    585 extern __inline int
    586     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    587     _mm_comilt_sd(__m128d __A, __m128d __B) {
    588   return (__A[0] < __B[0]);
    589 }
    590 
    591 extern __inline int
    592     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    593     _mm_comile_sd(__m128d __A, __m128d __B) {
    594   return (__A[0] <= __B[0]);
    595 }
    596 
    597 extern __inline int
    598     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    599     _mm_comigt_sd(__m128d __A, __m128d __B) {
    600   return (__A[0] > __B[0]);
    601 }
    602 
    603 extern __inline int
    604     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    605     _mm_comige_sd(__m128d __A, __m128d __B) {
    606   return (__A[0] >= __B[0]);
    607 }
    608 
    609 extern __inline int
    610     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    611     _mm_comineq_sd(__m128d __A, __m128d __B) {
    612   return (__A[0] != __B[0]);
    613 }
    614 
    615 extern __inline int
    616     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    617     _mm_ucomieq_sd(__m128d __A, __m128d __B) {
    618   return (__A[0] == __B[0]);
    619 }
    620 
    621 extern __inline int
    622     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    623     _mm_ucomilt_sd(__m128d __A, __m128d __B) {
    624   return (__A[0] < __B[0]);
    625 }
    626 
    627 extern __inline int
    628     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    629     _mm_ucomile_sd(__m128d __A, __m128d __B) {
    630   return (__A[0] <= __B[0]);
    631 }
    632 
    633 extern __inline int
    634     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    635     _mm_ucomigt_sd(__m128d __A, __m128d __B) {
    636   return (__A[0] > __B[0]);
    637 }
    638 
    639 extern __inline int
    640     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    641     _mm_ucomige_sd(__m128d __A, __m128d __B) {
    642   return (__A[0] >= __B[0]);
    643 }
    644 
    645 extern __inline int
    646     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    647     _mm_ucomineq_sd(__m128d __A, __m128d __B) {
    648   return (__A[0] != __B[0]);
    649 }
    650 
    651 /* Create a vector of Qi, where i is the element number.  */
    652 extern __inline __m128i
    653     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    654     _mm_set_epi64x(long long __q1, long long __q0) {
    655   return __extension__(__m128i)(__v2di){__q0, __q1};
    656 }
    657 
    658 extern __inline __m128i
    659     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    660     _mm_set_epi64(__m64 __q1, __m64 __q0) {
    661   return _mm_set_epi64x((long long)__q1, (long long)__q0);
    662 }
    663 
    664 extern __inline __m128i
    665     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    666     _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
    667   return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
    668 }
    669 
    670 extern __inline __m128i
    671     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    672     _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
    673                   short __q2, short __q1, short __q0) {
    674   return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
    675                                         __q4, __q5, __q6, __q7};
    676 }
    677 
    678 extern __inline __m128i
    679     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    680     _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
    681                  char __q10, char __q09, char __q08, char __q07, char __q06,
    682                  char __q05, char __q04, char __q03, char __q02, char __q01,
    683                  char __q00) {
    684   return __extension__(__m128i)(__v16qi){
    685       __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    686       __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
    687 }
    688 
    689 /* Set all of the elements of the vector to A.  */
    690 extern __inline __m128i
    691     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    692     _mm_set1_epi64x(long long __A) {
    693   return _mm_set_epi64x(__A, __A);
    694 }
    695 
    696 extern __inline __m128i
    697     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    698     _mm_set1_epi64(__m64 __A) {
    699   return _mm_set_epi64(__A, __A);
    700 }
    701 
    702 extern __inline __m128i
    703     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    704     _mm_set1_epi32(int __A) {
    705   return _mm_set_epi32(__A, __A, __A, __A);
    706 }
    707 
    708 extern __inline __m128i
    709     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    710     _mm_set1_epi16(short __A) {
    711   return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
    712 }
    713 
    714 extern __inline __m128i
    715     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    716     _mm_set1_epi8(char __A) {
    717   return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
    718                       __A, __A, __A, __A, __A);
    719 }
    720 
    721 /* Create a vector of Qi, where i is the element number.
    722    The parameter order is reversed from the _mm_set_epi* functions.  */
    723 extern __inline __m128i
    724     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    725     _mm_setr_epi64(__m64 __q0, __m64 __q1) {
    726   return _mm_set_epi64(__q1, __q0);
    727 }
    728 
    729 extern __inline __m128i
    730     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    731     _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
    732   return _mm_set_epi32(__q3, __q2, __q1, __q0);
    733 }
    734 
    735 extern __inline __m128i
    736     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    737     _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
    738                    short __q5, short __q6, short __q7) {
    739   return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
    740 }
    741 
    742 extern __inline __m128i
    743     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    744     _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
    745                   char __q05, char __q06, char __q07, char __q08, char __q09,
    746                   char __q10, char __q11, char __q12, char __q13, char __q14,
    747                   char __q15) {
    748   return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
    749                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
    750 }
    751 
    752 /* Create a vector with element 0 as *P and the rest zero.  */
    753 extern __inline __m128i
    754     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    755     _mm_load_si128(__m128i const *__P) {
    756   return *__P;
    757 }
    758 
    759 extern __inline __m128i
    760     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    761     _mm_loadu_si128(__m128i_u const *__P) {
    762   return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
    763 }
    764 
    765 extern __inline __m128i
    766     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    767     _mm_loadl_epi64(__m128i_u const *__P) {
    768   return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
    769 }
    770 
    771 extern __inline void
    772     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    773     _mm_store_si128(__m128i *__P, __m128i __B) {
    774   vec_st((__v16qu)__B, 0, (__v16qu *)__P);
    775 }
    776 
    777 extern __inline void
    778     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    779     _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
    780   *__P = __B;
    781 }
    782 
    783 extern __inline void
    784     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    785     _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
    786   *(long long *)__P = ((__v2di)__B)[0];
    787 }
    788 
    789 extern __inline __m64
    790     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    791     _mm_movepi64_pi64(__m128i_u __B) {
    792   return (__m64)((__v2di)__B)[0];
    793 }
    794 
    795 extern __inline __m128i
    796     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    797     _mm_movpi64_epi64(__m64 __A) {
    798   return _mm_set_epi64((__m64)0LL, __A);
    799 }
    800 
    801 extern __inline __m128i
    802     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    803     _mm_move_epi64(__m128i __A) {
    804   return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
    805 }
    806 
/* Create an undefined vector.  The returned value carries no defined
   contents; callers must not rely on it.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  /* Deliberate self-initialization: __Y is never given a real value.  */
  __m128i __Y = __Y;
  return __Y;
}
    814 
    815 /* Create a vector of zeros.  */
    816 extern __inline __m128i
    817     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    818     _mm_setzero_si128(void) {
    819   return __extension__(__m128i)(__v4si){0, 0, 0, 0};
    820 }
    821 
    822 #ifdef _ARCH_PWR8
    823 extern __inline __m128d
    824     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    825     _mm_cvtepi32_pd(__m128i __A) {
    826   __v2di __val;
    827   /* For LE need to generate Vector Unpack Low Signed Word.
    828      Which is generated from unpackh.  */
    829   __val = (__v2di)vec_unpackh((__v4si)__A);
    830 
    831   return (__m128d)vec_ctf(__val, 0);
    832 }
    833 #endif
    834 
    835 extern __inline __m128
    836     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    837     _mm_cvtepi32_ps(__m128i __A) {
    838   return ((__m128)vec_ctf((__v4si)__A, 0));
    839 }
    840 
/* Convert the doubles in __A to 32-bit signed integers.  The input is
   rounded with vec_rint first, converted with the VSX xvcvdpsxws
   instruction, and the converted words are then packed together with
   words taken from an all-zero vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  The input has already
     been rounded by vec_rint above.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
  /* Duplicate the odd (LE) or even (BE) words of the conversion result,
     then pack the 64-bit lanes of __temp and __vzero down to words.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    /* Byte permute fallback: indices 0x00-0x03 and 0x08-0x0b pick two
       words from __temp; indices 0x14-0x17 and 0x1c-0x1f refer to the
       second operand, __vzero.  */
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}
    869 
    870 extern __inline __m64
    871     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    872     _mm_cvtpd_pi32(__m128d __A) {
    873   __m128i __result = _mm_cvtpd_epi32(__A);
    874 
    875   return (__m64)__result[0];
    876 }
    877 
/* Convert the doubles in __A to single precision with the VSX xvcvdpsp
   instruction, then pack the converted words together with words taken
   from an all-zero vector.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* The float results are held in an integer vector so they can be
     shuffled as raw words below.  */
  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
  /* Duplicate the odd (LE) or even (BE) words of the conversion result,
     then pack the 64-bit lanes of __temp and __vzero down to words.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    /* Byte permute fallback: indices 0x00-0x03 and 0x08-0x0b pick two
       words from __temp; indices 0x14-0x17 and 0x1c-0x1f refer to the
       second operand, __vzero.  */
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}
    904 
/* Convert the doubles in __A to 32-bit signed integers with truncation.
   Unlike _mm_cvtpd_epi32 there is no vec_rint step: __A is fed straight
   to xvcvdpsxws.  The converted words are then packed together with
   words taken from an all-zero vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
  /* Duplicate the odd (LE) or even (BE) words of the conversion result,
     then pack the 64-bit lanes of __temp and __vzero down to words.  */
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    /* Byte permute fallback: indices 0x00-0x03 and 0x08-0x0b pick two
       words from __temp; indices 0x14-0x17 and 0x1c-0x1f refer to the
       second operand, __vzero.  */
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}
    934 
    935 extern __inline __m64
    936     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    937     _mm_cvttpd_pi32(__m128d __A) {
    938   __m128i __result = _mm_cvttpd_epi32(__A);
    939 
    940   return (__m64)__result[0];
    941 }
    942 
    943 extern __inline int
    944     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    945     _mm_cvtsi128_si32(__m128i __A) {
    946   return ((__v4si)__A)[0];
    947 }
    948 
    949 #ifdef _ARCH_PWR8
    950 extern __inline __m128d
    951     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    952     _mm_cvtpi32_pd(__m64 __A) {
    953   __v4si __temp;
    954   __v2di __tmp2;
    955   __v4f __result;
    956 
    957   __temp = (__v4si)vec_splats(__A);
    958   __tmp2 = (__v2di)vec_unpackl(__temp);
    959   __result = vec_ctf((__vector signed long long)__tmp2, 0);
    960   return (__m128d)__result;
    961 }
    962 #endif
    963 
    964 extern __inline __m128i
    965     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    966     _mm_cvtps_epi32(__m128 __A) {
    967   __v4sf __rounded;
    968   __v4si __result;
    969 
    970   __rounded = vec_rint((__v4sf)__A);
    971   __result = vec_cts(__rounded, 0);
    972   return (__m128i)__result;
    973 }
    974 
    975 extern __inline __m128i
    976     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    977     _mm_cvttps_epi32(__m128 __A) {
    978   __v4si __result;
    979 
    980   __result = vec_cts((__v4sf)__A, 0);
    981   return (__m128i)__result;
    982 }
    983 
/* Convert float elements [0] and [1] of __A to doubles.  Uses
   vec_doubleh when <altivec.h> provides it as a macro; otherwise the
   elements are moved into the positions the xvcvspdp instruction reads
   and the conversion is issued via inline asm.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pd(__m128 __A) {
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
#ifdef vec_doubleh
  return (__m128d)vec_doubleh((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, So we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we merge the
     high words of __a with itself (vec_vmrghw) to get the elements
     lined up.  */
  __temp = vec_vmrghw(__a, __a);
#endif
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  return (__m128d)__result;
#endif
}
   1014 
   1015 extern __inline int
   1016     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1017     _mm_cvtsd_si32(__m128d __A) {
   1018   __v2df __rounded = vec_rint((__v2df)__A);
   1019   int __result = ((__v2df)__rounded)[0];
   1020 
   1021   return __result;
   1022 }
   1023 /* Intel intrinsic.  */
   1024 extern __inline long long
   1025     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1026     _mm_cvtsd_si64(__m128d __A) {
   1027   __v2df __rounded = vec_rint((__v2df)__A);
   1028   long long __result = ((__v2df)__rounded)[0];
   1029 
   1030   return __result;
   1031 }
   1032 
   1033 /* Microsoft intrinsic.  */
   1034 extern __inline long long
   1035     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1036     _mm_cvtsd_si64x(__m128d __A) {
   1037   return _mm_cvtsd_si64((__v2df)__A);
   1038 }
   1039 
   1040 extern __inline int
   1041     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1042     _mm_cvttsd_si32(__m128d __A) {
   1043   int __result = ((__v2df)__A)[0];
   1044 
   1045   return __result;
   1046 }
   1047 
   1048 /* Intel intrinsic.  */
   1049 extern __inline long long
   1050     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1051     _mm_cvttsd_si64(__m128d __A) {
   1052   long long __result = ((__v2df)__A)[0];
   1053 
   1054   return __result;
   1055 }
   1056 
   1057 /* Microsoft intrinsic.  */
   1058 extern __inline long long
   1059     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1060     _mm_cvttsd_si64x(__m128d __A) {
   1061   return _mm_cvttsd_si64(__A);
   1062 }
   1063 
/* Convert double element [0] of __B to single precision and place it in
   element [0] of a copy of __A; the result starts out as a copy of __A.
   Little-endian builds use the xscvdpsp instruction plus word rotations;
   big-endian builds use a plain element assignment.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
  /* The assignment performs the double-to-float conversion.  */
  __result[0] = ((__v2df)__B)[0];
#endif
  return (__m128)__result;
}
   1085 
   1086 extern __inline __m128d
   1087     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1088     _mm_cvtsi32_sd(__m128d __A, int __B) {
   1089   __v2df __result = (__v2df)__A;
   1090   double __db = __B;
   1091   __result[0] = __db;
   1092   return (__m128d)__result;
   1093 }
   1094 
   1095 /* Intel intrinsic.  */
   1096 extern __inline __m128d
   1097     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1098     _mm_cvtsi64_sd(__m128d __A, long long __B) {
   1099   __v2df __result = (__v2df)__A;
   1100   double __db = __B;
   1101   __result[0] = __db;
   1102   return (__m128d)__result;
   1103 }
   1104 
   1105 /* Microsoft intrinsic.  */
   1106 extern __inline __m128d
   1107     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1108     _mm_cvtsi64x_sd(__m128d __A, long long __B) {
   1109   return _mm_cvtsi64_sd(__A, __B);
   1110 }
   1111 
/* Convert float element [0] of __B to double and combine it with __A.
   Little-endian builds convert with the xscvspdp instruction and merge
   the result with __A via vec_mergel; big-endian builds assign the
   converted value into element [0] of a copy of __A.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert. */
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  /* Merge the converted value with __A to form the result.  */
  return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  /* The assignment performs the float-to-double conversion.  */
  __res[0] = ((__v4sf)__B)[0];
  return (__m128d)__res;
#endif
}
   1128 
   1129 extern __inline __m128d
   1130     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1131     _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
   1132   __vector double __result;
   1133   const int __litmsk = __mask & 0x3;
   1134 
   1135   if (__litmsk == 0)
   1136     __result = vec_mergeh(__A, __B);
   1137 #if __GNUC__ < 6
   1138   else if (__litmsk == 1)
   1139     __result = vec_xxpermdi(__B, __A, 2);
   1140   else if (__litmsk == 2)
   1141     __result = vec_xxpermdi(__B, __A, 1);
   1142 #else
   1143   else if (__litmsk == 1)
   1144     __result = vec_xxpermdi(__A, __B, 2);
   1145   else if (__litmsk == 2)
   1146     __result = vec_xxpermdi(__A, __B, 1);
   1147 #endif
   1148   else
   1149     __result = vec_mergel(__A, __B);
   1150 
   1151   return __result;
   1152 }
   1153 
   1154 extern __inline __m128d
   1155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1156     _mm_unpackhi_pd(__m128d __A, __m128d __B) {
   1157   return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
   1158 }
   1159 
   1160 extern __inline __m128d
   1161     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1162     _mm_unpacklo_pd(__m128d __A, __m128d __B) {
   1163   return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
   1164 }
   1165 
   1166 extern __inline __m128d
   1167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1168     _mm_loadh_pd(__m128d __A, double const *__B) {
   1169   __v2df __result = (__v2df)__A;
   1170   __result[1] = *__B;
   1171   return (__m128d)__result;
   1172 }
   1173 
   1174 extern __inline __m128d
   1175     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1176     _mm_loadl_pd(__m128d __A, double const *__B) {
   1177   __v2df __result = (__v2df)__A;
   1178   __result[0] = *__B;
   1179   return (__m128d)__result;
   1180 }
   1181 
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.
   On POWER10 this is a single vec_extractm; otherwise the bits are
   gathered with vec_vbpermq and read back from one 64-bit element of
   the result.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v2du)__A);
#else
  __vector unsigned long long __result;
  /* Bit-index vector for vec_vbpermq.  The byte values 0x00 and 0x40 are
     bit indices 0 and 64; every other byte is 0x80.
     NOTE(review): 0x80 is presumably an out-of-range index that yields a
     zero bit -- confirm against the vbpermq definition.  */
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

  /* The gathered bits are read from element [1] on LE and [0] on BE.  */
#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
   1212 
   1213 extern __inline __m128i
   1214     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1215     _mm_packs_epi16(__m128i __A, __m128i __B) {
   1216   return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
   1217 }
   1218 
   1219 extern __inline __m128i
   1220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1221     _mm_packs_epi32(__m128i __A, __m128i __B) {
   1222   return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
   1223 }
   1224 
   1225 extern __inline __m128i
   1226     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1227     _mm_packus_epi16(__m128i __A, __m128i __B) {
   1228   return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
   1229 }
   1230 
   1231 extern __inline __m128i
   1232     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1233     _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
   1234   return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
   1235 }
   1236 
   1237 extern __inline __m128i
   1238     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1239     _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
   1240   return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
   1241 }
   1242 
   1243 extern __inline __m128i
   1244     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1245     _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
   1246   return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
   1247 }
   1248 
   1249 extern __inline __m128i
   1250     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1251     _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
   1252   return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
   1253 }
   1254 
   1255 extern __inline __m128i
   1256     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1257     _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
   1258   return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
   1259 }
   1260 
   1261 extern __inline __m128i
   1262     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1263     _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
   1264   return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
   1265 }
   1266 
   1267 extern __inline __m128i
   1268     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1269     _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
   1270   return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
   1271 }
   1272 
   1273 extern __inline __m128i
   1274     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1275     _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
   1276   return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
   1277 }
   1278 
   1279 extern __inline __m128i
   1280     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1281     _mm_add_epi8(__m128i __A, __m128i __B) {
   1282   return (__m128i)((__v16qu)__A + (__v16qu)__B);
   1283 }
   1284 
   1285 extern __inline __m128i
   1286     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1287     _mm_add_epi16(__m128i __A, __m128i __B) {
   1288   return (__m128i)((__v8hu)__A + (__v8hu)__B);
   1289 }
   1290 
   1291 extern __inline __m128i
   1292     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1293     _mm_add_epi32(__m128i __A, __m128i __B) {
   1294   return (__m128i)((__v4su)__A + (__v4su)__B);
   1295 }
   1296 
   1297 extern __inline __m128i
   1298     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1299     _mm_add_epi64(__m128i __A, __m128i __B) {
   1300   return (__m128i)((__v2du)__A + (__v2du)__B);
   1301 }
   1302 
   1303 extern __inline __m128i
   1304     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1305     _mm_adds_epi8(__m128i __A, __m128i __B) {
   1306   return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
   1307 }
   1308 
   1309 extern __inline __m128i
   1310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1311     _mm_adds_epi16(__m128i __A, __m128i __B) {
   1312   return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
   1313 }
   1314 
   1315 extern __inline __m128i
   1316     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1317     _mm_adds_epu8(__m128i __A, __m128i __B) {
   1318   return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
   1319 }
   1320 
   1321 extern __inline __m128i
   1322     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1323     _mm_adds_epu16(__m128i __A, __m128i __B) {
   1324   return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
   1325 }
   1326 
   1327 extern __inline __m128i
   1328     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1329     _mm_sub_epi8(__m128i __A, __m128i __B) {
   1330   return (__m128i)((__v16qu)__A - (__v16qu)__B);
   1331 }
   1332 
   1333 extern __inline __m128i
   1334     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1335     _mm_sub_epi16(__m128i __A, __m128i __B) {
   1336   return (__m128i)((__v8hu)__A - (__v8hu)__B);
   1337 }
   1338 
   1339 extern __inline __m128i
   1340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1341     _mm_sub_epi32(__m128i __A, __m128i __B) {
   1342   return (__m128i)((__v4su)__A - (__v4su)__B);
   1343 }
   1344 
   1345 extern __inline __m128i
   1346     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1347     _mm_sub_epi64(__m128i __A, __m128i __B) {
   1348   return (__m128i)((__v2du)__A - (__v2du)__B);
   1349 }
   1350 
   1351 extern __inline __m128i
   1352     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1353     _mm_subs_epi8(__m128i __A, __m128i __B) {
   1354   return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
   1355 }
   1356 
   1357 extern __inline __m128i
   1358     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1359     _mm_subs_epi16(__m128i __A, __m128i __B) {
   1360   return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
   1361 }
   1362 
   1363 extern __inline __m128i
   1364     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1365     _mm_subs_epu8(__m128i __A, __m128i __B) {
   1366   return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
   1367 }
   1368 
   1369 extern __inline __m128i
   1370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1371     _mm_subs_epu16(__m128i __A, __m128i __B) {
   1372   return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
   1373 }
   1374 
   1375 extern __inline __m128i
   1376     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1377     _mm_madd_epi16(__m128i __A, __m128i __B) {
   1378   __vector signed int __zero = {0, 0, 0, 0};
   1379 
   1380   return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
   1381 }
   1382 
/* Multiply the signed 16-bit elements of __A and __B and keep two bytes
   of every 32-bit product.  The even and odd elements are multiplied
   separately (vec_vmulesh / vec_vmulosh) and the kept bytes are gathered
   back into one vector with vec_perm.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
  __vector signed int __w0, __w1;

  /* Permute control: alternates between a byte pair from __w0 (indices
     below 0x10) and the matching pair from __w1 (indices 0x10 and up).
     The LE table picks bytes 2-3 of each word, the BE table bytes 0-1.  */
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  /* 32-bit products of the even and of the odd 16-bit elements.  */
  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}
   1402 
   1403 extern __inline __m128i
   1404     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1405     _mm_mullo_epi16(__m128i __A, __m128i __B) {
   1406   return (__m128i)((__v8hi)__A * (__v8hi)__B);
   1407 }
   1408 
   1409 extern __inline __m64
   1410     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1411     _mm_mul_su32(__m64 __A, __m64 __B) {
   1412   unsigned int __a = __A;
   1413   unsigned int __b = __B;
   1414 
   1415   return ((__m64)__a * (__m64)__b);
   1416 }
   1417 
#ifdef _ARCH_PWR8
/* Multiply unsigned 32-bit elements of __A and __B into 64-bit products.
   When __GNUC__ < 8 the multiply is issued through inline asm (odd words
   on LE, even words on BE); otherwise vec_mule is used.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ < 8
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#endif
  return (__m128i)__result;
#else
  /* A single vec_mule call is used for both endiannesses here.  */
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#endif
}
#endif
   1438 
   1439 extern __inline __m128i
   1440     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1441     _mm_slli_epi16(__m128i __A, int __B) {
   1442   __v8hu __lshift;
   1443   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
   1444 
   1445   if (__B >= 0 && __B < 16) {
   1446     if (__builtin_constant_p(__B))
   1447       __lshift = (__v8hu)vec_splat_s16(__B);
   1448     else
   1449       __lshift = vec_splats((unsigned short)__B);
   1450 
   1451     __result = vec_sl((__v8hi)__A, __lshift);
   1452   }
   1453 
   1454   return (__m128i)__result;
   1455 }
   1456 
   1457 extern __inline __m128i
   1458     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1459     _mm_slli_epi32(__m128i __A, int __B) {
   1460   __v4su __lshift;
   1461   __v4si __result = {0, 0, 0, 0};
   1462 
   1463   if (__B >= 0 && __B < 32) {
   1464     if (__builtin_constant_p(__B) && __B < 16)
   1465       __lshift = (__v4su)vec_splat_s32(__B);
   1466     else
   1467       __lshift = vec_splats((unsigned int)__B);
   1468 
   1469     __result = vec_sl((__v4si)__A, __lshift);
   1470   }
   1471 
   1472   return (__m128i)__result;
   1473 }
   1474 
   1475 #ifdef _ARCH_PWR8
   1476 extern __inline __m128i
   1477     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1478     _mm_slli_epi64(__m128i __A, int __B) {
   1479   __v2du __lshift;
   1480   __v2di __result = {0, 0};
   1481 
   1482   if (__B >= 0 && __B < 64) {
   1483     if (__builtin_constant_p(__B) && __B < 16)
   1484       __lshift = (__v2du)vec_splat_s32(__B);
   1485     else
   1486       __lshift = (__v2du)vec_splats((unsigned int)__B);
   1487 
   1488     __result = vec_sl((__v2di)__A, __lshift);
   1489   }
   1490 
   1491   return (__m128i)__result;
   1492 }
   1493 #endif
   1494 
   1495 extern __inline __m128i
   1496     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1497     _mm_srai_epi16(__m128i __A, int __B) {
   1498   __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
   1499   __v8hi __result;
   1500 
   1501   if (__B < 16) {
   1502     if (__builtin_constant_p(__B))
   1503       __rshift = (__v8hu)vec_splat_s16(__B);
   1504     else
   1505       __rshift = vec_splats((unsigned short)__B);
   1506   }
   1507   __result = vec_sra((__v8hi)__A, __rshift);
   1508 
   1509   return (__m128i)__result;
   1510 }
   1511 
   1512 extern __inline __m128i
   1513     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1514     _mm_srai_epi32(__m128i __A, int __B) {
   1515   __v4su __rshift = {31, 31, 31, 31};
   1516   __v4si __result;
   1517 
   1518   if (__B < 32) {
   1519     if (__builtin_constant_p(__B)) {
   1520       if (__B < 16)
   1521         __rshift = (__v4su)vec_splat_s32(__B);
   1522       else
   1523         __rshift = (__v4su)vec_splats((unsigned int)__B);
   1524     } else
   1525       __rshift = vec_splats((unsigned int)__B);
   1526   }
   1527   __result = vec_sra((__v4si)__A, __rshift);
   1528 
   1529   return (__m128i)__result;
   1530 }
   1531 
   1532 extern __inline __m128i
   1533     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1534     _mm_bslli_si128(__m128i __A, const int __N) {
   1535   __v16qu __result;
   1536   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1537 
   1538   if (__N < 16)
   1539     __result = vec_sld((__v16qu)__A, __zeros, __N);
   1540   else
   1541     __result = __zeros;
   1542 
   1543   return (__m128i)__result;
   1544 }
   1545 
   1546 extern __inline __m128i
   1547     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1548     _mm_bsrli_si128(__m128i __A, const int __N) {
   1549   __v16qu __result;
   1550   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1551 
   1552   if (__N < 16)
   1553 #ifdef __LITTLE_ENDIAN__
   1554     if (__builtin_constant_p(__N))
   1555       /* Would like to use Vector Shift Left Double by Octet
   1556          Immediate here to use the immediate form and avoid
   1557          load of __N * 8 value into a separate VR.  */
   1558       __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
   1559     else
   1560 #endif
   1561     {
   1562       __v16qu __shift = vec_splats((unsigned char)(__N * 8));
   1563 #ifdef __LITTLE_ENDIAN__
   1564       __result = vec_sro((__v16qu)__A, __shift);
   1565 #else
   1566     __result = vec_slo((__v16qu)__A, __shift);
   1567 #endif
   1568     }
   1569   else
   1570     __result = __zeros;
   1571 
   1572   return (__m128i)__result;
   1573 }
   1574 
   1575 extern __inline __m128i
   1576     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1577     _mm_srli_si128(__m128i __A, const int __N) {
   1578   return _mm_bsrli_si128(__A, __N);
   1579 }
   1580 
   1581 extern __inline __m128i
   1582     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1583     _mm_slli_si128(__m128i __A, const int _imm5) {
   1584   __v16qu __result;
   1585   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1586 
   1587   if (_imm5 < 16)
   1588 #ifdef __LITTLE_ENDIAN__
   1589     __result = vec_sld((__v16qu)__A, __zeros, _imm5);
   1590 #else
   1591     __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
   1592 #endif
   1593   else
   1594     __result = __zeros;
   1595 
   1596   return (__m128i)__result;
   1597 }
   1598 
   1599 extern __inline __m128i
   1600     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1601 
   1602     _mm_srli_epi16(__m128i __A, int __B) {
   1603   __v8hu __rshift;
   1604   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
   1605 
   1606   if (__B < 16) {
   1607     if (__builtin_constant_p(__B))
   1608       __rshift = (__v8hu)vec_splat_s16(__B);
   1609     else
   1610       __rshift = vec_splats((unsigned short)__B);
   1611 
   1612     __result = vec_sr((__v8hi)__A, __rshift);
   1613   }
   1614 
   1615   return (__m128i)__result;
   1616 }
   1617 
   1618 extern __inline __m128i
   1619     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1620     _mm_srli_epi32(__m128i __A, int __B) {
   1621   __v4su __rshift;
   1622   __v4si __result = {0, 0, 0, 0};
   1623 
   1624   if (__B < 32) {
   1625     if (__builtin_constant_p(__B)) {
   1626       if (__B < 16)
   1627         __rshift = (__v4su)vec_splat_s32(__B);
   1628       else
   1629         __rshift = (__v4su)vec_splats((unsigned int)__B);
   1630     } else
   1631       __rshift = vec_splats((unsigned int)__B);
   1632 
   1633     __result = vec_sr((__v4si)__A, __rshift);
   1634   }
   1635 
   1636   return (__m128i)__result;
   1637 }
   1638 
   1639 #ifdef _ARCH_PWR8
   1640 extern __inline __m128i
   1641     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1642     _mm_srli_epi64(__m128i __A, int __B) {
   1643   __v2du __rshift;
   1644   __v2di __result = {0, 0};
   1645 
   1646   if (__B < 64) {
   1647     if (__builtin_constant_p(__B)) {
   1648       if (__B < 16)
   1649         __rshift = (__v2du)vec_splat_s32(__B);
   1650       else
   1651         __rshift = (__v2du)vec_splats((unsigned long long)__B);
   1652     } else
   1653       __rshift = (__v2du)vec_splats((unsigned int)__B);
   1654 
   1655     __result = vec_sr((__v2di)__A, __rshift);
   1656   }
   1657 
   1658   return (__m128i)__result;
   1659 }
   1660 #endif
   1661 
   1662 extern __inline __m128i
   1663     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1664     _mm_sll_epi16(__m128i __A, __m128i __B) {
   1665   __v8hu __lshift;
   1666   __vector __bool short __shmask;
   1667   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
   1668   __v8hu __result;
   1669 
   1670 #ifdef __LITTLE_ENDIAN__
   1671   __lshift = vec_splat((__v8hu)__B, 0);
   1672 #else
   1673   __lshift = vec_splat((__v8hu)__B, 3);
   1674 #endif
   1675   __shmask = vec_cmple(__lshift, __shmax);
   1676   __result = vec_sl((__v8hu)__A, __lshift);
   1677   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
   1678 
   1679   return (__m128i)__result;
   1680 }
   1681 
   1682 extern __inline __m128i
   1683     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1684     _mm_sll_epi32(__m128i __A, __m128i __B) {
   1685   __v4su __lshift;
   1686   __vector __bool int __shmask;
   1687   const __v4su __shmax = {32, 32, 32, 32};
   1688   __v4su __result;
   1689 #ifdef __LITTLE_ENDIAN__
   1690   __lshift = vec_splat((__v4su)__B, 0);
   1691 #else
   1692   __lshift = vec_splat((__v4su)__B, 1);
   1693 #endif
   1694   __shmask = vec_cmplt(__lshift, __shmax);
   1695   __result = vec_sl((__v4su)__A, __lshift);
   1696   __result = vec_sel((__v4su)__shmask, __result, __shmask);
   1697 
   1698   return (__m128i)__result;
   1699 }
   1700 
   1701 #ifdef _ARCH_PWR8
   1702 extern __inline __m128i
   1703     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1704     _mm_sll_epi64(__m128i __A, __m128i __B) {
   1705   __v2du __lshift;
   1706   __vector __bool long long __shmask;
   1707   const __v2du __shmax = {64, 64};
   1708   __v2du __result;
   1709 
   1710   __lshift = vec_splat((__v2du)__B, 0);
   1711   __shmask = vec_cmplt(__lshift, __shmax);
   1712   __result = vec_sl((__v2du)__A, __lshift);
   1713   __result = vec_sel((__v2du)__shmask, __result, __shmask);
   1714 
   1715   return (__m128i)__result;
   1716 }
   1717 #endif
   1718 
   1719 extern __inline __m128i
   1720     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1721     _mm_sra_epi16(__m128i __A, __m128i __B) {
   1722   const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
   1723   __v8hu __rshift;
   1724   __v8hi __result;
   1725 
   1726 #ifdef __LITTLE_ENDIAN__
   1727   __rshift = vec_splat((__v8hu)__B, 0);
   1728 #else
   1729   __rshift = vec_splat((__v8hu)__B, 3);
   1730 #endif
   1731   __rshift = vec_min(__rshift, __rshmax);
   1732   __result = vec_sra((__v8hi)__A, __rshift);
   1733 
   1734   return (__m128i)__result;
   1735 }
   1736 
   1737 extern __inline __m128i
   1738     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1739     _mm_sra_epi32(__m128i __A, __m128i __B) {
   1740   const __v4su __rshmax = {31, 31, 31, 31};
   1741   __v4su __rshift;
   1742   __v4si __result;
   1743 
   1744 #ifdef __LITTLE_ENDIAN__
   1745   __rshift = vec_splat((__v4su)__B, 0);
   1746 #else
   1747   __rshift = vec_splat((__v4su)__B, 1);
   1748 #endif
   1749   __rshift = vec_min(__rshift, __rshmax);
   1750   __result = vec_sra((__v4si)__A, __rshift);
   1751 
   1752   return (__m128i)__result;
   1753 }
   1754 
   1755 extern __inline __m128i
   1756     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1757     _mm_srl_epi16(__m128i __A, __m128i __B) {
   1758   __v8hu __rshift;
   1759   __vector __bool short __shmask;
   1760   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
   1761   __v8hu __result;
   1762 
   1763 #ifdef __LITTLE_ENDIAN__
   1764   __rshift = vec_splat((__v8hu)__B, 0);
   1765 #else
   1766   __rshift = vec_splat((__v8hu)__B, 3);
   1767 #endif
   1768   __shmask = vec_cmple(__rshift, __shmax);
   1769   __result = vec_sr((__v8hu)__A, __rshift);
   1770   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
   1771 
   1772   return (__m128i)__result;
   1773 }
   1774 
   1775 extern __inline __m128i
   1776     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1777     _mm_srl_epi32(__m128i __A, __m128i __B) {
   1778   __v4su __rshift;
   1779   __vector __bool int __shmask;
   1780   const __v4su __shmax = {32, 32, 32, 32};
   1781   __v4su __result;
   1782 
   1783 #ifdef __LITTLE_ENDIAN__
   1784   __rshift = vec_splat((__v4su)__B, 0);
   1785 #else
   1786   __rshift = vec_splat((__v4su)__B, 1);
   1787 #endif
   1788   __shmask = vec_cmplt(__rshift, __shmax);
   1789   __result = vec_sr((__v4su)__A, __rshift);
   1790   __result = vec_sel((__v4su)__shmask, __result, __shmask);
   1791 
   1792   return (__m128i)__result;
   1793 }
   1794 
   1795 #ifdef _ARCH_PWR8
   1796 extern __inline __m128i
   1797     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1798     _mm_srl_epi64(__m128i __A, __m128i __B) {
   1799   __v2du __rshift;
   1800   __vector __bool long long __shmask;
   1801   const __v2du __shmax = {64, 64};
   1802   __v2du __result;
   1803 
   1804   __rshift = vec_splat((__v2du)__B, 0);
   1805   __shmask = vec_cmplt(__rshift, __shmax);
   1806   __result = vec_sr((__v2du)__A, __rshift);
   1807   __result = vec_sel((__v2du)__shmask, __result, __shmask);
   1808 
   1809   return (__m128i)__result;
   1810 }
   1811 #endif
   1812 
   1813 extern __inline __m128d
   1814     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1815     _mm_and_pd(__m128d __A, __m128d __B) {
   1816   return (vec_and((__v2df)__A, (__v2df)__B));
   1817 }
   1818 
   1819 extern __inline __m128d
   1820     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1821     _mm_andnot_pd(__m128d __A, __m128d __B) {
   1822   return (vec_andc((__v2df)__B, (__v2df)__A));
   1823 }
   1824 
   1825 extern __inline __m128d
   1826     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1827     _mm_or_pd(__m128d __A, __m128d __B) {
   1828   return (vec_or((__v2df)__A, (__v2df)__B));
   1829 }
   1830 
   1831 extern __inline __m128d
   1832     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1833     _mm_xor_pd(__m128d __A, __m128d __B) {
   1834   return (vec_xor((__v2df)__A, (__v2df)__B));
   1835 }
   1836 
   1837 extern __inline __m128i
   1838     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1839     _mm_and_si128(__m128i __A, __m128i __B) {
   1840   return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
   1841 }
   1842 
   1843 extern __inline __m128i
   1844     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1845     _mm_andnot_si128(__m128i __A, __m128i __B) {
   1846   return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
   1847 }
   1848 
   1849 extern __inline __m128i
   1850     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1851     _mm_or_si128(__m128i __A, __m128i __B) {
   1852   return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
   1853 }
   1854 
   1855 extern __inline __m128i
   1856     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1857     _mm_xor_si128(__m128i __A, __m128i __B) {
   1858   return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
   1859 }
   1860 
   1861 extern __inline __m128i
   1862     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1863     _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
   1864   return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
   1865 }
   1866 
   1867 extern __inline __m128i
   1868     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1869     _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
   1870   return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
   1871 }
   1872 
   1873 extern __inline __m128i
   1874     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1875     _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
   1876   return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
   1877 }
   1878 
   1879 extern __inline __m128i
   1880     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1881     _mm_cmplt_epi8(__m128i __A, __m128i __B) {
   1882   return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
   1883 }
   1884 
   1885 extern __inline __m128i
   1886     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1887     _mm_cmplt_epi16(__m128i __A, __m128i __B) {
   1888   return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
   1889 }
   1890 
   1891 extern __inline __m128i
   1892     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1893     _mm_cmplt_epi32(__m128i __A, __m128i __B) {
   1894   return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
   1895 }
   1896 
   1897 extern __inline __m128i
   1898     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1899     _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
   1900   return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
   1901 }
   1902 
   1903 extern __inline __m128i
   1904     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1905     _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
   1906   return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
   1907 }
   1908 
   1909 extern __inline __m128i
   1910     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1911     _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
   1912   return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
   1913 }
   1914 
   1915 extern __inline int
   1916     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1917     _mm_extract_epi16(__m128i const __A, int const __N) {
   1918   return (unsigned short)((__v8hi)__A)[__N & 7];
   1919 }
   1920 
   1921 extern __inline __m128i
   1922     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1923     _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
   1924   __v8hi __result = (__v8hi)__A;
   1925 
   1926   __result[(__N & 7)] = __D;
   1927 
   1928   return (__m128i)__result;
   1929 }
   1930 
   1931 extern __inline __m128i
   1932     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1933     _mm_max_epi16(__m128i __A, __m128i __B) {
   1934   return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
   1935 }
   1936 
   1937 extern __inline __m128i
   1938     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1939     _mm_max_epu8(__m128i __A, __m128i __B) {
   1940   return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
   1941 }
   1942 
   1943 extern __inline __m128i
   1944     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1945     _mm_min_epi16(__m128i __A, __m128i __B) {
   1946   return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
   1947 }
   1948 
   1949 extern __inline __m128i
   1950     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1951     _mm_min_epu8(__m128i __A, __m128i __B) {
   1952   return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
   1953 }
   1954 
   1955 #ifdef _ARCH_PWR8
   1956 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
   1957 
   1958 /* Return a mask created from the most significant bit of each 8-bit
   1959    element in A.  */
   1960 extern __inline int
   1961     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1962     _mm_movemask_epi8(__m128i __A) {
   1963 #ifdef _ARCH_PWR10
   1964   return vec_extractm((__v16qu)__A);
   1965 #else
   1966   __vector unsigned long long __result;
   1967   static const __vector unsigned char __perm_mask = {
   1968       0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
   1969       0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
   1970 
   1971   __result = ((__vector unsigned long long)vec_vbpermq(
   1972       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
   1973 
   1974 #ifdef __LITTLE_ENDIAN__
   1975   return __result[1];
   1976 #else
   1977   return __result[0];
   1978 #endif
   1979 #endif /* !_ARCH_PWR10 */
   1980 }
   1981 #endif /* _ARCH_PWR8 */
   1982 
   1983 extern __inline __m128i
   1984     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1985     _mm_mulhi_epu16(__m128i __A, __m128i __B) {
   1986   __v4su __w0, __w1;
   1987   __v16qu __xform1 = {
   1988 #ifdef __LITTLE_ENDIAN__
   1989       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
   1990       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
   1991 #else
   1992       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
   1993       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
   1994 #endif
   1995   };
   1996 
   1997   __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
   1998   __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
   1999   return (__m128i)vec_perm(__w0, __w1, __xform1);
   2000 }
   2001 
   2002 extern __inline __m128i
   2003     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2004     _mm_shufflehi_epi16(__m128i __A, const int __mask) {
   2005   unsigned long __element_selector_98 = __mask & 0x03;
   2006   unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
   2007   unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
   2008   unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
   2009   static const unsigned short __permute_selectors[4] = {
   2010 #ifdef __LITTLE_ENDIAN__
   2011       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
   2012 #else
   2013       0x0809, 0x0A0B, 0x0C0D, 0x0E0F
   2014 #endif
   2015   };
   2016   __v2du __pmask =
   2017 #ifdef __LITTLE_ENDIAN__
   2018       {0x1716151413121110UL, 0UL};
   2019 #else
   2020       {0x1011121314151617UL, 0UL};
   2021 #endif
   2022   __m64_union __t;
   2023   __v2du __a, __r;
   2024 
   2025   __t.as_short[0] = __permute_selectors[__element_selector_98];
   2026   __t.as_short[1] = __permute_selectors[__element_selector_BA];
   2027   __t.as_short[2] = __permute_selectors[__element_selector_DC];
   2028   __t.as_short[3] = __permute_selectors[__element_selector_FE];
   2029   __pmask[1] = __t.as_m64;
   2030   __a = (__v2du)__A;
   2031   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
   2032   return (__m128i)__r;
   2033 }
   2034 
   2035 extern __inline __m128i
   2036     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2037     _mm_shufflelo_epi16(__m128i __A, const int __mask) {
   2038   unsigned long __element_selector_10 = __mask & 0x03;
   2039   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
   2040   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
   2041   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
   2042   static const unsigned short __permute_selectors[4] = {
   2043 #ifdef __LITTLE_ENDIAN__
   2044       0x0100, 0x0302, 0x0504, 0x0706
   2045 #else
   2046       0x0001, 0x0203, 0x0405, 0x0607
   2047 #endif
   2048   };
   2049   __v2du __pmask =
   2050 #ifdef __LITTLE_ENDIAN__
   2051       {0UL, 0x1f1e1d1c1b1a1918UL};
   2052 #else
   2053       {0UL, 0x18191a1b1c1d1e1fUL};
   2054 #endif
   2055   __m64_union __t;
   2056   __v2du __a, __r;
   2057   __t.as_short[0] = __permute_selectors[__element_selector_10];
   2058   __t.as_short[1] = __permute_selectors[__element_selector_32];
   2059   __t.as_short[2] = __permute_selectors[__element_selector_54];
   2060   __t.as_short[3] = __permute_selectors[__element_selector_76];
   2061   __pmask[0] = __t.as_m64;
   2062   __a = (__v2du)__A;
   2063   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
   2064   return (__m128i)__r;
   2065 }
   2066 
   2067 extern __inline __m128i
   2068     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2069     _mm_shuffle_epi32(__m128i __A, const int __mask) {
   2070   unsigned long __element_selector_10 = __mask & 0x03;
   2071   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
   2072   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
   2073   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
   2074   static const unsigned int __permute_selectors[4] = {
   2075 #ifdef __LITTLE_ENDIAN__
   2076       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
   2077 #else
   2078       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
   2079 #endif
   2080   };
   2081   __v4su __t;
   2082 
   2083   __t[0] = __permute_selectors[__element_selector_10];
   2084   __t[1] = __permute_selectors[__element_selector_32];
   2085   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
   2086   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
   2087   return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
   2088                            (__vector unsigned char)__t);
   2089 }
   2090 
   2091 extern __inline void
   2092     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2093     _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
   2094   __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
   2095   __v16qu __mask, __tmp;
   2096   __m128i_u *__p = (__m128i_u *)__C;
   2097 
   2098   __tmp = (__v16qu)_mm_loadu_si128(__p);
   2099   __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
   2100   __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
   2101   _mm_storeu_si128(__p, (__m128i)__tmp);
   2102 }
   2103 
   2104 extern __inline __m128i
   2105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2106     _mm_avg_epu8(__m128i __A, __m128i __B) {
   2107   return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
   2108 }
   2109 
   2110 extern __inline __m128i
   2111     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2112     _mm_avg_epu16(__m128i __A, __m128i __B) {
   2113   return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
   2114 }
   2115 
   2116 extern __inline __m128i
   2117     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2118     _mm_sad_epu8(__m128i __A, __m128i __B) {
   2119   __v16qu __a, __b;
   2120   __v16qu __vabsdiff;
   2121   __v4si __vsum;
   2122   const __v4su __zero = {0, 0, 0, 0};
   2123   __v4si __result;
   2124 
   2125   __a = (__v16qu)__A;
   2126   __b = (__v16qu)__B;
   2127 #ifndef _ARCH_PWR9
   2128   __v16qu __vmin = vec_min(__a, __b);
   2129   __v16qu __vmax = vec_max(__a, __b);
   2130   __vabsdiff = vec_sub(__vmax, __vmin);
   2131 #else
   2132   __vabsdiff = vec_absd(__a, __b);
   2133 #endif
   2134   /* Sum four groups of bytes into integers.  */
   2135   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
   2136 #ifdef __LITTLE_ENDIAN__
   2137   /* Sum across four integers with two integer results.  */
   2138   __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
   2139   /* Note: vec_sum2s could be used here, but on little-endian, vector
   2140      shifts are added that are not needed for this use-case.
   2141      A vector shift to correctly position the 32-bit integer results
   2142      (currently at [0] and [2]) to [1] and [3] would then need to be
   2143      swapped back again since the desired results are two 64-bit
   2144      integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
   2145 #else
   2146   /* Sum across four integers with two integer results.  */
   2147   __result = vec_sum2s(__vsum, (__vector signed int)__zero);
   2148   /* Rotate the sums into the correct position.  */
   2149   __result = vec_sld(__result, __result, 6);
   2150 #endif
   2151   return (__m128i)__result;
   2152 }
   2153 
   2154 extern __inline void
   2155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2156     _mm_stream_si32(int *__A, int __B) {
   2157   /* Use the data cache block touch for store transient.  */
   2158   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
   2159   *__A = __B;
   2160 }
   2161 
   2162 extern __inline void
   2163     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2164     _mm_stream_si64(long long int *__A, long long int __B) {
   2165   /* Use the data cache block touch for store transient.  */
   2166   __asm__("	dcbtstt	0,%0" : : "b"(__A) : "memory");
   2167   *__A = __B;
   2168 }
   2169 
   2170 extern __inline void
   2171     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2172     _mm_stream_si128(__m128i *__A, __m128i __B) {
   2173   /* Use the data cache block touch for store transient.  */
   2174   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
   2175   *__A = __B;
   2176 }
   2177 
   2178 extern __inline void
   2179     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2180     _mm_stream_pd(double *__A, __m128d __B) {
   2181   /* Use the data cache block touch for store transient.  */
   2182   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
   2183   *(__m128d *)__A = __B;
   2184 }
   2185 
   2186 extern __inline void
   2187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2188     _mm_clflush(void const *__A) {
   2189   /* Use the data cache block flush.  */
   2190   __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
   2191 }
   2192 
   /* Load fence: loads issued before this point stay ordered ahead of the
      memory accesses that follow it.  */
   extern __inline void
       __attribute__((__gnu_inline__, __always_inline__, __artificial__))
       _mm_lfence(void) {
     /* Acquire is the order that keeps later loads from being hoisted above
        the fence; a release fence only constrains later stores.  Expected
        to remain a light weight sync on PowerPC.  */
     __atomic_thread_fence(__ATOMIC_ACQUIRE);
   }
   2199 
   /* Full memory fence: orders every earlier memory access against every
      later one.  */
   extern __inline void
       __attribute__((__gnu_inline__, __always_inline__, __artificial__))
       _mm_mfence(void) {
     /* Sequentially consistent fence, i.e. the heavy weight sync.  */
     __atomic_thread_fence(__ATOMIC_SEQ_CST);
   }
   2206 
   2207 extern __inline __m128i
   2208     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2209     _mm_cvtsi32_si128(int __A) {
   2210   return _mm_set_epi32(0, 0, 0, __A);
   2211 }
   2212 
   2213 extern __inline __m128i
   2214     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2215     _mm_cvtsi64_si128(long long __A) {
   2216   return __extension__(__m128i)(__v2di){__A, 0LL};
   2217 }
   2218 
   2219 /* Microsoft intrinsic.  */
   2220 extern __inline __m128i
   2221     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2222     _mm_cvtsi64x_si128(long long __A) {
   2223   return __extension__(__m128i)(__v2di){__A, 0LL};
   2224 }
   2225 
   2226 /* Casts between various SP, DP, INT vector types.  Note that these do no
   2227    conversion of values, they just change the type.  */
   2228 extern __inline __m128
   2229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2230     _mm_castpd_ps(__m128d __A) {
   2231   return (__m128)__A;
   2232 }
   2233 
   2234 extern __inline __m128i
   2235     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2236     _mm_castpd_si128(__m128d __A) {
   2237   return (__m128i)__A;
   2238 }
   2239 
   2240 extern __inline __m128d
   2241     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2242     _mm_castps_pd(__m128 __A) {
   2243   return (__m128d)__A;
   2244 }
   2245 
   2246 extern __inline __m128i
   2247     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2248     _mm_castps_si128(__m128 __A) {
   2249   return (__m128i)__A;
   2250 }
   2251 
   2252 extern __inline __m128
   2253     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2254     _mm_castsi128_ps(__m128i __A) {
   2255   return (__m128)__A;
   2256 }
   2257 
   2258 extern __inline __m128d
   2259     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   2260     _mm_castsi128_pd(__m128i __A) {
   2261   return (__m128d)__A;
   2262 }
   2263 
   2264 #else
   2265 #include_next <emmintrin.h>
   2266 #endif /* defined(__powerpc64__) &&                                            \
   2267         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
   2268 
   2269 #endif /* EMMINTRIN_H_ */