zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

xmmintrin.h (64333B) - Raw


      1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 /* Implemented from the specification included in the Intel C++ Compiler
     11    User Guide and Reference, version 9.0.  */
     12 
     13 #ifndef NO_WARN_X86_INTRINSICS
     14 /* This header file is to help porting code using Intel intrinsics
     15    explicitly from x86_64 to powerpc64/powerpc64le.
     16 
     17    Since X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
     18    VMX/VSX ISA is a good match for vector float SIMD operations.
     19    However scalar float operations in vector (XMM) registers require
     20    the POWER8 VSX ISA (2.07) level. There are differences for data
     21    format and placement of float scalars in the vector register, which
     22    require extra steps to match SSE scalar float semantics on POWER.
     23 
     24    It should be noted that there are significant differences between
     25    X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to
     26    use portable <fenv.h> instead of accessing MXCSR directly.
     27 
     28    Most SSE scalar float intrinsic operations can be performed more
     29    efficiently as C language float scalar operations or optimized to
     30    use vector SIMD operations. We recommend this for new applications. */
     31 #error                                                                         \
     32     "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
     33 #endif
     34 
     35 #ifndef XMMINTRIN_H_
     36 #define XMMINTRIN_H_
     37 
     38 #if defined(__powerpc64__) &&                                                  \
     39     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
     40 
     41 /* Define four value permute mask */
     42 #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
     43 
     44 #include <altivec.h>
     45 
     46 /* Avoid collisions between altivec.h and strict adherence to C++ and
     47    C11 standards.  This should eventually be done inside altivec.h itself,
     48    but only after testing a full distro build.  */
     49 #if defined(__STRICT_ANSI__) &&                                                \
     50     (defined(__cplusplus) ||                                                   \
     51      (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
     52 #undef vector
     53 #undef pixel
     54 #undef bool
     55 #endif
     56 
     57 /* We need type definitions from the MMX header file.  */
     58 #include <mmintrin.h>
     59 
     60 /* Get _mm_malloc () and _mm_free ().  */
     61 #if __STDC_HOSTED__
     62 #include <mm_malloc.h>
     63 #endif
     64 
     65 /* The Intel API is flexible enough that we must allow aliasing with other
     66    vector types, and their scalar components.  */
     67 typedef vector float __m128 __attribute__((__may_alias__));
     68 
     69 /* Unaligned version of the same type.  */
     70 typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
     71 
     72 /* Internal data types for implementing the intrinsics.  */
     73 typedef vector float __v4sf;
     74 
     75 /* Create an undefined vector.  */
     76 extern __inline __m128
     77     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     78     _mm_undefined_ps(void) {
     79   __m128 __Y = __Y;
     80   return __Y;
     81 }
     82 
     83 /* Create a vector of zeros.  */
     84 extern __inline __m128
     85     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     86     _mm_setzero_ps(void) {
     87   return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
     88 }
     89 
     90 /* Load four SPFP values from P.  The address must be 16-byte aligned.  */
     91 extern __inline __m128
     92     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     93     _mm_load_ps(float const *__P) {
     94   return ((__m128)vec_ld(0, (__v4sf *)__P));
     95 }
     96 
     97 /* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
     98 extern __inline __m128
     99     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    100     _mm_loadu_ps(float const *__P) {
    101   return (vec_vsx_ld(0, __P));
    102 }
    103 
    104 /* Load four SPFP values in reverse order.  The address must be aligned.  */
    105 extern __inline __m128
    106     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    107     _mm_loadr_ps(float const *__P) {
    108   __v4sf __tmp;
    109   __m128 __result;
    110   static const __vector unsigned char __permute_vector = {
    111       0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
    112       0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
    113 
    114   __tmp = vec_ld(0, (__v4sf *)__P);
    115   __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
    116   return __result;
    117 }
    118 
    119 /* Create a vector with all four elements equal to F.  */
    120 extern __inline __m128
    121     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    122     _mm_set1_ps(float __F) {
    123   return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
    124 }
    125 
    126 extern __inline __m128
    127     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    128     _mm_set_ps1(float __F) {
    129   return _mm_set1_ps(__F);
    130 }
    131 
    132 /* Create the vector [Z Y X W].  */
    133 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
    134                                       __artificial__))
    135 _mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
    136   return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
    137 }
    138 
    139 /* Create the vector [W X Y Z].  */
    140 extern __inline __m128
    141     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    142     _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
    143   return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
    144 }
    145 
    146 /* Store four SPFP values.  The address must be 16-byte aligned.  */
    147 extern __inline void
    148     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    149     _mm_store_ps(float *__P, __m128 __A) {
    150   vec_st((__v4sf)__A, 0, (__v4sf *)__P);
    151 }
    152 
    153 /* Store four SPFP values.  The address need not be 16-byte aligned.  */
    154 extern __inline void
    155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    156     _mm_storeu_ps(float *__P, __m128 __A) {
    157   *(__m128_u *)__P = __A;
    158 }
    159 
    160 /* Store four SPFP values in reverse order.  The address must be aligned.  */
    161 extern __inline void
    162     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    163     _mm_storer_ps(float *__P, __m128 __A) {
    164   __v4sf __tmp;
    165   static const __vector unsigned char __permute_vector = {
    166       0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
    167       0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
    168 
    169   __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
    170 
    171   _mm_store_ps(__P, __tmp);
    172 }
    173 
    174 /* Store the lower SPFP value across four words.  */
    175 extern __inline void
    176     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    177     _mm_store1_ps(float *__P, __m128 __A) {
    178   __v4sf __va = vec_splat((__v4sf)__A, 0);
    179   _mm_store_ps(__P, __va);
    180 }
    181 
    182 extern __inline void
    183     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    184     _mm_store_ps1(float *__P, __m128 __A) {
    185   _mm_store1_ps(__P, __A);
    186 }
    187 
    188 /* Create a vector with element 0 as F and the rest zero.  */
    189 extern __inline __m128
    190     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    191     _mm_set_ss(float __F) {
    192   return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
    193 }
    194 
    195 /* Sets the low SPFP value of A from the low value of B.  */
    196 extern __inline __m128
    197     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    198     _mm_move_ss(__m128 __A, __m128 __B) {
    199   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    200 
    201   return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
    202 }
    203 
    204 /* Create a vector with element 0 as *P and the rest zero.  */
    205 extern __inline __m128
    206     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    207     _mm_load_ss(float const *__P) {
    208   return _mm_set_ss(*__P);
    209 }
    210 
    211 /* Stores the lower SPFP value.  */
    212 extern __inline void
    213     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    214     _mm_store_ss(float *__P, __m128 __A) {
    215   *__P = ((__v4sf)__A)[0];
    216 }
    217 
    218 /* Perform the respective operation on the lower SPFP (single-precision
    219    floating-point) values of A and B; the upper three SPFP values are
    220    passed through from A.  */
    221 
    222 extern __inline __m128
    223     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    224     _mm_add_ss(__m128 __A, __m128 __B) {
    225 #ifdef _ARCH_PWR7
    226   __m128 __a, __b, __c;
    227   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    228   /* PowerISA VSX does not allow partial (for just lower double)
    229      results. So to insure we don't generate spurious exceptions
    230      (from the upper double values) we splat the lower double
    231      before we to the operation.  */
    232   __a = vec_splat(__A, 0);
    233   __b = vec_splat(__B, 0);
    234   __c = __a + __b;
    235   /* Then we merge the lower float result with the original upper
    236      float elements from __A.  */
    237   return (vec_sel(__A, __c, __mask));
    238 #else
    239   __A[0] = __A[0] + __B[0];
    240   return (__A);
    241 #endif
    242 }
    243 
    244 extern __inline __m128
    245     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    246     _mm_sub_ss(__m128 __A, __m128 __B) {
    247 #ifdef _ARCH_PWR7
    248   __m128 __a, __b, __c;
    249   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    250   /* PowerISA VSX does not allow partial (for just lower double)
    251      results. So to insure we don't generate spurious exceptions
    252      (from the upper double values) we splat the lower double
    253      before we to the operation.  */
    254   __a = vec_splat(__A, 0);
    255   __b = vec_splat(__B, 0);
    256   __c = __a - __b;
    257   /* Then we merge the lower float result with the original upper
    258      float elements from __A.  */
    259   return (vec_sel(__A, __c, __mask));
    260 #else
    261   __A[0] = __A[0] - __B[0];
    262   return (__A);
    263 #endif
    264 }
    265 
    266 extern __inline __m128
    267     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    268     _mm_mul_ss(__m128 __A, __m128 __B) {
    269 #ifdef _ARCH_PWR7
    270   __m128 __a, __b, __c;
    271   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    272   /* PowerISA VSX does not allow partial (for just lower double)
    273      results. So to insure we don't generate spurious exceptions
    274      (from the upper double values) we splat the lower double
    275      before we to the operation.  */
    276   __a = vec_splat(__A, 0);
    277   __b = vec_splat(__B, 0);
    278   __c = __a * __b;
    279   /* Then we merge the lower float result with the original upper
    280      float elements from __A.  */
    281   return (vec_sel(__A, __c, __mask));
    282 #else
    283   __A[0] = __A[0] * __B[0];
    284   return (__A);
    285 #endif
    286 }
    287 
    288 extern __inline __m128
    289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    290     _mm_div_ss(__m128 __A, __m128 __B) {
    291 #ifdef _ARCH_PWR7
    292   __m128 __a, __b, __c;
    293   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    294   /* PowerISA VSX does not allow partial (for just lower double)
    295      results. So to insure we don't generate spurious exceptions
    296      (from the upper double values) we splat the lower double
    297      before we to the operation.  */
    298   __a = vec_splat(__A, 0);
    299   __b = vec_splat(__B, 0);
    300   __c = __a / __b;
    301   /* Then we merge the lower float result with the original upper
    302      float elements from __A.  */
    303   return (vec_sel(__A, __c, __mask));
    304 #else
    305   __A[0] = __A[0] / __B[0];
    306   return (__A);
    307 #endif
    308 }
    309 
    310 extern __inline __m128
    311     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    312     _mm_sqrt_ss(__m128 __A) {
    313   __m128 __a, __c;
    314   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    315   /* PowerISA VSX does not allow partial (for just lower double)
    316    * results. So to insure we don't generate spurious exceptions
    317    * (from the upper double values) we splat the lower double
    318    * before we to the operation. */
    319   __a = vec_splat(__A, 0);
    320   __c = vec_sqrt(__a);
    321   /* Then we merge the lower float result with the original upper
    322    * float elements from __A.  */
    323   return (vec_sel(__A, __c, __mask));
    324 }
    325 
    326 /* Perform the respective operation on the four SPFP values in A and B.  */
    327 extern __inline __m128
    328     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    329     _mm_add_ps(__m128 __A, __m128 __B) {
    330   return (__m128)((__v4sf)__A + (__v4sf)__B);
    331 }
    332 
    333 extern __inline __m128
    334     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    335     _mm_sub_ps(__m128 __A, __m128 __B) {
    336   return (__m128)((__v4sf)__A - (__v4sf)__B);
    337 }
    338 
    339 extern __inline __m128
    340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    341     _mm_mul_ps(__m128 __A, __m128 __B) {
    342   return (__m128)((__v4sf)__A * (__v4sf)__B);
    343 }
    344 
    345 extern __inline __m128
    346     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    347     _mm_div_ps(__m128 __A, __m128 __B) {
    348   return (__m128)((__v4sf)__A / (__v4sf)__B);
    349 }
    350 
    351 extern __inline __m128
    352     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    353     _mm_sqrt_ps(__m128 __A) {
    354   return (vec_sqrt((__v4sf)__A));
    355 }
    356 
    357 extern __inline __m128
    358     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    359     _mm_rcp_ps(__m128 __A) {
    360   return (vec_re((__v4sf)__A));
    361 }
    362 
    363 extern __inline __m128
    364     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    365     _mm_rsqrt_ps(__m128 __A) {
    366   return (vec_rsqrte(__A));
    367 }
    368 
    369 extern __inline __m128
    370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    371     _mm_rcp_ss(__m128 __A) {
    372   __m128 __a, __c;
    373   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    374   /* PowerISA VSX does not allow partial (for just lower double)
    375    * results. So to insure we don't generate spurious exceptions
    376    * (from the upper double values) we splat the lower double
    377    * before we to the operation. */
    378   __a = vec_splat(__A, 0);
    379   __c = _mm_rcp_ps(__a);
    380   /* Then we merge the lower float result with the original upper
    381    * float elements from __A.  */
    382   return (vec_sel(__A, __c, __mask));
    383 }
    384 
    385 extern __inline __m128
    386     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    387     _mm_rsqrt_ss(__m128 __A) {
    388   __m128 __a, __c;
    389   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    390   /* PowerISA VSX does not allow partial (for just lower double)
    391    * results. So to insure we don't generate spurious exceptions
    392    * (from the upper double values) we splat the lower double
    393    * before we to the operation. */
    394   __a = vec_splat(__A, 0);
    395   __c = vec_rsqrte(__a);
    396   /* Then we merge the lower float result with the original upper
    397    * float elements from __A.  */
    398   return (vec_sel(__A, __c, __mask));
    399 }
    400 
    401 extern __inline __m128
    402     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    403     _mm_min_ss(__m128 __A, __m128 __B) {
    404   __v4sf __a, __b, __c;
    405   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    406   /* PowerISA VSX does not allow partial (for just lower float)
    407    * results. So to insure we don't generate spurious exceptions
    408    * (from the upper float values) we splat the lower float
    409    * before we to the operation. */
    410   __a = vec_splat((__v4sf)__A, 0);
    411   __b = vec_splat((__v4sf)__B, 0);
    412   __c = vec_min(__a, __b);
    413   /* Then we merge the lower float result with the original upper
    414    * float elements from __A.  */
    415   return (vec_sel((__v4sf)__A, __c, __mask));
    416 }
    417 
    418 extern __inline __m128
    419     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    420     _mm_max_ss(__m128 __A, __m128 __B) {
    421   __v4sf __a, __b, __c;
    422   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    423   /* PowerISA VSX does not allow partial (for just lower float)
    424    * results. So to insure we don't generate spurious exceptions
    425    * (from the upper float values) we splat the lower float
    426    * before we to the operation. */
    427   __a = vec_splat(__A, 0);
    428   __b = vec_splat(__B, 0);
    429   __c = vec_max(__a, __b);
    430   /* Then we merge the lower float result with the original upper
    431    * float elements from __A.  */
    432   return (vec_sel((__v4sf)__A, __c, __mask));
    433 }
    434 
    435 extern __inline __m128
    436     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    437     _mm_min_ps(__m128 __A, __m128 __B) {
    438   __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
    439   return vec_sel(__B, __A, __m);
    440 }
    441 
    442 extern __inline __m128
    443     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    444     _mm_max_ps(__m128 __A, __m128 __B) {
    445   __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
    446   return vec_sel(__B, __A, __m);
    447 }
    448 
    449 /* Perform logical bit-wise operations on 128-bit values.  */
    450 extern __inline __m128
    451     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    452     _mm_and_ps(__m128 __A, __m128 __B) {
    453   return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
    454   //  return __builtin_ia32_andps (__A, __B);
    455 }
    456 
    457 extern __inline __m128
    458     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    459     _mm_andnot_ps(__m128 __A, __m128 __B) {
    460   return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
    461 }
    462 
    463 extern __inline __m128
    464     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    465     _mm_or_ps(__m128 __A, __m128 __B) {
    466   return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
    467 }
    468 
    469 extern __inline __m128
    470     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    471     _mm_xor_ps(__m128 __A, __m128 __B) {
    472   return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
    473 }
    474 
    475 /* Perform a comparison on the four SPFP values of A and B.  For each
    476    element, if the comparison is true, place a mask of all ones in the
    477    result, otherwise a mask of zeros.  */
    478 extern __inline __m128
    479     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    480     _mm_cmpeq_ps(__m128 __A, __m128 __B) {
    481   return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
    482 }
    483 
    484 extern __inline __m128
    485     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    486     _mm_cmplt_ps(__m128 __A, __m128 __B) {
    487   return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
    488 }
    489 
    490 extern __inline __m128
    491     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    492     _mm_cmple_ps(__m128 __A, __m128 __B) {
    493   return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
    494 }
    495 
    496 extern __inline __m128
    497     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    498     _mm_cmpgt_ps(__m128 __A, __m128 __B) {
    499   return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
    500 }
    501 
    502 extern __inline __m128
    503     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    504     _mm_cmpge_ps(__m128 __A, __m128 __B) {
    505   return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
    506 }
    507 
    508 extern __inline __m128
    509     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    510     _mm_cmpneq_ps(__m128 __A, __m128 __B) {
    511   __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
    512   return ((__m128)vec_nor(__temp, __temp));
    513 }
    514 
    515 extern __inline __m128
    516     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    517     _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
    518   return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
    519 }
    520 
    521 extern __inline __m128
    522     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    523     _mm_cmpnle_ps(__m128 __A, __m128 __B) {
    524   return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
    525 }
    526 
    527 extern __inline __m128
    528     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    529     _mm_cmpngt_ps(__m128 __A, __m128 __B) {
    530   return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
    531 }
    532 
    533 extern __inline __m128
    534     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    535     _mm_cmpnge_ps(__m128 __A, __m128 __B) {
    536   return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
    537 }
    538 
    539 extern __inline __m128
    540     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    541     _mm_cmpord_ps(__m128 __A, __m128 __B) {
    542   __vector unsigned int __a, __b;
    543   __vector unsigned int __c, __d;
    544   static const __vector unsigned int __float_exp_mask = {
    545       0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
    546 
    547   __a = (__vector unsigned int)vec_abs((__v4sf)__A);
    548   __b = (__vector unsigned int)vec_abs((__v4sf)__B);
    549   __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
    550   __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
    551   return ((__m128)vec_and(__c, __d));
    552 }
    553 
    554 extern __inline __m128
    555     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    556     _mm_cmpunord_ps(__m128 __A, __m128 __B) {
    557   __vector unsigned int __a, __b;
    558   __vector unsigned int __c, __d;
    559   static const __vector unsigned int __float_exp_mask = {
    560       0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
    561 
    562   __a = (__vector unsigned int)vec_abs((__v4sf)__A);
    563   __b = (__vector unsigned int)vec_abs((__v4sf)__B);
    564   __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
    565   __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
    566   return ((__m128)vec_or(__c, __d));
    567 }
    568 
    569 /* Perform a comparison on the lower SPFP values of A and B.  If the
    570    comparison is true, place a mask of all ones in the result, otherwise a
    571    mask of zeros.  The upper three SPFP values are passed through from A.  */
    572 extern __inline __m128
    573     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    574     _mm_cmpeq_ss(__m128 __A, __m128 __B) {
    575   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    576   __v4sf __a, __b, __c;
    577   /* PowerISA VMX does not allow partial (for just element 0)
    578    * results. So to insure we don't generate spurious exceptions
    579    * (from the upper elements) we splat the lower float
    580    * before we to the operation. */
    581   __a = vec_splat((__v4sf)__A, 0);
    582   __b = vec_splat((__v4sf)__B, 0);
    583   __c = (__v4sf)vec_cmpeq(__a, __b);
    584   /* Then we merge the lower float result with the original upper
    585    * float elements from __A.  */
    586   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    587 }
    588 
    589 extern __inline __m128
    590     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    591     _mm_cmplt_ss(__m128 __A, __m128 __B) {
    592   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    593   __v4sf __a, __b, __c;
    594   /* PowerISA VMX does not allow partial (for just element 0)
    595    * results. So to insure we don't generate spurious exceptions
    596    * (from the upper elements) we splat the lower float
    597    * before we to the operation. */
    598   __a = vec_splat((__v4sf)__A, 0);
    599   __b = vec_splat((__v4sf)__B, 0);
    600   __c = (__v4sf)vec_cmplt(__a, __b);
    601   /* Then we merge the lower float result with the original upper
    602    * float elements from __A.  */
    603   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    604 }
    605 
    606 extern __inline __m128
    607     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    608     _mm_cmple_ss(__m128 __A, __m128 __B) {
    609   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    610   __v4sf __a, __b, __c;
    611   /* PowerISA VMX does not allow partial (for just element 0)
    612    * results. So to insure we don't generate spurious exceptions
    613    * (from the upper elements) we splat the lower float
    614    * before we to the operation. */
    615   __a = vec_splat((__v4sf)__A, 0);
    616   __b = vec_splat((__v4sf)__B, 0);
    617   __c = (__v4sf)vec_cmple(__a, __b);
    618   /* Then we merge the lower float result with the original upper
    619    * float elements from __A.  */
    620   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    621 }
    622 
    623 extern __inline __m128
    624     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    625     _mm_cmpgt_ss(__m128 __A, __m128 __B) {
    626   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    627   __v4sf __a, __b, __c;
    628   /* PowerISA VMX does not allow partial (for just element 0)
    629    * results. So to insure we don't generate spurious exceptions
    630    * (from the upper elements) we splat the lower float
    631    * before we to the operation. */
    632   __a = vec_splat((__v4sf)__A, 0);
    633   __b = vec_splat((__v4sf)__B, 0);
    634   __c = (__v4sf)vec_cmpgt(__a, __b);
    635   /* Then we merge the lower float result with the original upper
    636    * float elements from __A.  */
    637   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    638 }
    639 
    640 extern __inline __m128
    641     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    642     _mm_cmpge_ss(__m128 __A, __m128 __B) {
    643   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    644   __v4sf __a, __b, __c;
    645   /* PowerISA VMX does not allow partial (for just element 0)
    646    * results. So to insure we don't generate spurious exceptions
    647    * (from the upper elements) we splat the lower float
    648    * before we to the operation. */
    649   __a = vec_splat((__v4sf)__A, 0);
    650   __b = vec_splat((__v4sf)__B, 0);
    651   __c = (__v4sf)vec_cmpge(__a, __b);
    652   /* Then we merge the lower float result with the original upper
    653    * float elements from __A.  */
    654   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    655 }
    656 
    657 extern __inline __m128
    658     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    659     _mm_cmpneq_ss(__m128 __A, __m128 __B) {
    660   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    661   __v4sf __a, __b, __c;
    662   /* PowerISA VMX does not allow partial (for just element 0)
    663    * results. So to insure we don't generate spurious exceptions
    664    * (from the upper elements) we splat the lower float
    665    * before we to the operation. */
    666   __a = vec_splat((__v4sf)__A, 0);
    667   __b = vec_splat((__v4sf)__B, 0);
    668   __c = (__v4sf)vec_cmpeq(__a, __b);
    669   __c = vec_nor(__c, __c);
    670   /* Then we merge the lower float result with the original upper
    671    * float elements from __A.  */
    672   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    673 }
    674 
    675 extern __inline __m128
    676     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    677     _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
    678   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    679   __v4sf __a, __b, __c;
    680   /* PowerISA VMX does not allow partial (for just element 0)
    681    * results. So to insure we don't generate spurious exceptions
    682    * (from the upper elements) we splat the lower float
    683    * before we to the operation. */
    684   __a = vec_splat((__v4sf)__A, 0);
    685   __b = vec_splat((__v4sf)__B, 0);
    686   __c = (__v4sf)vec_cmpge(__a, __b);
    687   /* Then we merge the lower float result with the original upper
    688    * float elements from __A.  */
    689   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    690 }
    691 
    692 extern __inline __m128
    693     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    694     _mm_cmpnle_ss(__m128 __A, __m128 __B) {
    695   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    696   __v4sf __a, __b, __c;
    697   /* PowerISA VMX does not allow partial (for just element 0)
    698    * results. So to insure we don't generate spurious exceptions
    699    * (from the upper elements) we splat the lower float
    700    * before we to the operation. */
    701   __a = vec_splat((__v4sf)__A, 0);
    702   __b = vec_splat((__v4sf)__B, 0);
    703   __c = (__v4sf)vec_cmpgt(__a, __b);
    704   /* Then we merge the lower float result with the original upper
    705    * float elements from __A.  */
    706   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    707 }
    708 
    709 extern __inline __m128
    710     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    711     _mm_cmpngt_ss(__m128 __A, __m128 __B) {
    712   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    713   __v4sf __a, __b, __c;
    714   /* PowerISA VMX does not allow partial (for just element 0)
    715    * results. So to insure we don't generate spurious exceptions
    716    * (from the upper elements) we splat the lower float
    717    * before we to the operation. */
    718   __a = vec_splat((__v4sf)__A, 0);
    719   __b = vec_splat((__v4sf)__B, 0);
    720   __c = (__v4sf)vec_cmple(__a, __b);
    721   /* Then we merge the lower float result with the original upper
    722    * float elements from __A.  */
    723   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    724 }
    725 
    726 extern __inline __m128
    727     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    728     _mm_cmpnge_ss(__m128 __A, __m128 __B) {
    729   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    730   __v4sf __a, __b, __c;
    731   /* PowerISA VMX does not allow partial (for just element 0)
    732    * results. So to insure we don't generate spurious exceptions
    733    * (from the upper elements) we splat the lower float
    734    * before we do the operation. */
    735   __a = vec_splat((__v4sf)__A, 0);
    736   __b = vec_splat((__v4sf)__B, 0);
    737   __c = (__v4sf)vec_cmplt(__a, __b);
    738   /* Then we merge the lower float result with the original upper
    739    * float elements from __A.  */
    740   return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
    741 }
    742 
    743 extern __inline __m128
    744     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    745     _mm_cmpord_ss(__m128 __A, __m128 __B) {
    746   __vector unsigned int __a, __b;
    747   __vector unsigned int __c, __d;
    748   static const __vector unsigned int __float_exp_mask = {
    749       0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
    750   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    751 
    752   __a = (__vector unsigned int)vec_abs((__v4sf)__A);
    753   __b = (__vector unsigned int)vec_abs((__v4sf)__B);
    754   __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
    755   __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
    756   __c = vec_and(__c, __d);
    757   /* Then we merge the lower float result with the original upper
    758    * float elements from __A.  */
    759   return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
    760 }
    761 
    762 extern __inline __m128
    763     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    764     _mm_cmpunord_ss(__m128 __A, __m128 __B) {
    765   __vector unsigned int __a, __b;
    766   __vector unsigned int __c, __d;
    767   static const __vector unsigned int __float_exp_mask = {
    768       0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
    769   static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
    770 
    771   __a = (__vector unsigned int)vec_abs((__v4sf)__A);
    772   __b = (__vector unsigned int)vec_abs((__v4sf)__B);
    773   __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
    774   __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
    775   __c = vec_or(__c, __d);
    776   /* Then we merge the lower float result with the original upper
    777    * float elements from __A.  */
    778   return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
    779 }
    780 
    781 /* Compare the lower SPFP values of A and B and return 1 if true
    782    and 0 if false.  */
    783 extern __inline int
    784     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    785     _mm_comieq_ss(__m128 __A, __m128 __B) {
    786   return (__A[0] == __B[0]);
    787 }
    788 
    789 extern __inline int
    790     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    791     _mm_comilt_ss(__m128 __A, __m128 __B) {
    792   return (__A[0] < __B[0]);
    793 }
    794 
    795 extern __inline int
    796     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    797     _mm_comile_ss(__m128 __A, __m128 __B) {
    798   return (__A[0] <= __B[0]);
    799 }
    800 
    801 extern __inline int
    802     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    803     _mm_comigt_ss(__m128 __A, __m128 __B) {
    804   return (__A[0] > __B[0]);
    805 }
    806 
    807 extern __inline int
    808     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    809     _mm_comige_ss(__m128 __A, __m128 __B) {
    810   return (__A[0] >= __B[0]);
    811 }
    812 
    813 extern __inline int
    814     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    815     _mm_comineq_ss(__m128 __A, __m128 __B) {
    816   return (__A[0] != __B[0]);
    817 }
    818 
    819 /* FIXME
    820  * The _mm_ucomi??_ss implementations below are exactly the same as
    821  * _mm_comi??_ss because GCC for PowerPC only generates unordered
    822  * compares (scalar and vector).
    823  * Technically _mm_comieq_ss et al. should be using the ordered
    824  * compare and signal for QNaNs.
    825  * The _mm_ucomieq_ss et al. should be OK, as is.
    826  */
    827 extern __inline int
    828     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    829     _mm_ucomieq_ss(__m128 __A, __m128 __B) {
    830   return (__A[0] == __B[0]);
    831 }
    832 
    833 extern __inline int
    834     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    835     _mm_ucomilt_ss(__m128 __A, __m128 __B) {
    836   return (__A[0] < __B[0]);
    837 }
    838 
    839 extern __inline int
    840     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    841     _mm_ucomile_ss(__m128 __A, __m128 __B) {
    842   return (__A[0] <= __B[0]);
    843 }
    844 
    845 extern __inline int
    846     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    847     _mm_ucomigt_ss(__m128 __A, __m128 __B) {
    848   return (__A[0] > __B[0]);
    849 }
    850 
    851 extern __inline int
    852     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    853     _mm_ucomige_ss(__m128 __A, __m128 __B) {
    854   return (__A[0] >= __B[0]);
    855 }
    856 
    857 extern __inline int
    858     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    859     _mm_ucomineq_ss(__m128 __A, __m128 __B) {
    860   return (__A[0] != __B[0]);
    861 }
    862 
    863 extern __inline float
    864     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    865     _mm_cvtss_f32(__m128 __A) {
    866   return ((__v4sf)__A)[0];
    867 }
    868 
    869 /* Convert the lower SPFP value to a 32-bit integer according to the current
    870    rounding mode.  */
    871 extern __inline int
    872     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    873     _mm_cvtss_si32(__m128 __A) {
    874   int __res;
    875 #ifdef _ARCH_PWR8
    876   double __dtmp;
    877   __asm__(
    878 #ifdef __LITTLE_ENDIAN__
    879       "xxsldwi %x0,%x0,%x0,3;\n"
    880 #endif
    881       "xscvspdp %x2,%x0;\n"
    882       "fctiw  %2,%2;\n"
    883       "mfvsrd  %1,%x2;\n"
    884       : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
    885       :);
    886 #else
    887   __res = __builtin_rint(__A[0]);
    888 #endif
    889   return __res;
    890 }
    891 
    892 extern __inline int
    893     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    894     _mm_cvt_ss2si(__m128 __A) {
    895   return _mm_cvtss_si32(__A);
    896 }
    897 
    898 /* Convert the lower SPFP value to a 32-bit integer according to the
    899    current rounding mode.  */
    900 
    901 /* Intel intrinsic.  */
    902 extern __inline long long
    903     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    904     _mm_cvtss_si64(__m128 __A) {
    905   long long __res;
    906 #if defined(_ARCH_PWR8) && defined(__powerpc64__)
    907   double __dtmp;
    908   __asm__(
    909 #ifdef __LITTLE_ENDIAN__
    910       "xxsldwi %x0,%x0,%x0,3;\n"
    911 #endif
    912       "xscvspdp %x2,%x0;\n"
    913       "fctid  %2,%2;\n"
    914       "mfvsrd  %1,%x2;\n"
    915       : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
    916       :);
    917 #else
    918   __res = __builtin_llrint(__A[0]);
    919 #endif
    920   return __res;
    921 }
    922 
    923 /* Microsoft intrinsic.  */
    924 extern __inline long long
    925     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    926     _mm_cvtss_si64x(__m128 __A) {
    927   return _mm_cvtss_si64((__v4sf)__A);
    928 }
    929 
    930 /* Constants for use with _mm_prefetch.  */
    931 enum _mm_hint {
    932   /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
    933   _MM_HINT_ET0 = 7,
    934   _MM_HINT_ET1 = 6,
    935   _MM_HINT_T0 = 3,
    936   _MM_HINT_T1 = 2,
    937   _MM_HINT_T2 = 1,
    938   _MM_HINT_NTA = 0
    939 };
    940 
    941 /* Loads one cache line from address P to a location "closer" to the
    942    processor.  The selector I specifies the type of prefetch operation.  */
    943 extern __inline void
    944     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    945     _mm_prefetch(const void *__P, enum _mm_hint __I) {
    946   /* Current PowerPC will ignores the hint parameters.  */
    947   __builtin_prefetch(__P);
    948 }
    949 
    950 /* Convert the two lower SPFP values to 32-bit integers according to the
    951    current rounding mode.  Return the integers in packed form.  */
    952 extern __inline __m64
    953     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    954     _mm_cvtps_pi32(__m128 __A) {
    955   /* Splat two lower SPFP values to both halves.  */
    956   __v4sf __temp, __rounded;
    957   __vector unsigned long long __result;
    958 
    959   /* Splat two lower SPFP values to both halves.  */
    960   __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
    961   __rounded = vec_rint(__temp);
    962   __result = (__vector unsigned long long)vec_cts(__rounded, 0);
    963 
    964   return (__m64)((__vector long long)__result)[0];
    965 }
    966 
    967 extern __inline __m64
    968     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    969     _mm_cvt_ps2pi(__m128 __A) {
    970   return _mm_cvtps_pi32(__A);
    971 }
    972 
    973 /* Truncate the lower SPFP value to a 32-bit integer.  */
    974 extern __inline int
    975     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    976     _mm_cvttss_si32(__m128 __A) {
    977   /* Extract the lower float element.  */
    978   float __temp = __A[0];
    979   /* truncate to 32-bit integer and return.  */
    980   return __temp;
    981 }
    982 
    983 extern __inline int
    984     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    985     _mm_cvtt_ss2si(__m128 __A) {
    986   return _mm_cvttss_si32(__A);
    987 }
    988 
    989 /* Intel intrinsic.  */
    990 extern __inline long long
    991     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    992     _mm_cvttss_si64(__m128 __A) {
    993   /* Extract the lower float element.  */
    994   float __temp = __A[0];
    995   /* truncate to 32-bit integer and return.  */
    996   return __temp;
    997 }
    998 
    999 /* Microsoft intrinsic.  */
   1000 extern __inline long long
   1001     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1002     _mm_cvttss_si64x(__m128 __A) {
   1003   /* Extract the lower float element.  */
   1004   float __temp = __A[0];
   1005   /* truncate to 32-bit integer and return.  */
   1006   return __temp;
   1007 }
   1008 
   1009 /* Truncate the two lower SPFP values to 32-bit integers.  Return the
   1010    integers in packed form.  */
   1011 extern __inline __m64
   1012     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1013     _mm_cvttps_pi32(__m128 __A) {
   1014   __v4sf __temp;
   1015   __vector unsigned long long __result;
   1016 
   1017   /* Splat two lower SPFP values to both halves.  */
   1018   __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
   1019   __result = (__vector unsigned long long)vec_cts(__temp, 0);
   1020 
   1021   return (__m64)((__vector long long)__result)[0];
   1022 }
   1023 
   1024 extern __inline __m64
   1025     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1026     _mm_cvtt_ps2pi(__m128 __A) {
   1027   return _mm_cvttps_pi32(__A);
   1028 }
   1029 
   1030 /* Convert B to a SPFP value and insert it as element zero in A.  */
   1031 extern __inline __m128
   1032     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1033     _mm_cvtsi32_ss(__m128 __A, int __B) {
   1034   float __temp = __B;
   1035   __A[0] = __temp;
   1036 
   1037   return __A;
   1038 }
   1039 
   1040 extern __inline __m128
   1041     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1042     _mm_cvt_si2ss(__m128 __A, int __B) {
   1043   return _mm_cvtsi32_ss(__A, __B);
   1044 }
   1045 
   1046 /* Convert B to a SPFP value and insert it as element zero in A.  */
   1047 /* Intel intrinsic.  */
   1048 extern __inline __m128
   1049     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1050     _mm_cvtsi64_ss(__m128 __A, long long __B) {
   1051   float __temp = __B;
   1052   __A[0] = __temp;
   1053 
   1054   return __A;
   1055 }
   1056 
   1057 /* Microsoft intrinsic.  */
   1058 extern __inline __m128
   1059     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1060     _mm_cvtsi64x_ss(__m128 __A, long long __B) {
   1061   return _mm_cvtsi64_ss(__A, __B);
   1062 }
   1063 
   1064 /* Convert the two 32-bit values in B to SPFP form and insert them
   1065    as the two lower elements in A.  */
   1066 extern __inline __m128
   1067     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1068     _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
   1069   __vector signed int __vm1;
   1070   __vector float __vf1;
   1071 
   1072   __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
   1073   __vf1 = (__vector float)vec_ctf(__vm1, 0);
   1074 
   1075   return ((__m128)(__vector unsigned long long){
   1076       ((__vector unsigned long long)__vf1)[0],
   1077       ((__vector unsigned long long)__A)[1]});
   1078 }
   1079 
   1080 extern __inline __m128
   1081     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1082     _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
   1083   return _mm_cvtpi32_ps(__A, __B);
   1084 }
   1085 
   1086 /* Convert the four signed 16-bit values in A to SPFP form.  */
   1087 extern __inline __m128
   1088     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1089     _mm_cvtpi16_ps(__m64 __A) {
   1090   __vector signed short __vs8;
   1091   __vector signed int __vi4;
   1092   __vector float __vf1;
   1093 
   1094   __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
   1095   __vi4 = vec_vupklsh(__vs8);
   1096   __vf1 = (__vector float)vec_ctf(__vi4, 0);
   1097 
   1098   return (__m128)__vf1;
   1099 }
   1100 
   1101 /* Convert the four unsigned 16-bit values in A to SPFP form.  */
   1102 extern __inline __m128
   1103     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1104     _mm_cvtpu16_ps(__m64 __A) {
   1105   const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
   1106   __vector unsigned short __vs8;
   1107   __vector unsigned int __vi4;
   1108   __vector float __vf1;
   1109 
   1110   __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
   1111   __vi4 = (__vector unsigned int)vec_mergel
   1112 #ifdef __LITTLE_ENDIAN__
   1113       (__vs8, __zero);
   1114 #else
   1115       (__zero, __vs8);
   1116 #endif
   1117   __vf1 = (__vector float)vec_ctf(__vi4, 0);
   1118 
   1119   return (__m128)__vf1;
   1120 }
   1121 
   1122 /* Convert the low four signed 8-bit values in A to SPFP form.  */
   1123 extern __inline __m128
   1124     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1125     _mm_cvtpi8_ps(__m64 __A) {
   1126   __vector signed char __vc16;
   1127   __vector signed short __vs8;
   1128   __vector signed int __vi4;
   1129   __vector float __vf1;
   1130 
   1131   __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
   1132   __vs8 = vec_vupkhsb(__vc16);
   1133   __vi4 = vec_vupkhsh(__vs8);
   1134   __vf1 = (__vector float)vec_ctf(__vi4, 0);
   1135 
   1136   return (__m128)__vf1;
   1137 }
   1138 
   1139 /* Convert the low four unsigned 8-bit values in A to SPFP form.  */
   1140 extern __inline __m128
   1141     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1142 
   1143     _mm_cvtpu8_ps(__m64 __A) {
   1144   const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
   1145   __vector unsigned char __vc16;
   1146   __vector unsigned short __vs8;
   1147   __vector unsigned int __vi4;
   1148   __vector float __vf1;
   1149 
   1150   __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
   1151 #ifdef __LITTLE_ENDIAN__
   1152   __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
   1153   __vi4 =
   1154       (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
   1155 #else
   1156   __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
   1157   __vi4 =
   1158       (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
   1159 #endif
   1160   __vf1 = (__vector float)vec_ctf(__vi4, 0);
   1161 
   1162   return (__m128)__vf1;
   1163 }
   1164 
   1165 /* Convert the four signed 32-bit values in A and B to SPFP form.  */
   1166 extern __inline __m128
   1167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1168     _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
   1169   __vector signed int __vi4;
   1170   __vector float __vf4;
   1171 
   1172   __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
   1173   __vf4 = (__vector float)vec_ctf(__vi4, 0);
   1174   return (__m128)__vf4;
   1175 }
   1176 
   1177 /* Convert the four SPFP values in A to four signed 16-bit integers.  */
   1178 extern __inline __m64
   1179     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1180     _mm_cvtps_pi16(__m128 __A) {
   1181   __v4sf __rounded;
   1182   __vector signed int __temp;
   1183   __vector unsigned long long __result;
   1184 
   1185   __rounded = vec_rint(__A);
   1186   __temp = vec_cts(__rounded, 0);
   1187   __result = (__vector unsigned long long)vec_pack(__temp, __temp);
   1188 
   1189   return (__m64)((__vector long long)__result)[0];
   1190 }
   1191 
   1192 /* Convert the four SPFP values in A to four signed 8-bit integers.  */
   1193 extern __inline __m64
   1194     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1195     _mm_cvtps_pi8(__m128 __A) {
   1196   __v4sf __rounded;
   1197   __vector signed int __tmp_i;
   1198   static const __vector signed int __zero = {0, 0, 0, 0};
   1199   __vector signed short __tmp_s;
   1200   __vector signed char __res_v;
   1201 
   1202   __rounded = vec_rint(__A);
   1203   __tmp_i = vec_cts(__rounded, 0);
   1204   __tmp_s = vec_pack(__tmp_i, __zero);
   1205   __res_v = vec_pack(__tmp_s, __tmp_s);
   1206   return (__m64)((__vector long long)__res_v)[0];
   1207 }
   1208 
   1209 /* Selects four specific SPFP values from A and B based on MASK.  */
   1210 extern __inline __m128
   1211     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1212 
   1213     _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
   1214   unsigned long __element_selector_10 = __mask & 0x03;
   1215   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
   1216   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
   1217   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
   1218   static const unsigned int __permute_selectors[4] = {
   1219 #ifdef __LITTLE_ENDIAN__
   1220       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
   1221 #else
   1222       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
   1223 #endif
   1224   };
   1225   __vector unsigned int __t;
   1226 
   1227   __t[0] = __permute_selectors[__element_selector_10];
   1228   __t[1] = __permute_selectors[__element_selector_32];
   1229   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
   1230   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
   1231   return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
   1232 }
   1233 
   1234 /* Selects and interleaves the upper two SPFP values from A and B.  */
   1235 extern __inline __m128
   1236     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1237     _mm_unpackhi_ps(__m128 __A, __m128 __B) {
   1238   return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
   1239 }
   1240 
   1241 /* Selects and interleaves the lower two SPFP values from A and B.  */
   1242 extern __inline __m128
   1243     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1244     _mm_unpacklo_ps(__m128 __A, __m128 __B) {
   1245   return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
   1246 }
   1247 
   1248 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
   1249    the lower two values are passed through from A.  */
   1250 extern __inline __m128
   1251     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1252     _mm_loadh_pi(__m128 __A, __m64 const *__P) {
   1253   __vector unsigned long long __a = (__vector unsigned long long)__A;
   1254   __vector unsigned long long __p = vec_splats(*__P);
   1255   __a[1] = __p[1];
   1256 
   1257   return (__m128)__a;
   1258 }
   1259 
   1260 /* Stores the upper two SPFP values of A into P.  */
   1261 extern __inline void
   1262     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1263     _mm_storeh_pi(__m64 *__P, __m128 __A) {
   1264   __vector unsigned long long __a = (__vector unsigned long long)__A;
   1265 
   1266   *__P = __a[1];
   1267 }
   1268 
   1269 /* Moves the upper two values of B into the lower two values of A.  */
   1270 extern __inline __m128
   1271     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1272     _mm_movehl_ps(__m128 __A, __m128 __B) {
   1273   return (__m128)vec_mergel((__vector unsigned long long)__B,
   1274                             (__vector unsigned long long)__A);
   1275 }
   1276 
   1277 /* Moves the lower two values of B into the upper two values of A.  */
   1278 extern __inline __m128
   1279     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1280     _mm_movelh_ps(__m128 __A, __m128 __B) {
   1281   return (__m128)vec_mergeh((__vector unsigned long long)__A,
   1282                             (__vector unsigned long long)__B);
   1283 }
   1284 
   1285 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
   1286    the upper two values are passed through from A.  */
   1287 extern __inline __m128
   1288     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1289     _mm_loadl_pi(__m128 __A, __m64 const *__P) {
   1290   __vector unsigned long long __a = (__vector unsigned long long)__A;
   1291   __vector unsigned long long __p = vec_splats(*__P);
   1292   __a[0] = __p[0];
   1293 
   1294   return (__m128)__a;
   1295 }
   1296 
   1297 /* Stores the lower two SPFP values of A into P.  */
   1298 extern __inline void
   1299     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1300     _mm_storel_pi(__m64 *__P, __m128 __A) {
   1301   __vector unsigned long long __a = (__vector unsigned long long)__A;
   1302 
   1303   *__P = __a[0];
   1304 }
   1305 
   1306 #ifdef _ARCH_PWR8
   1307 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
   1308 
   1309 /* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
   1310 extern __inline int
   1311     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1312     _mm_movemask_ps(__m128 __A) {
   1313 #ifdef _ARCH_PWR10
   1314   return vec_extractm((__vector unsigned int)__A);
   1315 #else
   1316   __vector unsigned long long __result;
   1317   static const __vector unsigned int __perm_mask = {
   1318 #ifdef __LITTLE_ENDIAN__
   1319       0x00204060, 0x80808080, 0x80808080, 0x80808080
   1320 #else
   1321       0x80808080, 0x80808080, 0x80808080, 0x00204060
   1322 #endif
   1323   };
   1324 
   1325   __result = ((__vector unsigned long long)vec_vbpermq(
   1326       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
   1327 
   1328 #ifdef __LITTLE_ENDIAN__
   1329   return __result[1];
   1330 #else
   1331   return __result[0];
   1332 #endif
   1333 #endif /* !_ARCH_PWR10 */
   1334 }
   1335 #endif /* _ARCH_PWR8 */
   1336 
   1337 /* Create a vector with all four elements equal to *P.  */
   1338 extern __inline __m128
   1339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1340     _mm_load1_ps(float const *__P) {
   1341   return _mm_set1_ps(*__P);
   1342 }
   1343 
   1344 extern __inline __m128
   1345     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1346     _mm_load_ps1(float const *__P) {
   1347   return _mm_load1_ps(__P);
   1348 }
   1349 
   1350 /* Extracts one of the four words of A.  The selector N must be immediate.  */
   1351 extern __inline int
   1352     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1353     _mm_extract_pi16(__m64 const __A, int const __N) {
   1354   unsigned int __shiftr = __N & 3;
   1355 #ifdef __BIG_ENDIAN__
   1356   __shiftr = 3 - __shiftr;
   1357 #endif
   1358 
   1359   return ((__A >> (__shiftr * 16)) & 0xffff);
   1360 }
   1361 
   1362 extern __inline int
   1363     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1364     _m_pextrw(__m64 const __A, int const __N) {
   1365   return _mm_extract_pi16(__A, __N);
   1366 }
   1367 
   1368 /* Inserts word D into one of four words of A.  The selector N must be
   1369    immediate.  */
   1370 extern __inline __m64
   1371     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1372     _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
   1373   const int __shiftl = (__N & 3) * 16;
   1374   const __m64 __shiftD = (const __m64)__D << __shiftl;
   1375   const __m64 __mask = 0xffffUL << __shiftl;
   1376   __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
   1377 
   1378   return __result;
   1379 }
   1380 
   1381 extern __inline __m64
   1382     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1383     _m_pinsrw(__m64 const __A, int const __D, int const __N) {
   1384   return _mm_insert_pi16(__A, __D, __N);
   1385 }
   1386 
   1387 /* Compute the element-wise maximum of signed 16-bit values.  */
   1388 extern __inline __m64
   1389     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1390 
   1391     _mm_max_pi16(__m64 __A, __m64 __B) {
   1392 #if _ARCH_PWR8
   1393   __vector signed short __a, __b, __r;
   1394   __vector __bool short __c;
   1395 
   1396   __a = (__vector signed short)vec_splats(__A);
   1397   __b = (__vector signed short)vec_splats(__B);
   1398   __c = (__vector __bool short)vec_cmpgt(__a, __b);
   1399   __r = vec_sel(__b, __a, __c);
   1400   return (__m64)((__vector long long)__r)[0];
   1401 #else
   1402   __m64_union __m1, __m2, __res;
   1403 
   1404   __m1.as_m64 = __A;
   1405   __m2.as_m64 = __B;
   1406 
   1407   __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
   1408                                                             : __m2.as_short[0];
   1409   __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
   1410                                                             : __m2.as_short[1];
   1411   __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
   1412                                                             : __m2.as_short[2];
   1413   __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
   1414                                                             : __m2.as_short[3];
   1415 
   1416   return (__m64)__res.as_m64;
   1417 #endif
   1418 }
   1419 
   1420 extern __inline __m64
   1421     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1422     _m_pmaxsw(__m64 __A, __m64 __B) {
   1423   return _mm_max_pi16(__A, __B);
   1424 }
   1425 
   1426 /* Compute the element-wise maximum of unsigned 8-bit values.  */
   1427 extern __inline __m64
   1428     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1429     _mm_max_pu8(__m64 __A, __m64 __B) {
   1430 #if _ARCH_PWR8
   1431   __vector unsigned char __a, __b, __r;
   1432   __vector __bool char __c;
   1433 
   1434   __a = (__vector unsigned char)vec_splats(__A);
   1435   __b = (__vector unsigned char)vec_splats(__B);
   1436   __c = (__vector __bool char)vec_cmpgt(__a, __b);
   1437   __r = vec_sel(__b, __a, __c);
   1438   return (__m64)((__vector long long)__r)[0];
   1439 #else
   1440   __m64_union __m1, __m2, __res;
   1441   long __i;
   1442 
   1443   __m1.as_m64 = __A;
   1444   __m2.as_m64 = __B;
   1445 
   1446   for (__i = 0; __i < 8; __i++)
   1447     __res.as_char[__i] =
   1448         ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
   1449             ? __m1.as_char[__i]
   1450             : __m2.as_char[__i];
   1451 
   1452   return (__m64)__res.as_m64;
   1453 #endif
   1454 }
   1455 
   1456 extern __inline __m64
   1457     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1458     _m_pmaxub(__m64 __A, __m64 __B) {
   1459   return _mm_max_pu8(__A, __B);
   1460 }
   1461 
   1462 /* Compute the element-wise minimum of signed 16-bit values.  */
   1463 extern __inline __m64
   1464     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1465     _mm_min_pi16(__m64 __A, __m64 __B) {
   1466 #if _ARCH_PWR8
   1467   __vector signed short __a, __b, __r;
   1468   __vector __bool short __c;
   1469 
   1470   __a = (__vector signed short)vec_splats(__A);
   1471   __b = (__vector signed short)vec_splats(__B);
   1472   __c = (__vector __bool short)vec_cmplt(__a, __b);
   1473   __r = vec_sel(__b, __a, __c);
   1474   return (__m64)((__vector long long)__r)[0];
   1475 #else
   1476   __m64_union __m1, __m2, __res;
   1477 
   1478   __m1.as_m64 = __A;
   1479   __m2.as_m64 = __B;
   1480 
   1481   __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
   1482                                                             : __m2.as_short[0];
   1483   __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
   1484                                                             : __m2.as_short[1];
   1485   __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
   1486                                                             : __m2.as_short[2];
   1487   __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
   1488                                                             : __m2.as_short[3];
   1489 
   1490   return (__m64)__res.as_m64;
   1491 #endif
   1492 }
   1493 
   1494 extern __inline __m64
   1495     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1496     _m_pminsw(__m64 __A, __m64 __B) {
   1497   return _mm_min_pi16(__A, __B);
   1498 }
   1499 
   1500 /* Compute the element-wise minimum of unsigned 8-bit values.  */
   1501 extern __inline __m64
   1502     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1503     _mm_min_pu8(__m64 __A, __m64 __B) {
   1504 #if _ARCH_PWR8
   1505   __vector unsigned char __a, __b, __r;
   1506   __vector __bool char __c;
   1507 
   1508   __a = (__vector unsigned char)vec_splats(__A);
   1509   __b = (__vector unsigned char)vec_splats(__B);
   1510   __c = (__vector __bool char)vec_cmplt(__a, __b);
   1511   __r = vec_sel(__b, __a, __c);
   1512   return (__m64)((__vector long long)__r)[0];
   1513 #else
   1514   __m64_union __m1, __m2, __res;
   1515   long __i;
   1516 
   1517   __m1.as_m64 = __A;
   1518   __m2.as_m64 = __B;
   1519 
   1520   for (__i = 0; __i < 8; __i++)
   1521     __res.as_char[__i] =
   1522         ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
   1523             ? __m1.as_char[__i]
   1524             : __m2.as_char[__i];
   1525 
   1526   return (__m64)__res.as_m64;
   1527 #endif
   1528 }
   1529 
   1530 extern __inline __m64
   1531     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1532     _m_pminub(__m64 __A, __m64 __B) {
   1533   return _mm_min_pu8(__A, __B);
   1534 }
   1535 
   1536 /* Create an 8-bit mask of the signs of 8-bit values.  */
   1537 extern __inline int
   1538     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1539     _mm_movemask_pi8(__m64 __A) {
   1540 #ifdef __powerpc64__
   1541   unsigned long long __p =
   1542 #ifdef __LITTLE_ENDIAN__
   1543       0x0008101820283038UL; // permute control for sign bits
   1544 #else
   1545       0x3830282018100800UL; // permute control for sign bits
   1546 #endif
   1547   return __builtin_bpermd(__p, __A);
   1548 #else
   1549 #ifdef __LITTLE_ENDIAN__
   1550   unsigned int __mask = 0x20283038UL;
   1551   unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
   1552   unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
   1553 #else
   1554   unsigned int __mask = 0x38302820UL;
   1555   unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
   1556   unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
   1557 #endif
   1558   return (__r2 << 4) | __r1;
   1559 #endif
   1560 }
   1561 
   1562 extern __inline int
   1563     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1564     _m_pmovmskb(__m64 __A) {
   1565   return _mm_movemask_pi8(__A);
   1566 }
   1567 
   1568 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   1569    in B and produce the high 16 bits of the 32-bit results.  */
   1570 extern __inline __m64
   1571     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1572     _mm_mulhi_pu16(__m64 __A, __m64 __B) {
   1573   __vector unsigned short __a, __b;
   1574   __vector unsigned short __c;
   1575   __vector unsigned int __w0, __w1;
   1576   __vector unsigned char __xform1 = {
   1577 #ifdef __LITTLE_ENDIAN__
   1578       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
   1579       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
   1580 #else
   1581       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
   1582       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
   1583 #endif
   1584   };
   1585 
   1586   __a = (__vector unsigned short)vec_splats(__A);
   1587   __b = (__vector unsigned short)vec_splats(__B);
   1588 
   1589   __w0 = vec_vmuleuh(__a, __b);
   1590   __w1 = vec_vmulouh(__a, __b);
   1591   __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
   1592 
   1593   return (__m64)((__vector long long)__c)[0];
   1594 }
   1595 
   1596 extern __inline __m64
   1597     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1598     _m_pmulhuw(__m64 __A, __m64 __B) {
   1599   return _mm_mulhi_pu16(__A, __B);
   1600 }
   1601 
   1602 /* Return a combination of the four 16-bit values in A.  The selector
   1603    must be an immediate.  */
   1604 extern __inline __m64
   1605     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1606     _mm_shuffle_pi16(__m64 __A, int const __N) {
   1607   unsigned long __element_selector_10 = __N & 0x03;
   1608   unsigned long __element_selector_32 = (__N >> 2) & 0x03;
   1609   unsigned long __element_selector_54 = (__N >> 4) & 0x03;
   1610   unsigned long __element_selector_76 = (__N >> 6) & 0x03;
   1611   static const unsigned short __permute_selectors[4] = {
   1612 #ifdef __LITTLE_ENDIAN__
   1613       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
   1614 #else
   1615       0x0607, 0x0405, 0x0203, 0x0001
   1616 #endif
   1617   };
   1618   __m64_union __t;
   1619   __vector unsigned long long __a, __p, __r;
   1620 
   1621 #ifdef __LITTLE_ENDIAN__
   1622   __t.as_short[0] = __permute_selectors[__element_selector_10];
   1623   __t.as_short[1] = __permute_selectors[__element_selector_32];
   1624   __t.as_short[2] = __permute_selectors[__element_selector_54];
   1625   __t.as_short[3] = __permute_selectors[__element_selector_76];
   1626 #else
   1627   __t.as_short[3] = __permute_selectors[__element_selector_10];
   1628   __t.as_short[2] = __permute_selectors[__element_selector_32];
   1629   __t.as_short[1] = __permute_selectors[__element_selector_54];
   1630   __t.as_short[0] = __permute_selectors[__element_selector_76];
   1631 #endif
   1632   __p = vec_splats(__t.as_m64);
   1633   __a = vec_splats(__A);
   1634   __r = vec_perm(__a, __a, (__vector unsigned char)__p);
   1635   return (__m64)((__vector long long)__r)[0];
   1636 }
   1637 
   1638 extern __inline __m64
   1639     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1640     _m_pshufw(__m64 __A, int const __N) {
   1641   return _mm_shuffle_pi16(__A, __N);
   1642 }
   1643 
   1644 /* Conditionally store byte elements of A into P.  The high bit of each
   1645    byte in the selector N determines whether the corresponding byte from
   1646    A is stored.  */
   1647 extern __inline void
   1648     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1649     _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
   1650   __m64 __hibit = 0x8080808080808080UL;
   1651   __m64 __mask, __tmp;
   1652   __m64 *__p = (__m64 *)__P;
   1653 
   1654   __tmp = *__p;
   1655   __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
   1656   __tmp = (__tmp & (~__mask)) | (__A & __mask);
   1657   *__p = __tmp;
   1658 }
   1659 
   1660 extern __inline void
   1661     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1662     _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
   1663   _mm_maskmove_si64(__A, __N, __P);
   1664 }
   1665 
   1666 /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
   1667 extern __inline __m64
   1668     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1669     _mm_avg_pu8(__m64 __A, __m64 __B) {
   1670   __vector unsigned char __a, __b, __c;
   1671 
   1672   __a = (__vector unsigned char)vec_splats(__A);
   1673   __b = (__vector unsigned char)vec_splats(__B);
   1674   __c = vec_avg(__a, __b);
   1675   return (__m64)((__vector long long)__c)[0];
   1676 }
   1677 
   1678 extern __inline __m64
   1679     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1680     _m_pavgb(__m64 __A, __m64 __B) {
   1681   return _mm_avg_pu8(__A, __B);
   1682 }
   1683 
   1684 /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
   1685 extern __inline __m64
   1686     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1687     _mm_avg_pu16(__m64 __A, __m64 __B) {
   1688   __vector unsigned short __a, __b, __c;
   1689 
   1690   __a = (__vector unsigned short)vec_splats(__A);
   1691   __b = (__vector unsigned short)vec_splats(__B);
   1692   __c = vec_avg(__a, __b);
   1693   return (__m64)((__vector long long)__c)[0];
   1694 }
   1695 
   1696 extern __inline __m64
   1697     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1698     _m_pavgw(__m64 __A, __m64 __B) {
   1699   return _mm_avg_pu16(__A, __B);
   1700 }
   1701 
   1702 /* Compute the sum of the absolute differences of the unsigned 8-bit
   1703    values in A and B.  Return the value in the lower 16-bit word; the
   1704    upper words are cleared.  */
   1705 extern __inline __m64
   1706     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1707     _mm_sad_pu8(__m64 __A, __m64 __B) {
   1708   __vector unsigned char __a, __b;
   1709   __vector unsigned char __vmin, __vmax, __vabsdiff;
   1710   __vector signed int __vsum;
   1711   const __vector unsigned int __zero = {0, 0, 0, 0};
   1712   __m64_union __result = {0};
   1713 
   1714   __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
   1715   __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
   1716   __vmin = vec_min(__a, __b);
   1717   __vmax = vec_max(__a, __b);
   1718   __vabsdiff = vec_sub(__vmax, __vmin);
   1719   /* Sum four groups of bytes into integers.  */
   1720   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
   1721   /* Sum across four integers with integer result.  */
   1722   __vsum = vec_sums(__vsum, (__vector signed int)__zero);
   1723   /* The sum is in the right most 32-bits of the vector result.
   1724      Transfer to a GPR and truncate to 16 bits.  */
   1725   __result.as_short[0] = __vsum[3];
   1726   return __result.as_m64;
   1727 }
   1728 
   1729 extern __inline __m64
   1730     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1731     _m_psadbw(__m64 __A, __m64 __B) {
   1732   return _mm_sad_pu8(__A, __B);
   1733 }
   1734 
   1735 /* Stores the data in A to the address P without polluting the caches.  */
   1736 extern __inline void
   1737     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1738     _mm_stream_pi(__m64 *__P, __m64 __A) {
   1739   /* Use the data cache block touch for store transient.  */
   1740   __asm__("	dcbtstt	0,%0" : : "b"(__P) : "memory");
   1741   *__P = __A;
   1742 }
   1743 
   1744 /* Likewise.  The address must be 16-byte aligned.  */
   1745 extern __inline void
   1746     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1747     _mm_stream_ps(float *__P, __m128 __A) {
   1748   /* Use the data cache block touch for store transient.  */
   1749   __asm__("	dcbtstt	0,%0" : : "b"(__P) : "memory");
   1750   _mm_store_ps(__P, __A);
   1751 }
   1752 
   1753 /* Guarantees that every preceding store is globally visible before
   1754    any subsequent store.  */
   1755 extern __inline void
   1756     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1757     _mm_sfence(void) {
   1758   /* Generate a light weight sync.  */
   1759   __atomic_thread_fence(__ATOMIC_RELEASE);
   1760 }
   1761 
   1762 /* The execution of the next instruction is delayed by an implementation
   1763    specific amount of time.  The instruction does not modify the
   1764    architectural state.  This is after the pop_options pragma because
   1765    it does not require SSE support in the processor--the encoding is a
   1766    nop on processors that do not support it.  */
   1767 extern __inline void
   1768     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1769     _mm_pause(void) {
   1770   /* There is no exact match with this construct, but the following is
   1771      close to the desired effect.  */
   1772 #if _ARCH_PWR8
   1773   /* On power8 and later processors we can depend on Program Priority
   1774      (PRI) and associated "very low" PPI setting.  Since we don't know
   1775      what PPI this thread is running at we: 1) save the current PRI
   1776      from the PPR SPR into a local GRP, 2) set the PRI to "very low*
   1777      via the special or 31,31,31 encoding. 3) issue an "isync" to
   1778      insure the PRI change takes effect before we execute any more
   1779      instructions.
   1780      Now we can execute a lwsync (release barrier) while we execute
   1781      this thread at "very low" PRI.  Finally we restore the original
   1782      PRI and continue execution.  */
   1783   unsigned long __PPR;
   1784 
   1785   __asm__ volatile("	mfppr	%0;"
   1786                    "   or 31,31,31;"
   1787                    "   isync;"
   1788                    "   lwsync;"
   1789                    "   isync;"
   1790                    "   mtppr	%0;"
   1791                    : "=r"(__PPR)
   1792                    :
   1793                    : "memory");
   1794 #else
   1795   /* For older processor where we may not even have Program Priority
   1796      controls we can only depend on Heavy Weight Sync.  */
   1797   __atomic_thread_fence(__ATOMIC_SEQ_CST);
   1798 #endif
   1799 }
   1800 
   1801 /* Transpose the 4x4 matrix composed of row[0-3].  */
   1802 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
   1803   do {                                                                         \
   1804     __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);         \
   1805     __v4sf __t0 = vec_vmrghw(__r0, __r1);                                      \
   1806     __v4sf __t1 = vec_vmrghw(__r2, __r3);                                      \
   1807     __v4sf __t2 = vec_vmrglw(__r0, __r1);                                      \
   1808     __v4sf __t3 = vec_vmrglw(__r2, __r3);                                      \
   1809     (row0) = (__v4sf)vec_mergeh((__vector long long)__t0,                      \
   1810                                 (__vector long long)__t1);                     \
   1811     (row1) = (__v4sf)vec_mergel((__vector long long)__t0,                      \
   1812                                 (__vector long long)__t1);                     \
   1813     (row2) = (__v4sf)vec_mergeh((__vector long long)__t2,                      \
   1814                                 (__vector long long)__t3);                     \
   1815     (row3) = (__v4sf)vec_mergel((__vector long long)__t2,                      \
   1816                                 (__vector long long)__t3);                     \
   1817   } while (0)
   1818 
   1819 /* For backward source compatibility.  */
   1820 //# include <emmintrin.h>
   1821 
   1822 #else
   1823 #include_next <xmmintrin.h>
   1824 #endif /* defined(__powerpc64__) &&                                            \
   1825         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
   1826 
   1827 #endif /* XMMINTRIN_H_ */