zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

mmintrin.h (45763B) - Raw


      1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 /* Implemented from the specification included in the Intel C++ Compiler
     11    User Guide and Reference, version 9.0.  */
     12 
     13 #ifndef NO_WARN_X86_INTRINSICS
     14 /* This header file is to help porting code using Intel intrinsics
     15    explicitly from x86_64 to powerpc64/powerpc64le.
     16 
     17    Since PowerPC target doesn't support native 64-bit vector type, we
     18    typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
     19    works well for _si64 and some _pi32 operations.
     20 
     21    For _pi16 and _pi8 operations, it's better to transfer __m64 into
     22    128-bit PowerPC vector first. Power8 introduced direct register
     23    move instructions which helps for more efficient implementation.
     24 
     25    It's user's responsibility to determine if the results of such port
     26    are acceptable or further changes are needed. Please note that much
     27    code using Intel intrinsics CAN BE REWRITTEN in more portable and
     28    efficient standard C or GNU C extensions with 64-bit scalar
     29    operations, or 128-bit SSE/Altivec operations, which are more
     30    recommended. */
     31 #error                                                                         \
     32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
     33 #endif
     34 
     35 #ifndef _MMINTRIN_H_INCLUDED
     36 #define _MMINTRIN_H_INCLUDED
     37 
     38 #if defined(__powerpc64__) &&                                                  \
     39     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
     40 
     41 #include <altivec.h>
     42 /* The Intel API is flexible enough that we must allow aliasing with other
     43    vector types, and their scalar components.  */
     44 typedef __attribute__((__aligned__(8))) unsigned long long __m64;
     45 
     46 typedef __attribute__((__aligned__(8))) union {
     47   __m64 as_m64;
     48   char as_char[8];
     49   signed char as_signed_char[8];
     50   short as_short[4];
     51   int as_int[2];
     52   long long as_long_long;
     53   float as_float[2];
     54   double as_double;
     55 } __m64_union;
     56 
     57 /* Empty the multimedia state.  */
     58 extern __inline void
     59     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     60     _mm_empty(void) {
     61   /* nothing to do on PowerPC.  */
     62 }
     63 
     64 extern __inline void
     65     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     66     _m_empty(void) {
     67   /* nothing to do on PowerPC.  */
     68 }
     69 
     70 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
     71 extern __inline __m64
     72     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     73     _mm_cvtsi32_si64(int __i) {
     74   return (__m64)(unsigned int)__i;
     75 }
     76 
     77 extern __inline __m64
     78     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     79     _m_from_int(int __i) {
     80   return _mm_cvtsi32_si64(__i);
     81 }
     82 
     83 /* Convert the lower 32 bits of the __m64 object into an integer.  */
     84 extern __inline int
     85     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     86     _mm_cvtsi64_si32(__m64 __i) {
     87   return ((int)__i);
     88 }
     89 
     90 extern __inline int
     91     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     92     _m_to_int(__m64 __i) {
     93   return _mm_cvtsi64_si32(__i);
     94 }
     95 
     96 /* Convert I to a __m64 object.  */
     97 
     98 /* Intel intrinsic.  */
     99 extern __inline __m64
    100     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    101     _m_from_int64(long long __i) {
    102   return (__m64)__i;
    103 }
    104 
    105 extern __inline __m64
    106     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    107     _mm_cvtsi64_m64(long long __i) {
    108   return (__m64)__i;
    109 }
    110 
    111 /* Microsoft intrinsic.  */
    112 extern __inline __m64
    113     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    114     _mm_cvtsi64x_si64(long long __i) {
    115   return (__m64)__i;
    116 }
    117 
    118 extern __inline __m64
    119     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    120     _mm_set_pi64x(long long __i) {
    121   return (__m64)__i;
    122 }
    123 
    124 /* Convert the __m64 object to a 64bit integer.  */
    125 
    126 /* Intel intrinsic.  */
    127 extern __inline long long
    128     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    129     _m_to_int64(__m64 __i) {
    130   return (long long)__i;
    131 }
    132 
    133 extern __inline long long
    134     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    135     _mm_cvtm64_si64(__m64 __i) {
    136   return (long long)__i;
    137 }
    138 
    139 /* Microsoft intrinsic.  */
    140 extern __inline long long
    141     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    142     _mm_cvtsi64_si64x(__m64 __i) {
    143   return (long long)__i;
    144 }
    145 
    146 #ifdef _ARCH_PWR8
    147 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
    148    the result, and the four 16-bit values from M2 into the upper four 8-bit
    149    values of the result, all with signed saturation.  */
    150 extern __inline __m64
    151     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    152     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
    153   __vector signed short __vm1;
    154   __vector signed char __vresult;
    155 
    156   __vm1 = (__vector signed short)(__vector unsigned long long)
    157 #ifdef __LITTLE_ENDIAN__
    158       {__m1, __m2};
    159 #else
    160       {__m2, __m1};
    161 #endif
    162   __vresult = vec_packs(__vm1, __vm1);
    163   return (__m64)((__vector long long)__vresult)[0];
    164 }
    165 
    166 extern __inline __m64
    167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    168     _m_packsswb(__m64 __m1, __m64 __m2) {
    169   return _mm_packs_pi16(__m1, __m2);
    170 }
    171 
    172 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
    173    the result, and the two 32-bit values from M2 into the upper two 16-bit
    174    values of the result, all with signed saturation.  */
    175 extern __inline __m64
    176     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    177     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
    178   __vector signed int __vm1;
    179   __vector signed short __vresult;
    180 
    181   __vm1 = (__vector signed int)(__vector unsigned long long)
    182 #ifdef __LITTLE_ENDIAN__
    183       {__m1, __m2};
    184 #else
    185       {__m2, __m1};
    186 #endif
    187   __vresult = vec_packs(__vm1, __vm1);
    188   return (__m64)((__vector long long)__vresult)[0];
    189 }
    190 
    191 extern __inline __m64
    192     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    193     _m_packssdw(__m64 __m1, __m64 __m2) {
    194   return _mm_packs_pi32(__m1, __m2);
    195 }
    196 
    197 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
    198    the result, and the four 16-bit values from M2 into the upper four 8-bit
    199    values of the result, all with unsigned saturation.  */
    200 extern __inline __m64
    201     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    202     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
    203   __vector unsigned char __r;
    204   __vector signed short __vm1 = (__vector signed short)(__vector long long)
    205 #ifdef __LITTLE_ENDIAN__
    206       {__m1, __m2};
    207 #else
    208       {__m2, __m1};
    209 #endif
    210   const __vector signed short __zero = {0};
    211   __vector __bool short __select = vec_cmplt(__vm1, __zero);
    212   __r =
    213       vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
    214   __vector __bool char __packsel = vec_pack(__select, __select);
    215   __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
    216   return (__m64)((__vector long long)__r)[0];
    217 }
    218 
    219 extern __inline __m64
    220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    221     _m_packuswb(__m64 __m1, __m64 __m2) {
    222   return _mm_packs_pu16(__m1, __m2);
    223 }
    224 #endif /* end ARCH_PWR8 */
    225 
    226 /* Interleave the four 8-bit values from the high half of M1 with the four
    227    8-bit values from the high half of M2.  */
    228 extern __inline __m64
    229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    230     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
    231 #if _ARCH_PWR8
    232   __vector unsigned char __a, __b, __c;
    233 
    234   __a = (__vector unsigned char)vec_splats(__m1);
    235   __b = (__vector unsigned char)vec_splats(__m2);
    236   __c = vec_mergel(__a, __b);
    237   return (__m64)((__vector long long)__c)[1];
    238 #else
    239   __m64_union __mu1, __mu2, __res;
    240 
    241   __mu1.as_m64 = __m1;
    242   __mu2.as_m64 = __m2;
    243 
    244   __res.as_char[0] = __mu1.as_char[4];
    245   __res.as_char[1] = __mu2.as_char[4];
    246   __res.as_char[2] = __mu1.as_char[5];
    247   __res.as_char[3] = __mu2.as_char[5];
    248   __res.as_char[4] = __mu1.as_char[6];
    249   __res.as_char[5] = __mu2.as_char[6];
    250   __res.as_char[6] = __mu1.as_char[7];
    251   __res.as_char[7] = __mu2.as_char[7];
    252 
    253   return (__m64)__res.as_m64;
    254 #endif
    255 }
    256 
    257 extern __inline __m64
    258     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    259     _m_punpckhbw(__m64 __m1, __m64 __m2) {
    260   return _mm_unpackhi_pi8(__m1, __m2);
    261 }
    262 
    263 /* Interleave the two 16-bit values from the high half of M1 with the two
    264    16-bit values from the high half of M2.  */
    265 extern __inline __m64
    266     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    267     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
    268   __m64_union __mu1, __mu2, __res;
    269 
    270   __mu1.as_m64 = __m1;
    271   __mu2.as_m64 = __m2;
    272 
    273   __res.as_short[0] = __mu1.as_short[2];
    274   __res.as_short[1] = __mu2.as_short[2];
    275   __res.as_short[2] = __mu1.as_short[3];
    276   __res.as_short[3] = __mu2.as_short[3];
    277 
    278   return (__m64)__res.as_m64;
    279 }
    280 
    281 extern __inline __m64
    282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    283     _m_punpckhwd(__m64 __m1, __m64 __m2) {
    284   return _mm_unpackhi_pi16(__m1, __m2);
    285 }
    286 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
    287    value from the high half of M2.  */
    288 extern __inline __m64
    289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    290     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
    291   __m64_union __mu1, __mu2, __res;
    292 
    293   __mu1.as_m64 = __m1;
    294   __mu2.as_m64 = __m2;
    295 
    296   __res.as_int[0] = __mu1.as_int[1];
    297   __res.as_int[1] = __mu2.as_int[1];
    298 
    299   return (__m64)__res.as_m64;
    300 }
    301 
    302 extern __inline __m64
    303     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    304     _m_punpckhdq(__m64 __m1, __m64 __m2) {
    305   return _mm_unpackhi_pi32(__m1, __m2);
    306 }
    307 /* Interleave the four 8-bit values from the low half of M1 with the four
    308    8-bit values from the low half of M2.  */
    309 extern __inline __m64
    310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    311     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
    312 #if _ARCH_PWR8
    313   __vector unsigned char __a, __b, __c;
    314 
    315   __a = (__vector unsigned char)vec_splats(__m1);
    316   __b = (__vector unsigned char)vec_splats(__m2);
    317   __c = vec_mergel(__a, __b);
    318   return (__m64)((__vector long long)__c)[0];
    319 #else
    320   __m64_union __mu1, __mu2, __res;
    321 
    322   __mu1.as_m64 = __m1;
    323   __mu2.as_m64 = __m2;
    324 
    325   __res.as_char[0] = __mu1.as_char[0];
    326   __res.as_char[1] = __mu2.as_char[0];
    327   __res.as_char[2] = __mu1.as_char[1];
    328   __res.as_char[3] = __mu2.as_char[1];
    329   __res.as_char[4] = __mu1.as_char[2];
    330   __res.as_char[5] = __mu2.as_char[2];
    331   __res.as_char[6] = __mu1.as_char[3];
    332   __res.as_char[7] = __mu2.as_char[3];
    333 
    334   return (__m64)__res.as_m64;
    335 #endif
    336 }
    337 
    338 extern __inline __m64
    339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    340     _m_punpcklbw(__m64 __m1, __m64 __m2) {
    341   return _mm_unpacklo_pi8(__m1, __m2);
    342 }
    343 /* Interleave the two 16-bit values from the low half of M1 with the two
    344    16-bit values from the low half of M2.  */
    345 extern __inline __m64
    346     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    347     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
    348   __m64_union __mu1, __mu2, __res;
    349 
    350   __mu1.as_m64 = __m1;
    351   __mu2.as_m64 = __m2;
    352 
    353   __res.as_short[0] = __mu1.as_short[0];
    354   __res.as_short[1] = __mu2.as_short[0];
    355   __res.as_short[2] = __mu1.as_short[1];
    356   __res.as_short[3] = __mu2.as_short[1];
    357 
    358   return (__m64)__res.as_m64;
    359 }
    360 
    361 extern __inline __m64
    362     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    363     _m_punpcklwd(__m64 __m1, __m64 __m2) {
    364   return _mm_unpacklo_pi16(__m1, __m2);
    365 }
    366 
    367 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
    368    value from the low half of M2.  */
    369 extern __inline __m64
    370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    371     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
    372   __m64_union __mu1, __mu2, __res;
    373 
    374   __mu1.as_m64 = __m1;
    375   __mu2.as_m64 = __m2;
    376 
    377   __res.as_int[0] = __mu1.as_int[0];
    378   __res.as_int[1] = __mu2.as_int[0];
    379 
    380   return (__m64)__res.as_m64;
    381 }
    382 
    383 extern __inline __m64
    384     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    385     _m_punpckldq(__m64 __m1, __m64 __m2) {
    386   return _mm_unpacklo_pi32(__m1, __m2);
    387 }
    388 
    389 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
    390 extern __inline __m64
    391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    392     _mm_add_pi8(__m64 __m1, __m64 __m2) {
    393 #if _ARCH_PWR8
    394   __vector signed char __a, __b, __c;
    395 
    396   __a = (__vector signed char)vec_splats(__m1);
    397   __b = (__vector signed char)vec_splats(__m2);
    398   __c = vec_add(__a, __b);
    399   return (__m64)((__vector long long)__c)[0];
    400 #else
    401   __m64_union __mu1, __mu2, __res;
    402 
    403   __mu1.as_m64 = __m1;
    404   __mu2.as_m64 = __m2;
    405 
    406   __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
    407   __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
    408   __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
    409   __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
    410   __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
    411   __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
    412   __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
    413   __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
    414 
    415   return (__m64)__res.as_m64;
    416 #endif
    417 }
    418 
    419 extern __inline __m64
    420     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    421     _m_paddb(__m64 __m1, __m64 __m2) {
    422   return _mm_add_pi8(__m1, __m2);
    423 }
    424 
    425 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
    426 extern __inline __m64
    427     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    428     _mm_add_pi16(__m64 __m1, __m64 __m2) {
    429 #if _ARCH_PWR8
    430   __vector signed short __a, __b, __c;
    431 
    432   __a = (__vector signed short)vec_splats(__m1);
    433   __b = (__vector signed short)vec_splats(__m2);
    434   __c = vec_add(__a, __b);
    435   return (__m64)((__vector long long)__c)[0];
    436 #else
    437   __m64_union __mu1, __mu2, __res;
    438 
    439   __mu1.as_m64 = __m1;
    440   __mu2.as_m64 = __m2;
    441 
    442   __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
    443   __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
    444   __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
    445   __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
    446 
    447   return (__m64)__res.as_m64;
    448 #endif
    449 }
    450 
    451 extern __inline __m64
    452     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    453     _m_paddw(__m64 __m1, __m64 __m2) {
    454   return _mm_add_pi16(__m1, __m2);
    455 }
    456 
    457 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
    458 extern __inline __m64
    459     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    460     _mm_add_pi32(__m64 __m1, __m64 __m2) {
    461 #if _ARCH_PWR9
    462   __vector signed int __a, __b, __c;
    463 
    464   __a = (__vector signed int)vec_splats(__m1);
    465   __b = (__vector signed int)vec_splats(__m2);
    466   __c = vec_add(__a, __b);
    467   return (__m64)((__vector long long)__c)[0];
    468 #else
    469   __m64_union __mu1, __mu2, __res;
    470 
    471   __mu1.as_m64 = __m1;
    472   __mu2.as_m64 = __m2;
    473 
    474   __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
    475   __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
    476 
    477   return (__m64)__res.as_m64;
    478 #endif
    479 }
    480 
    481 extern __inline __m64
    482     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    483     _m_paddd(__m64 __m1, __m64 __m2) {
    484   return _mm_add_pi32(__m1, __m2);
    485 }
    486 
    487 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
    488 extern __inline __m64
    489     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    490     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
    491 #if _ARCH_PWR8
    492   __vector signed char __a, __b, __c;
    493 
    494   __a = (__vector signed char)vec_splats(__m1);
    495   __b = (__vector signed char)vec_splats(__m2);
    496   __c = vec_sub(__a, __b);
    497   return (__m64)((__vector long long)__c)[0];
    498 #else
    499   __m64_union __mu1, __mu2, __res;
    500 
    501   __mu1.as_m64 = __m1;
    502   __mu2.as_m64 = __m2;
    503 
    504   __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
    505   __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
    506   __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
    507   __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
    508   __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
    509   __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
    510   __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
    511   __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
    512 
    513   return (__m64)__res.as_m64;
    514 #endif
    515 }
    516 
    517 extern __inline __m64
    518     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    519     _m_psubb(__m64 __m1, __m64 __m2) {
    520   return _mm_sub_pi8(__m1, __m2);
    521 }
    522 
    523 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
    524 extern __inline __m64
    525     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    526     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
    527 #if _ARCH_PWR8
    528   __vector signed short __a, __b, __c;
    529 
    530   __a = (__vector signed short)vec_splats(__m1);
    531   __b = (__vector signed short)vec_splats(__m2);
    532   __c = vec_sub(__a, __b);
    533   return (__m64)((__vector long long)__c)[0];
    534 #else
    535   __m64_union __mu1, __mu2, __res;
    536 
    537   __mu1.as_m64 = __m1;
    538   __mu2.as_m64 = __m2;
    539 
    540   __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
    541   __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
    542   __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
    543   __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
    544 
    545   return (__m64)__res.as_m64;
    546 #endif
    547 }
    548 
    549 extern __inline __m64
    550     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    551     _m_psubw(__m64 __m1, __m64 __m2) {
    552   return _mm_sub_pi16(__m1, __m2);
    553 }
    554 
    555 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
    556 extern __inline __m64
    557     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    558     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
    559 #if _ARCH_PWR9
    560   __vector signed int __a, __b, __c;
    561 
    562   __a = (__vector signed int)vec_splats(__m1);
    563   __b = (__vector signed int)vec_splats(__m2);
    564   __c = vec_sub(__a, __b);
    565   return (__m64)((__vector long long)__c)[0];
    566 #else
    567   __m64_union __mu1, __mu2, __res;
    568 
    569   __mu1.as_m64 = __m1;
    570   __mu2.as_m64 = __m2;
    571 
    572   __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
    573   __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
    574 
    575   return (__m64)__res.as_m64;
    576 #endif
    577 }
    578 
    579 extern __inline __m64
    580     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    581     _m_psubd(__m64 __m1, __m64 __m2) {
    582   return _mm_sub_pi32(__m1, __m2);
    583 }
    584 
    585 extern __inline __m64
    586     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    587     _mm_add_si64(__m64 __m1, __m64 __m2) {
    588   return (__m1 + __m2);
    589 }
    590 
    591 extern __inline __m64
    592     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    593     _mm_sub_si64(__m64 __m1, __m64 __m2) {
    594   return (__m1 - __m2);
    595 }
    596 
    597 /* Shift the 64-bit value in M left by COUNT.  */
    598 extern __inline __m64
    599     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    600     _mm_sll_si64(__m64 __m, __m64 __count) {
    601   return (__m << __count);
    602 }
    603 
    604 extern __inline __m64
    605     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    606     _m_psllq(__m64 __m, __m64 __count) {
    607   return _mm_sll_si64(__m, __count);
    608 }
    609 
    610 extern __inline __m64
    611     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    612     _mm_slli_si64(__m64 __m, const int __count) {
    613   return (__m << __count);
    614 }
    615 
    616 extern __inline __m64
    617     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    618     _m_psllqi(__m64 __m, const int __count) {
    619   return _mm_slli_si64(__m, __count);
    620 }
    621 
    622 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
    623 extern __inline __m64
    624     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    625     _mm_srl_si64(__m64 __m, __m64 __count) {
    626   return (__m >> __count);
    627 }
    628 
    629 extern __inline __m64
    630     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    631     _m_psrlq(__m64 __m, __m64 __count) {
    632   return _mm_srl_si64(__m, __count);
    633 }
    634 
    635 extern __inline __m64
    636     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    637     _mm_srli_si64(__m64 __m, const int __count) {
    638   return (__m >> __count);
    639 }
    640 
    641 extern __inline __m64
    642     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    643     _m_psrlqi(__m64 __m, const int __count) {
    644   return _mm_srli_si64(__m, __count);
    645 }
    646 
    647 /* Bit-wise AND the 64-bit values in M1 and M2.  */
    648 extern __inline __m64
    649     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    650     _mm_and_si64(__m64 __m1, __m64 __m2) {
    651   return (__m1 & __m2);
    652 }
    653 
    654 extern __inline __m64
    655     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    656     _m_pand(__m64 __m1, __m64 __m2) {
    657   return _mm_and_si64(__m1, __m2);
    658 }
    659 
    660 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
    661    64-bit value in M2.  */
    662 extern __inline __m64
    663     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    664     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
    665   return (~__m1 & __m2);
    666 }
    667 
    668 extern __inline __m64
    669     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    670     _m_pandn(__m64 __m1, __m64 __m2) {
    671   return _mm_andnot_si64(__m1, __m2);
    672 }
    673 
    674 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
    675 extern __inline __m64
    676     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    677     _mm_or_si64(__m64 __m1, __m64 __m2) {
    678   return (__m1 | __m2);
    679 }
    680 
    681 extern __inline __m64
    682     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    683     _m_por(__m64 __m1, __m64 __m2) {
    684   return _mm_or_si64(__m1, __m2);
    685 }
    686 
    687 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
    688 extern __inline __m64
    689     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    690     _mm_xor_si64(__m64 __m1, __m64 __m2) {
    691   return (__m1 ^ __m2);
    692 }
    693 
    694 extern __inline __m64
    695     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    696     _m_pxor(__m64 __m1, __m64 __m2) {
    697   return _mm_xor_si64(__m1, __m2);
    698 }
    699 
    700 /* Creates a 64-bit zero.  */
    701 extern __inline __m64
    702     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    703     _mm_setzero_si64(void) {
    704   return (__m64)0;
    705 }
    706 
    707 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
    708    test is true and zero if false.  */
    709 extern __inline __m64
    710     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    711     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
    712 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
    713   __m64 __res;
    714   __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
    715   return (__res);
    716 #else
    717   __m64_union __mu1, __mu2, __res;
    718 
    719   __mu1.as_m64 = __m1;
    720   __mu2.as_m64 = __m2;
    721 
    722   __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
    723   __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
    724   __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
    725   __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
    726   __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
    727   __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
    728   __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
    729   __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;
    730 
    731   return (__m64)__res.as_m64;
    732 #endif
    733 }
    734 
    735 extern __inline __m64
    736     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    737     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
    738   return _mm_cmpeq_pi8(__m1, __m2);
    739 }
    740 
    741 extern __inline __m64
    742     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    743     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
    744 #if _ARCH_PWR8
    745   __vector signed char __a, __b, __c;
    746 
    747   __a = (__vector signed char)vec_splats(__m1);
    748   __b = (__vector signed char)vec_splats(__m2);
    749   __c = (__vector signed char)vec_cmpgt(__a, __b);
    750   return (__m64)((__vector long long)__c)[0];
    751 #else
    752   __m64_union __mu1, __mu2, __res;
    753 
    754   __mu1.as_m64 = __m1;
    755   __mu2.as_m64 = __m2;
    756 
    757   __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
    758   __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
    759   __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
    760   __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
    761   __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
    762   __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
    763   __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
    764   __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;
    765 
    766   return (__m64)__res.as_m64;
    767 #endif
    768 }
    769 
    770 extern __inline __m64
    771     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    772     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
    773   return _mm_cmpgt_pi8(__m1, __m2);
    774 }
    775 
    776 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
    777    the test is true and zero if false.  */
    778 extern __inline __m64
    779     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    780     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
    781 #if _ARCH_PWR8
    782   __vector signed short __a, __b, __c;
    783 
    784   __a = (__vector signed short)vec_splats(__m1);
    785   __b = (__vector signed short)vec_splats(__m2);
    786   __c = (__vector signed short)vec_cmpeq(__a, __b);
    787   return (__m64)((__vector long long)__c)[0];
    788 #else
    789   __m64_union __mu1, __mu2, __res;
    790 
    791   __mu1.as_m64 = __m1;
    792   __mu2.as_m64 = __m2;
    793 
    794   __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
    795   __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
    796   __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
    797   __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;
    798 
    799   return (__m64)__res.as_m64;
    800 #endif
    801 }
    802 
    803 extern __inline __m64
    804     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    805     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
    806   return _mm_cmpeq_pi16(__m1, __m2);
    807 }
    808 
    809 extern __inline __m64
    810     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    811     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
    812 #if _ARCH_PWR8
    813   __vector signed short __a, __b, __c;
    814 
    815   __a = (__vector signed short)vec_splats(__m1);
    816   __b = (__vector signed short)vec_splats(__m2);
    817   __c = (__vector signed short)vec_cmpgt(__a, __b);
    818   return (__m64)((__vector long long)__c)[0];
    819 #else
    820   __m64_union __mu1, __mu2, __res;
    821 
    822   __mu1.as_m64 = __m1;
    823   __mu2.as_m64 = __m2;
    824 
    825   __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
    826   __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
    827   __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
    828   __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;
    829 
    830   return (__m64)__res.as_m64;
    831 #endif
    832 }
    833 
    834 extern __inline __m64
    835     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    836     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
    837   return _mm_cmpgt_pi16(__m1, __m2);
    838 }
    839 
    840 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
    841    the test is true and zero if false.  */
    842 extern __inline __m64
    843     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    844     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
    845 #if _ARCH_PWR9
    846   __vector signed int __a, __b, __c;
    847 
    848   __a = (__vector signed int)vec_splats(__m1);
    849   __b = (__vector signed int)vec_splats(__m2);
    850   __c = (__vector signed int)vec_cmpeq(__a, __b);
    851   return (__m64)((__vector long long)__c)[0];
    852 #else
    853   __m64_union __mu1, __mu2, __res;
    854 
    855   __mu1.as_m64 = __m1;
    856   __mu2.as_m64 = __m2;
    857 
    858   __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
    859   __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;
    860 
    861   return (__m64)__res.as_m64;
    862 #endif
    863 }
    864 
    865 extern __inline __m64
    866     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    867     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
    868   return _mm_cmpeq_pi32(__m1, __m2);
    869 }
    870 
    871 extern __inline __m64
    872     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    873     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
    874 #if _ARCH_PWR9
    875   __vector signed int __a, __b, __c;
    876 
    877   __a = (__vector signed int)vec_splats(__m1);
    878   __b = (__vector signed int)vec_splats(__m2);
    879   __c = (__vector signed int)vec_cmpgt(__a, __b);
    880   return (__m64)((__vector long long)__c)[0];
    881 #else
    882   __m64_union __mu1, __mu2, __res;
    883 
    884   __mu1.as_m64 = __m1;
    885   __mu2.as_m64 = __m2;
    886 
    887   __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
    888   __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;
    889 
    890   return (__m64)__res.as_m64;
    891 #endif
    892 }
    893 
    894 extern __inline __m64
    895     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    896     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
    897   return _mm_cmpgt_pi32(__m1, __m2);
    898 }
    899 
    900 #if _ARCH_PWR8
    901 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
    902    saturated arithmetic.  */
    903 extern __inline __m64
    904     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    905     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
    906   __vector signed char __a, __b, __c;
    907 
    908   __a = (__vector signed char)vec_splats(__m1);
    909   __b = (__vector signed char)vec_splats(__m2);
    910   __c = vec_adds(__a, __b);
    911   return (__m64)((__vector long long)__c)[0];
    912 }
    913 
    914 extern __inline __m64
    915     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    916     _m_paddsb(__m64 __m1, __m64 __m2) {
    917   return _mm_adds_pi8(__m1, __m2);
    918 }
    919 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
    920    saturated arithmetic.  */
    921 extern __inline __m64
    922     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    923     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
    924   __vector signed short __a, __b, __c;
    925 
    926   __a = (__vector signed short)vec_splats(__m1);
    927   __b = (__vector signed short)vec_splats(__m2);
    928   __c = vec_adds(__a, __b);
    929   return (__m64)((__vector long long)__c)[0];
    930 }
    931 
    932 extern __inline __m64
    933     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    934     _m_paddsw(__m64 __m1, __m64 __m2) {
    935   return _mm_adds_pi16(__m1, __m2);
    936 }
    937 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
    938    saturated arithmetic.  */
    939 extern __inline __m64
    940     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    941     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
    942   __vector unsigned char __a, __b, __c;
    943 
    944   __a = (__vector unsigned char)vec_splats(__m1);
    945   __b = (__vector unsigned char)vec_splats(__m2);
    946   __c = vec_adds(__a, __b);
    947   return (__m64)((__vector long long)__c)[0];
    948 }
    949 
    950 extern __inline __m64
    951     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    952     _m_paddusb(__m64 __m1, __m64 __m2) {
    953   return _mm_adds_pu8(__m1, __m2);
    954 }
    955 
    956 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
    957    saturated arithmetic.  */
    958 extern __inline __m64
    959     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    960     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
    961   __vector unsigned short __a, __b, __c;
    962 
    963   __a = (__vector unsigned short)vec_splats(__m1);
    964   __b = (__vector unsigned short)vec_splats(__m2);
    965   __c = vec_adds(__a, __b);
    966   return (__m64)((__vector long long)__c)[0];
    967 }
    968 
    969 extern __inline __m64
    970     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    971     _m_paddusw(__m64 __m1, __m64 __m2) {
    972   return _mm_adds_pu16(__m1, __m2);
    973 }
    974 
    975 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
    976    saturating arithmetic.  */
    977 extern __inline __m64
    978     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    979     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
    980   __vector signed char __a, __b, __c;
    981 
    982   __a = (__vector signed char)vec_splats(__m1);
    983   __b = (__vector signed char)vec_splats(__m2);
    984   __c = vec_subs(__a, __b);
    985   return (__m64)((__vector long long)__c)[0];
    986 }
    987 
    988 extern __inline __m64
    989     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    990     _m_psubsb(__m64 __m1, __m64 __m2) {
    991   return _mm_subs_pi8(__m1, __m2);
    992 }
    993 
    994 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
    995    signed saturating arithmetic.  */
    996 extern __inline __m64
    997     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    998     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
    999   __vector signed short __a, __b, __c;
   1000 
   1001   __a = (__vector signed short)vec_splats(__m1);
   1002   __b = (__vector signed short)vec_splats(__m2);
   1003   __c = vec_subs(__a, __b);
   1004   return (__m64)((__vector long long)__c)[0];
   1005 }
   1006 
   1007 extern __inline __m64
   1008     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1009     _m_psubsw(__m64 __m1, __m64 __m2) {
   1010   return _mm_subs_pi16(__m1, __m2);
   1011 }
   1012 
   1013 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   1014    unsigned saturating arithmetic.  */
   1015 extern __inline __m64
   1016     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1017     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
   1018   __vector unsigned char __a, __b, __c;
   1019 
   1020   __a = (__vector unsigned char)vec_splats(__m1);
   1021   __b = (__vector unsigned char)vec_splats(__m2);
   1022   __c = vec_subs(__a, __b);
   1023   return (__m64)((__vector long long)__c)[0];
   1024 }
   1025 
   1026 extern __inline __m64
   1027     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1028     _m_psubusb(__m64 __m1, __m64 __m2) {
   1029   return _mm_subs_pu8(__m1, __m2);
   1030 }
   1031 
   1032 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   1033    unsigned saturating arithmetic.  */
   1034 extern __inline __m64
   1035     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1036     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
   1037   __vector unsigned short __a, __b, __c;
   1038 
   1039   __a = (__vector unsigned short)vec_splats(__m1);
   1040   __b = (__vector unsigned short)vec_splats(__m2);
   1041   __c = vec_subs(__a, __b);
   1042   return (__m64)((__vector long long)__c)[0];
   1043 }
   1044 
   1045 extern __inline __m64
   1046     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1047     _m_psubusw(__m64 __m1, __m64 __m2) {
   1048   return _mm_subs_pu16(__m1, __m2);
   1049 }
   1050 
   1051 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   1052    four 32-bit intermediate results, which are then summed by pairs to
   1053    produce two 32-bit results.  */
   1054 extern __inline __m64
   1055     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1056     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
   1057   __vector signed short __a, __b;
   1058   __vector signed int __c;
   1059   __vector signed int __zero = {0, 0, 0, 0};
   1060 
   1061   __a = (__vector signed short)vec_splats(__m1);
   1062   __b = (__vector signed short)vec_splats(__m2);
   1063   __c = vec_vmsumshm(__a, __b, __zero);
   1064   return (__m64)((__vector long long)__c)[0];
   1065 }
   1066 
   1067 extern __inline __m64
   1068     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1069     _m_pmaddwd(__m64 __m1, __m64 __m2) {
   1070   return _mm_madd_pi16(__m1, __m2);
   1071 }
   1072 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   1073    M2 and produce the high 16 bits of the 32-bit results.  */
   1074 extern __inline __m64
   1075     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1076     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
   1077   __vector signed short __a, __b;
   1078   __vector signed short __c;
   1079   __vector signed int __w0, __w1;
   1080   __vector unsigned char __xform1 = {
   1081 #ifdef __LITTLE_ENDIAN__
   1082       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
   1083       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
   1084 #else
   1085       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
   1086       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
   1087 #endif
   1088   };
   1089 
   1090   __a = (__vector signed short)vec_splats(__m1);
   1091   __b = (__vector signed short)vec_splats(__m2);
   1092 
   1093   __w0 = vec_vmulesh(__a, __b);
   1094   __w1 = vec_vmulosh(__a, __b);
   1095   __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);
   1096 
   1097   return (__m64)((__vector long long)__c)[0];
   1098 }
   1099 
   1100 extern __inline __m64
   1101     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1102     _m_pmulhw(__m64 __m1, __m64 __m2) {
   1103   return _mm_mulhi_pi16(__m1, __m2);
   1104 }
   1105 
   1106 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   1107    the low 16 bits of the results.  */
   1108 extern __inline __m64
   1109     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1110     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
   1111   __vector signed short __a, __b, __c;
   1112 
   1113   __a = (__vector signed short)vec_splats(__m1);
   1114   __b = (__vector signed short)vec_splats(__m2);
   1115   __c = __a * __b;
   1116   return (__m64)((__vector long long)__c)[0];
   1117 }
   1118 
   1119 extern __inline __m64
   1120     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1121     _m_pmullw(__m64 __m1, __m64 __m2) {
   1122   return _mm_mullo_pi16(__m1, __m2);
   1123 }
   1124 
   1125 /* Shift four 16-bit values in M left by COUNT.  */
   1126 extern __inline __m64
   1127     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1128     _mm_sll_pi16(__m64 __m, __m64 __count) {
   1129   __vector signed short __r;
   1130   __vector unsigned short __c;
   1131 
   1132   if (__count <= 15) {
   1133     __r = (__vector signed short)vec_splats(__m);
   1134     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
   1135     __r = vec_sl(__r, (__vector unsigned short)__c);
   1136     return (__m64)((__vector long long)__r)[0];
   1137   } else
   1138     return (0);
   1139 }
   1140 
   1141 extern __inline __m64
   1142     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1143     _m_psllw(__m64 __m, __m64 __count) {
   1144   return _mm_sll_pi16(__m, __count);
   1145 }
   1146 
   1147 extern __inline __m64
   1148     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1149     _mm_slli_pi16(__m64 __m, int __count) {
   1150   /* Promote int to long then invoke mm_sll_pi16.  */
   1151   return _mm_sll_pi16(__m, __count);
   1152 }
   1153 
   1154 extern __inline __m64
   1155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1156     _m_psllwi(__m64 __m, int __count) {
   1157   return _mm_slli_pi16(__m, __count);
   1158 }
   1159 
   1160 /* Shift two 32-bit values in M left by COUNT.  */
   1161 extern __inline __m64
   1162     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1163     _mm_sll_pi32(__m64 __m, __m64 __count) {
   1164   __m64_union __res;
   1165 
   1166   __res.as_m64 = __m;
   1167 
   1168   __res.as_int[0] = __res.as_int[0] << __count;
   1169   __res.as_int[1] = __res.as_int[1] << __count;
   1170   return (__res.as_m64);
   1171 }
   1172 
   1173 extern __inline __m64
   1174     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1175     _m_pslld(__m64 __m, __m64 __count) {
   1176   return _mm_sll_pi32(__m, __count);
   1177 }
   1178 
   1179 extern __inline __m64
   1180     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1181     _mm_slli_pi32(__m64 __m, int __count) {
   1182   /* Promote int to long then invoke mm_sll_pi32.  */
   1183   return _mm_sll_pi32(__m, __count);
   1184 }
   1185 
   1186 extern __inline __m64
   1187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1188     _m_pslldi(__m64 __m, int __count) {
   1189   return _mm_slli_pi32(__m, __count);
   1190 }
   1191 
   1192 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
   1193 extern __inline __m64
   1194     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1195     _mm_sra_pi16(__m64 __m, __m64 __count) {
   1196   __vector signed short __r;
   1197   __vector unsigned short __c;
   1198 
   1199   if (__count <= 15) {
   1200     __r = (__vector signed short)vec_splats(__m);
   1201     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
   1202     __r = vec_sra(__r, (__vector unsigned short)__c);
   1203     return (__m64)((__vector long long)__r)[0];
   1204   } else
   1205     return (0);
   1206 }
   1207 
   1208 extern __inline __m64
   1209     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1210     _m_psraw(__m64 __m, __m64 __count) {
   1211   return _mm_sra_pi16(__m, __count);
   1212 }
   1213 
   1214 extern __inline __m64
   1215     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1216     _mm_srai_pi16(__m64 __m, int __count) {
   1217   /* Promote int to long then invoke mm_sra_pi32.  */
   1218   return _mm_sra_pi16(__m, __count);
   1219 }
   1220 
   1221 extern __inline __m64
   1222     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1223     _m_psrawi(__m64 __m, int __count) {
   1224   return _mm_srai_pi16(__m, __count);
   1225 }
   1226 
   1227 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
   1228 extern __inline __m64
   1229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1230     _mm_sra_pi32(__m64 __m, __m64 __count) {
   1231   __m64_union __res;
   1232 
   1233   __res.as_m64 = __m;
   1234 
   1235   __res.as_int[0] = __res.as_int[0] >> __count;
   1236   __res.as_int[1] = __res.as_int[1] >> __count;
   1237   return (__res.as_m64);
   1238 }
   1239 
   1240 extern __inline __m64
   1241     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1242     _m_psrad(__m64 __m, __m64 __count) {
   1243   return _mm_sra_pi32(__m, __count);
   1244 }
   1245 
   1246 extern __inline __m64
   1247     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1248     _mm_srai_pi32(__m64 __m, int __count) {
   1249   /* Promote int to long then invoke mm_sra_pi32.  */
   1250   return _mm_sra_pi32(__m, __count);
   1251 }
   1252 
   1253 extern __inline __m64
   1254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1255     _m_psradi(__m64 __m, int __count) {
   1256   return _mm_srai_pi32(__m, __count);
   1257 }
   1258 
   1259 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
   1260 extern __inline __m64
   1261     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1262     _mm_srl_pi16(__m64 __m, __m64 __count) {
   1263   __vector unsigned short __r;
   1264   __vector unsigned short __c;
   1265 
   1266   if (__count <= 15) {
   1267     __r = (__vector unsigned short)vec_splats(__m);
   1268     __c = (__vector unsigned short)vec_splats((unsigned short)__count);
   1269     __r = vec_sr(__r, (__vector unsigned short)__c);
   1270     return (__m64)((__vector long long)__r)[0];
   1271   } else
   1272     return (0);
   1273 }
   1274 
   1275 extern __inline __m64
   1276     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1277     _m_psrlw(__m64 __m, __m64 __count) {
   1278   return _mm_srl_pi16(__m, __count);
   1279 }
   1280 
   1281 extern __inline __m64
   1282     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1283     _mm_srli_pi16(__m64 __m, int __count) {
   1284   /* Promote int to long then invoke mm_sra_pi32.  */
   1285   return _mm_srl_pi16(__m, __count);
   1286 }
   1287 
   1288 extern __inline __m64
   1289     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1290     _m_psrlwi(__m64 __m, int __count) {
   1291   return _mm_srli_pi16(__m, __count);
   1292 }
   1293 
   1294 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
   1295 extern __inline __m64
   1296     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1297     _mm_srl_pi32(__m64 __m, __m64 __count) {
   1298   __m64_union __res;
   1299 
   1300   __res.as_m64 = __m;
   1301 
   1302   __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
   1303   __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
   1304   return (__res.as_m64);
   1305 }
   1306 
   1307 extern __inline __m64
   1308     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1309     _m_psrld(__m64 __m, __m64 __count) {
   1310   return _mm_srl_pi32(__m, __count);
   1311 }
   1312 
   1313 extern __inline __m64
   1314     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1315     _mm_srli_pi32(__m64 __m, int __count) {
   1316   /* Promote int to long then invoke mm_srl_pi32.  */
   1317   return _mm_srl_pi32(__m, __count);
   1318 }
   1319 
   1320 extern __inline __m64
   1321     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1322     _m_psrldi(__m64 __m, int __count) {
   1323   return _mm_srli_pi32(__m, __count);
   1324 }
   1325 #endif /* _ARCH_PWR8 */
   1326 
   1327 /* Creates a vector of two 32-bit values; I0 is least significant.  */
   1328 extern __inline __m64
   1329     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1330     _mm_set_pi32(int __i1, int __i0) {
   1331   __m64_union __res;
   1332 
   1333   __res.as_int[0] = __i0;
   1334   __res.as_int[1] = __i1;
   1335   return (__res.as_m64);
   1336 }
   1337 
   1338 /* Creates a vector of four 16-bit values; W0 is least significant.  */
   1339 extern __inline __m64
   1340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1341     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
   1342   __m64_union __res;
   1343 
   1344   __res.as_short[0] = __w0;
   1345   __res.as_short[1] = __w1;
   1346   __res.as_short[2] = __w2;
   1347   __res.as_short[3] = __w3;
   1348   return (__res.as_m64);
   1349 }
   1350 
   1351 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
   1352 extern __inline __m64
   1353     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1354     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
   1355                 char __b2, char __b1, char __b0) {
   1356   __m64_union __res;
   1357 
   1358   __res.as_char[0] = __b0;
   1359   __res.as_char[1] = __b1;
   1360   __res.as_char[2] = __b2;
   1361   __res.as_char[3] = __b3;
   1362   __res.as_char[4] = __b4;
   1363   __res.as_char[5] = __b5;
   1364   __res.as_char[6] = __b6;
   1365   __res.as_char[7] = __b7;
   1366   return (__res.as_m64);
   1367 }
   1368 
   1369 /* Similar, but with the arguments in reverse order.  */
   1370 extern __inline __m64
   1371     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1372     _mm_setr_pi32(int __i0, int __i1) {
   1373   __m64_union __res;
   1374 
   1375   __res.as_int[0] = __i0;
   1376   __res.as_int[1] = __i1;
   1377   return (__res.as_m64);
   1378 }
   1379 
   1380 extern __inline __m64
   1381     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1382     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
   1383   return _mm_set_pi16(__w3, __w2, __w1, __w0);
   1384 }
   1385 
   1386 extern __inline __m64
   1387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1388     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
   1389                  char __b5, char __b6, char __b7) {
   1390   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
   1391 }
   1392 
   1393 /* Creates a vector of two 32-bit values, both elements containing I.  */
   1394 extern __inline __m64
   1395     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1396     _mm_set1_pi32(int __i) {
   1397   __m64_union __res;
   1398 
   1399   __res.as_int[0] = __i;
   1400   __res.as_int[1] = __i;
   1401   return (__res.as_m64);
   1402 }
   1403 
   1404 /* Creates a vector of four 16-bit values, all elements containing W.  */
   1405 extern __inline __m64
   1406     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1407     _mm_set1_pi16(short __w) {
   1408 #if _ARCH_PWR9
   1409   __vector signed short w;
   1410 
   1411   w = (__vector signed short)vec_splats(__w);
   1412   return (__m64)((__vector long long)w)[0];
   1413 #else
   1414   __m64_union __res;
   1415 
   1416   __res.as_short[0] = __w;
   1417   __res.as_short[1] = __w;
   1418   __res.as_short[2] = __w;
   1419   __res.as_short[3] = __w;
   1420   return (__res.as_m64);
   1421 #endif
   1422 }
   1423 
   1424 /* Creates a vector of eight 8-bit values, all elements containing B.  */
   1425 extern __inline __m64
   1426     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1427     _mm_set1_pi8(signed char __b) {
   1428 #if _ARCH_PWR8
   1429   __vector signed char __res;
   1430 
   1431   __res = (__vector signed char)vec_splats(__b);
   1432   return (__m64)((__vector long long)__res)[0];
   1433 #else
   1434   __m64_union __res;
   1435 
   1436   __res.as_char[0] = __b;
   1437   __res.as_char[1] = __b;
   1438   __res.as_char[2] = __b;
   1439   __res.as_char[3] = __b;
   1440   __res.as_char[4] = __b;
   1441   __res.as_char[5] = __b;
   1442   __res.as_char[6] = __b;
   1443   __res.as_char[7] = __b;
   1444   return (__res.as_m64);
   1445 #endif
   1446 }
   1447 
   1448 #else
   1449 #include_next <mmintrin.h>
   1450 #endif /* defined(__powerpc64__) &&                                            \
   1451         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
   1452 
   1453 #endif /* _MMINTRIN_H_INCLUDED */