smmintrin.h - zig - fork of https://codeberg.org/ziglang/zig

smmintrin.h (23559B) - Raw
      1 /*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 /* Implemented from the specification included in the Intel C++ Compiler
     11    User Guide and Reference, version 9.0.
     12 
     13    NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */
     14 
     15 #ifndef NO_WARN_X86_INTRINSICS
     16 /* This header is distributed to simplify porting x86_64 code that
     17    makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
     18 
     19    It is the user's responsibility to determine if the results are
     20    acceptable and make additional changes as necessary.
     21 
     22    Note that much code that uses Intel intrinsics can be rewritten in
     23    standard C or GNU C extensions, which are more portable and better
     24    optimized across multiple targets.  */
     25 #error                                                                         \
     26     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
     27 #endif
     28 
     29 #ifndef SMMINTRIN_H_
     30 #define SMMINTRIN_H_
     31 
     32 #if defined(__powerpc64__) &&                                                  \
     33     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
     34 
     35 #include <altivec.h>
     36 #include <tmmintrin.h>
     37 
     38 /* Rounding mode macros. */
     39 #define _MM_FROUND_TO_NEAREST_INT 0x00
     40 #define _MM_FROUND_TO_ZERO 0x01
     41 #define _MM_FROUND_TO_POS_INF 0x02
     42 #define _MM_FROUND_TO_NEG_INF 0x03
     43 #define _MM_FROUND_CUR_DIRECTION 0x04
     44 
     45 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
     46 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
     47 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
     48 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
     49 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
     50 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
     51 
     52 #define _MM_FROUND_RAISE_EXC 0x00
     53 #define _MM_FROUND_NO_EXC 0x08
     54 
     55 extern __inline __m128d
     56     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     57     _mm_round_pd(__m128d __A, int __rounding) {
     58   __v2df __r;
     59   union {
     60     double __fr;
     61     long long __fpscr;
     62   } __enables_save, __fpscr_save;
     63 
     64   if (__rounding & _MM_FROUND_NO_EXC) {
     65     /* Save enabled exceptions, disable all exceptions,
     66        and preserve the rounding mode.  */
     67 #ifdef _ARCH_PWR9
     68     __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
     69     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
     70 #else
     71     __fpscr_save.__fr = __builtin_ppc_mffs();
     72     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
     73     __fpscr_save.__fpscr &= ~0xf8;
     74     __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
     75 #endif
     76     /* Insert an artificial "read/write" reference to the variable
     77        read below, to ensure the compiler does not schedule
     78        a read/use of the variable before the FPSCR is modified, above.
     79        This can be removed if and when GCC PR102783 is fixed.
     80      */
     81     __asm__("" : "+wa"(__A));
     82   }
     83 
     84   switch (__rounding) {
     85   case _MM_FROUND_TO_NEAREST_INT:
     86 #ifdef _ARCH_PWR9
     87     __fpscr_save.__fr = __builtin_ppc_mffsl();
     88 #else
     89     __fpscr_save.__fr = __builtin_ppc_mffs();
     90     __fpscr_save.__fpscr &= 0x70007f0ffL;
     91 #endif
     92     __attribute__((fallthrough));
     93   case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
     94     __builtin_ppc_set_fpscr_rn(0b00);
     95     /* Insert an artificial "read/write" reference to the variable
     96        read below, to ensure the compiler does not schedule
     97        a read/use of the variable before the FPSCR is modified, above.
     98        This can be removed if and when GCC PR102783 is fixed.
     99      */
    100     __asm__("" : "+wa"(__A));
    101 
    102     __r = vec_rint((__v2df)__A);
    103 
    104     /* Insert an artificial "read" reference to the variable written
    105        above, to ensure the compiler does not schedule the computation
    106        of the value after the manipulation of the FPSCR, below.
    107        This can be removed if and when GCC PR102783 is fixed.
    108      */
    109     __asm__("" : : "wa"(__r));
    110     __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    111     break;
    112   case _MM_FROUND_TO_NEG_INF:
    113   case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    114     __r = vec_floor((__v2df)__A);
    115     break;
    116   case _MM_FROUND_TO_POS_INF:
    117   case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    118     __r = vec_ceil((__v2df)__A);
    119     break;
    120   case _MM_FROUND_TO_ZERO:
    121   case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    122     __r = vec_trunc((__v2df)__A);
    123     break;
    124   case _MM_FROUND_CUR_DIRECTION:
    125     __r = vec_rint((__v2df)__A);
    126     break;
    127   }
    128   if (__rounding & _MM_FROUND_NO_EXC) {
    129     /* Insert an artificial "read" reference to the variable written
    130        above, to ensure the compiler does not schedule the computation
    131        of the value after the manipulation of the FPSCR, below.
    132        This can be removed if and when GCC PR102783 is fixed.
    133      */
    134     __asm__("" : : "wa"(__r));
    135     /* Restore enabled exceptions.  */
    136 #ifdef _ARCH_PWR9
    137     __fpscr_save.__fr = __builtin_ppc_mffsl();
    138 #else
    139     __fpscr_save.__fr = __builtin_ppc_mffs();
    140     __fpscr_save.__fpscr &= 0x70007f0ffL;
    141 #endif
    142     __fpscr_save.__fpscr |= __enables_save.__fpscr;
    143     __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
    144   }
    145   return (__m128d)__r;
    146 }
    147 
    148 extern __inline __m128d
    149     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    150     _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
    151   __B = _mm_round_pd(__B, __rounding);
    152   __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
    153   return (__m128d)__r;
    154 }
    155 
    156 extern __inline __m128
    157     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    158     _mm_round_ps(__m128 __A, int __rounding) {
    159   __v4sf __r;
    160   union {
    161     double __fr;
    162     long long __fpscr;
    163   } __enables_save, __fpscr_save;
    164 
    165   if (__rounding & _MM_FROUND_NO_EXC) {
    166     /* Save enabled exceptions, disable all exceptions,
    167        and preserve the rounding mode.  */
    168 #ifdef _ARCH_PWR9
    169     __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    170     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    171 #else
    172     __fpscr_save.__fr = __builtin_ppc_mffs();
    173     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    174     __fpscr_save.__fpscr &= ~0xf8;
    175     __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
    176 #endif
    177     /* Insert an artificial "read/write" reference to the variable
    178        read below, to ensure the compiler does not schedule
    179        a read/use of the variable before the FPSCR is modified, above.
    180        This can be removed if and when GCC PR102783 is fixed.
    181      */
    182     __asm__("" : "+wa"(__A));
    183   }
    184 
    185   switch (__rounding) {
    186   case _MM_FROUND_TO_NEAREST_INT:
    187 #ifdef _ARCH_PWR9
    188     __fpscr_save.__fr = __builtin_ppc_mffsl();
    189 #else
    190     __fpscr_save.__fr = __builtin_ppc_mffs();
    191     __fpscr_save.__fpscr &= 0x70007f0ffL;
    192 #endif
    193     __attribute__((fallthrough));
    194   case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    195     __builtin_ppc_set_fpscr_rn(0b00);
    196     /* Insert an artificial "read/write" reference to the variable
    197        read below, to ensure the compiler does not schedule
    198        a read/use of the variable before the FPSCR is modified, above.
    199        This can be removed if and when GCC PR102783 is fixed.
    200      */
    201     __asm__("" : "+wa"(__A));
    202 
    203     __r = vec_rint((__v4sf)__A);
    204 
    205     /* Insert an artificial "read" reference to the variable written
    206        above, to ensure the compiler does not schedule the computation
    207        of the value after the manipulation of the FPSCR, below.
    208        This can be removed if and when GCC PR102783 is fixed.
    209      */
    210     __asm__("" : : "wa"(__r));
    211     __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    212     break;
    213   case _MM_FROUND_TO_NEG_INF:
    214   case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    215     __r = vec_floor((__v4sf)__A);
    216     break;
    217   case _MM_FROUND_TO_POS_INF:
    218   case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    219     __r = vec_ceil((__v4sf)__A);
    220     break;
    221   case _MM_FROUND_TO_ZERO:
    222   case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    223     __r = vec_trunc((__v4sf)__A);
    224     break;
    225   case _MM_FROUND_CUR_DIRECTION:
    226     __r = vec_rint((__v4sf)__A);
    227     break;
    228   }
    229   if (__rounding & _MM_FROUND_NO_EXC) {
    230     /* Insert an artificial "read" reference to the variable written
    231        above, to ensure the compiler does not schedule the computation
    232        of the value after the manipulation of the FPSCR, below.
    233        This can be removed if and when GCC PR102783 is fixed.
    234      */
    235     __asm__("" : : "wa"(__r));
    236     /* Restore enabled exceptions.  */
    237 #ifdef _ARCH_PWR9
    238     __fpscr_save.__fr = __builtin_ppc_mffsl();
    239 #else
    240     __fpscr_save.__fr = __builtin_ppc_mffs();
    241     __fpscr_save.__fpscr &= 0x70007f0ffL;
    242 #endif
    243     __fpscr_save.__fpscr |= __enables_save.__fpscr;
    244     __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
    245   }
    246   return (__m128)__r;
    247 }
    248 
    249 extern __inline __m128
    250     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    251     _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
    252   __B = _mm_round_ps(__B, __rounding);
    253   __v4sf __r = (__v4sf)__A;
    254   __r[0] = ((__v4sf)__B)[0];
    255   return (__m128)__r;
    256 }
    257 
    258 #define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
    259 #define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
    260 
    261 #define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
    262 #define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)
    263 
    264 #define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
    265 #define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
    266 
    267 #define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
    268 #define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
    269 
    270 extern __inline __m128i
    271     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    272     _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
    273   __v16qi __result = (__v16qi)__A;
    274 
    275   __result[__N & 0xf] = __D;
    276 
    277   return (__m128i)__result;
    278 }
    279 
    280 extern __inline __m128i
    281     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    282     _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
    283   __v4si __result = (__v4si)__A;
    284 
    285   __result[__N & 3] = __D;
    286 
    287   return (__m128i)__result;
    288 }
    289 
    290 extern __inline __m128i
    291     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    292     _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
    293   __v2di __result = (__v2di)__A;
    294 
    295   __result[__N & 1] = __D;
    296 
    297   return (__m128i)__result;
    298 }
    299 
    300 extern __inline int
    301     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    302     _mm_extract_epi8(__m128i __X, const int __N) {
    303   return (unsigned char)((__v16qi)__X)[__N & 15];
    304 }
    305 
    306 extern __inline int
    307     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    308     _mm_extract_epi32(__m128i __X, const int __N) {
    309   return ((__v4si)__X)[__N & 3];
    310 }
    311 
    312 extern __inline int
    313     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    314     _mm_extract_epi64(__m128i __X, const int __N) {
    315   return ((__v2di)__X)[__N & 1];
    316 }
    317 
    318 extern __inline int
    319     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    320     _mm_extract_ps(__m128 __X, const int __N) {
    321   return ((__v4si)__X)[__N & 3];
    322 }
    323 
    324 #ifdef _ARCH_PWR8
    325 extern __inline __m128i
    326     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    327     _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
    328   __v16qu __charmask = vec_splats((unsigned char)__imm8);
    329   __charmask = vec_gb(__charmask);
    330   __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
    331 #ifdef __BIG_ENDIAN__
    332   __shortmask = vec_reve(__shortmask);
    333 #endif
    334   return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
    335 }
    336 #endif
    337 
    338 extern __inline __m128i
    339     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    340     _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
    341 #ifdef _ARCH_PWR10
    342   return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
    343 #else
    344   const __v16qu __seven = vec_splats((unsigned char)0x07);
    345   __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
    346   return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
    347 #endif
    348 }
    349 
    350 extern __inline __m128
    351     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    352     _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
    353   __v16qu __pcv[] = {
    354       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    355       {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    356       {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
    357       {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
    358       {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
    359       {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
    360       {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
    361       {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
    362       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
    363       {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
    364       {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
    365       {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
    366       {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
    367       {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
    368       {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
    369       {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
    370   };
    371   __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
    372   return (__m128)__r;
    373 }
    374 
    375 extern __inline __m128
    376     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    377     _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
    378 #ifdef _ARCH_PWR10
    379   return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
    380 #else
    381   const __v4si __zero = {0};
    382   const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
    383   return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
    384 #endif
    385 }
    386 
    387 extern __inline __m128d
    388     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    389     _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
    390   __v16qu __pcv[] = {
    391       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    392       {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
    393       {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
    394       {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
    395   __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
    396   return (__m128d)__r;
    397 }
    398 
    399 #ifdef _ARCH_PWR8
    400 extern __inline __m128d
    401     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    402     _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
    403 #ifdef _ARCH_PWR10
    404   return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
    405 #else
    406   const __v2di __zero = {0};
    407   const __vector __bool long long __boolmask =
    408       vec_cmplt((__v2di)__mask, __zero);
    409   return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
    410 #endif
    411 }
    412 #endif
    413 
    414 extern __inline int
    415     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    416     _mm_testz_si128(__m128i __A, __m128i __B) {
    417   /* Note: This implementation does NOT set "zero" or "carry" flags.  */
    418   const __v16qu __zero = {0};
    419   return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
    420 }
    421 
    422 extern __inline int
    423     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    424     _mm_testc_si128(__m128i __A, __m128i __B) {
    425   /* Note: This implementation does NOT set "zero" or "carry" flags.  */
    426   const __v16qu __zero = {0};
    427   const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
    428   return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
    429 }
    430 
    431 extern __inline int
    432     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    433     _mm_testnzc_si128(__m128i __A, __m128i __B) {
    434   /* Note: This implementation does NOT set "zero" or "carry" flags.  */
    435   return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
    436 }
    437 
    438 #define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
    439 
    440 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
    441 
    442 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
    443 
    444 #ifdef _ARCH_PWR8
    445 extern __inline __m128i
    446     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    447     _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
    448   return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
    449 }
    450 #endif
    451 
    452 extern __inline __m128i
    453     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    454     _mm_min_epi8(__m128i __X, __m128i __Y) {
    455   return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
    456 }
    457 
    458 extern __inline __m128i
    459     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    460     _mm_min_epu16(__m128i __X, __m128i __Y) {
    461   return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
    462 }
    463 
    464 extern __inline __m128i
    465     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    466     _mm_min_epi32(__m128i __X, __m128i __Y) {
    467   return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
    468 }
    469 
    470 extern __inline __m128i
    471     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    472     _mm_min_epu32(__m128i __X, __m128i __Y) {
    473   return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
    474 }
    475 
    476 extern __inline __m128i
    477     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    478     _mm_max_epi8(__m128i __X, __m128i __Y) {
    479   return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
    480 }
    481 
    482 extern __inline __m128i
    483     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    484     _mm_max_epu16(__m128i __X, __m128i __Y) {
    485   return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
    486 }
    487 
    488 extern __inline __m128i
    489     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    490     _mm_max_epi32(__m128i __X, __m128i __Y) {
    491   return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
    492 }
    493 
    494 extern __inline __m128i
    495     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    496     _mm_max_epu32(__m128i __X, __m128i __Y) {
    497   return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
    498 }
    499 
    500 extern __inline __m128i
    501     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    502     _mm_mullo_epi32(__m128i __X, __m128i __Y) {
    503   return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
    504 }
    505 
    506 #ifdef _ARCH_PWR8
    507 extern __inline __m128i
    508     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    509     _mm_mul_epi32(__m128i __X, __m128i __Y) {
    510   return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
    511 }
    512 #endif
    513 
    514 extern __inline __m128i
    515     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    516     _mm_cvtepi8_epi16(__m128i __A) {
    517   return (__m128i)vec_unpackh((__v16qi)__A);
    518 }
    519 
    520 extern __inline __m128i
    521     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    522     _mm_cvtepi8_epi32(__m128i __A) {
    523   __A = (__m128i)vec_unpackh((__v16qi)__A);
    524   return (__m128i)vec_unpackh((__v8hi)__A);
    525 }
    526 
    527 #ifdef _ARCH_PWR8
    528 extern __inline __m128i
    529     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    530     _mm_cvtepi8_epi64(__m128i __A) {
    531   __A = (__m128i)vec_unpackh((__v16qi)__A);
    532   __A = (__m128i)vec_unpackh((__v8hi)__A);
    533   return (__m128i)vec_unpackh((__v4si)__A);
    534 }
    535 #endif
    536 
    537 extern __inline __m128i
    538     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    539     _mm_cvtepi16_epi32(__m128i __A) {
    540   return (__m128i)vec_unpackh((__v8hi)__A);
    541 }
    542 
    543 #ifdef _ARCH_PWR8
    544 extern __inline __m128i
    545     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    546     _mm_cvtepi16_epi64(__m128i __A) {
    547   __A = (__m128i)vec_unpackh((__v8hi)__A);
    548   return (__m128i)vec_unpackh((__v4si)__A);
    549 }
    550 #endif
    551 
    552 #ifdef _ARCH_PWR8
    553 extern __inline __m128i
    554     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    555     _mm_cvtepi32_epi64(__m128i __A) {
    556   return (__m128i)vec_unpackh((__v4si)__A);
    557 }
    558 #endif
    559 
    560 extern __inline __m128i
    561     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    562     _mm_cvtepu8_epi16(__m128i __A) {
    563   const __v16qu __zero = {0};
    564 #ifdef __LITTLE_ENDIAN__
    565   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
    566 #else  /* __BIG_ENDIAN__.  */
    567   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
    568 #endif /* __BIG_ENDIAN__.  */
    569   return __A;
    570 }
    571 
    572 extern __inline __m128i
    573     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    574     _mm_cvtepu8_epi32(__m128i __A) {
    575   const __v16qu __zero = {0};
    576 #ifdef __LITTLE_ENDIAN__
    577   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
    578   __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
    579 #else  /* __BIG_ENDIAN__.  */
    580   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
    581   __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
    582 #endif /* __BIG_ENDIAN__.  */
    583   return __A;
    584 }
    585 
    586 extern __inline __m128i
    587     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    588     _mm_cvtepu8_epi64(__m128i __A) {
    589   const __v16qu __zero = {0};
    590 #ifdef __LITTLE_ENDIAN__
    591   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
    592   __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
    593   __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
    594 #else  /* __BIG_ENDIAN__.  */
    595   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
    596   __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
    597   __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
    598 #endif /* __BIG_ENDIAN__.  */
    599   return __A;
    600 }
    601 
    602 extern __inline __m128i
    603     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    604     _mm_cvtepu16_epi32(__m128i __A) {
    605   const __v8hu __zero = {0};
    606 #ifdef __LITTLE_ENDIAN__
    607   __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
    608 #else  /* __BIG_ENDIAN__.  */
    609   __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
    610 #endif /* __BIG_ENDIAN__.  */
    611   return __A;
    612 }
    613 
    614 extern __inline __m128i
    615     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    616     _mm_cvtepu16_epi64(__m128i __A) {
    617   const __v8hu __zero = {0};
    618 #ifdef __LITTLE_ENDIAN__
    619   __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
    620   __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
    621 #else  /* __BIG_ENDIAN__.  */
    622   __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
    623   __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
    624 #endif /* __BIG_ENDIAN__.  */
    625   return __A;
    626 }
    627 
    628 extern __inline __m128i
    629     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    630     _mm_cvtepu32_epi64(__m128i __A) {
    631   const __v4su __zero = {0};
    632 #ifdef __LITTLE_ENDIAN__
    633   __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
    634 #else  /* __BIG_ENDIAN__.  */
    635   __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
    636 #endif /* __BIG_ENDIAN__.  */
    637   return __A;
    638 }
    639 
    640 /* Return horizontal packed word minimum and its index in bits [15:0]
    641    and bits [18:16] respectively.  */
    642 extern __inline __m128i
    643     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    644     _mm_minpos_epu16(__m128i __A) {
    645   union __u {
    646     __m128i __m;
    647     __v8hu __uh;
    648   };
    649   union __u __u = {.__m = __A}, __r = {.__m = {0}};
    650   unsigned short __ridx = 0;
    651   unsigned short __rmin = __u.__uh[__ridx];
    652   unsigned long __i;
    653   for (__i = 1; __i < 8; __i++) {
    654     if (__u.__uh[__i] < __rmin) {
    655       __rmin = __u.__uh[__i];
    656       __ridx = __i;
    657     }
    658   }
    659   __r.__uh[0] = __rmin;
    660   __r.__uh[1] = __ridx;
    661   return __r.__m;
    662 }
    663 
    664 extern __inline __m128i
    665     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    666     _mm_packus_epi32(__m128i __X, __m128i __Y) {
    667   return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
    668 }
    669 
    670 #ifdef _ARCH_PWR8
    671 extern __inline __m128i
    672     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    673     _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
    674   return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
    675 }
    676 #endif
    677 
    678 #else
    679 #include_next <smmintrin.h>
    680 #endif /* defined(__powerpc64__) &&                                            \
    681         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
    682 
    683 #endif /* SMMINTRIN_H_ */
	zig fork of https://codeberg.org/ziglang/zig
	Log \| Files \| Refs \| README \| LICENSE