zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

avx10_2minmaxintrin.h (16562B) - Raw


      1 /*===-------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics -------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 #ifndef __IMMINTRIN_H
     10 #error                                                                         \
     11     "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
     12 #endif // __IMMINTRIN_H
     13 
     14 #ifndef __AVX10_2MINMAXINTRIN_H
     15 #define __AVX10_2MINMAXINTRIN_H
     16 
     17 #define _mm_minmax_pbh(A, B, C)                                                \
     18   ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A),              \
     19                                            (__m128bh)(__v8bf)(B), (int)(C)))
     20 
     21 #define _mm_mask_minmax_pbh(W, U, A, B, C)                                     \
     22   ((__m128bh)__builtin_ia32_selectpbf_128(                                     \
     23       (__mmask8)(U),                                                           \
     24       (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B),     \
     25                              (int)(C)),                                        \
     26       (__v8bf)(W)))
     27 
     28 #define _mm_maskz_minmax_pbh(U, A, B, C)                                       \
     29   ((__m128bh)__builtin_ia32_selectpbf_128(                                     \
     30       (__mmask8)(U),                                                           \
     31       (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B),     \
     32                              (int)(C)),                                        \
     33       (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps())))
     34 
     35 #define _mm256_minmax_pbh(A, B, C)                                             \
     36   ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A),             \
     37                                            (__m256bh)(__v16bf)(B), (int)(C)))
     38 
     39 #define _mm256_mask_minmax_pbh(W, U, A, B, C)                                  \
     40   ((__m256bh)__builtin_ia32_selectpbf_256(                                     \
     41       (__mmask16)(U),                                                          \
     42       (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A),                       \
     43                                  (__m256bh)(__v16bf)(B), (int)(C)),            \
     44       (__v16bf)(W)))
     45 
     46 #define _mm256_maskz_minmax_pbh(U, A, B, C)                                    \
     47   ((__m256bh)__builtin_ia32_selectpbf_256(                                     \
     48       (__mmask16)(U),                                                          \
     49       (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A),                       \
     50                                  (__m256bh)(__v16bf)(B), (int)(C)),            \
     51       (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps())))
     52 
     53 #define _mm_minmax_pd(A, B, C)                                                 \
     54   ((__m128d)__builtin_ia32_vminmaxpd128_mask(                                  \
     55       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
     56       (__v2df)_mm_setzero_pd(), (__mmask8)-1))
     57 
     58 #define _mm_mask_minmax_pd(W, U, A, B, C)                                      \
     59   ((__m128d)__builtin_ia32_vminmaxpd128_mask(                                  \
     60       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
     61       (__v2df)(__m128d)(W), (__mmask8)(U)))
     62 
     63 #define _mm_maskz_minmax_pd(U, A, B, C)                                        \
     64   ((__m128d)__builtin_ia32_vminmaxpd128_mask(                                  \
     65       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
     66       (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
     67 
     68 #define _mm256_minmax_pd(A, B, C)                                              \
     69   ((__m256d)__builtin_ia32_vminmaxpd256_round_mask(                            \
     70       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
     71       (__v4df)_mm256_setzero_pd(), (__mmask8)-1, _MM_FROUND_NO_EXC))
     72 
     73 #define _mm256_mask_minmax_pd(W, U, A, B, C)                                   \
     74   ((__m256d)__builtin_ia32_vminmaxpd256_round_mask(                            \
     75       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
     76       (__v4df)(__m256d)(W), (__mmask8)(U), _MM_FROUND_NO_EXC))
     77 
     78 #define _mm256_maskz_minmax_pd(U, A, B, C)                                     \
     79   ((__m256d)__builtin_ia32_vminmaxpd256_round_mask(                            \
     80       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
     81       (__v4df)_mm256_setzero_pd(), (__mmask8)(U), _MM_FROUND_NO_EXC))
     82 
     83 #define _mm256_minmax_round_pd(A, B, C, R)                                     \
     84   ((__m256d)__builtin_ia32_vminmaxpd256_round_mask(                            \
     85       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
     86       (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
     87 
     88 #define _mm256_mask_minmax_round_pd(W, U, A, B, C, R)                          \
     89   ((__m256d)__builtin_ia32_vminmaxpd256_round_mask(                            \
     90       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
     91       (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
     92 
     93 #define _mm256_maskz_minmax_round_pd(U, A, B, C, R)                            \
     94   ((__m256d)__builtin_ia32_vminmaxpd256_round_mask(                            \
     95       (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
     96       (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
     97 
     98 #define _mm_minmax_ph(A, B, C)                                                 \
     99   ((__m128h)__builtin_ia32_vminmaxph128_mask(                                  \
    100       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    101       (__v8hf)_mm_setzero_ph(), (__mmask8)-1))
    102 
    103 #define _mm_mask_minmax_ph(W, U, A, B, C)                                      \
    104   ((__m128h)__builtin_ia32_vminmaxph128_mask(                                  \
    105       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    106       (__v8hf)(__m128h)(W), (__mmask16)-1))
    107 
    108 #define _mm_maskz_minmax_ph(U, A, B, C)                                        \
    109   ((__m128h)__builtin_ia32_vminmaxph128_mask(                                  \
    110       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    111       (__v8hf)_mm_setzero_ph(), (__mmask8)(U)))
    112 
    113 #define _mm256_minmax_ph(A, B, C)                                              \
    114   ((__m256h)__builtin_ia32_vminmaxph256_round_mask(                            \
    115       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C),                  \
    116       (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, _MM_FROUND_NO_EXC))
    117 
    118 #define _mm256_mask_minmax_ph(W, U, A, B, C)                                   \
    119   ((__m256h)__builtin_ia32_vminmaxph256_round_mask(                            \
    120       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C),                  \
    121       (__v16hf)(__m256h)(W), (__mmask16)(U), _MM_FROUND_NO_EXC))
    122 
    123 #define _mm256_maskz_minmax_ph(U, A, B, C)                                     \
    124   ((__m256h)__builtin_ia32_vminmaxph256_round_mask(                            \
    125       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C),                  \
    126       (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), _MM_FROUND_NO_EXC))
    127 
    128 #define _mm256_minmax_round_ph(A, B, C, R)                                     \
    129   ((__m256h)__builtin_ia32_vminmaxph256_round_mask(                            \
    130       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C),                  \
    131       (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
    132 
    133 #define _mm256_mask_minmax_round_ph(W, U, A, B, C, R)                          \
    134   ((__m256h)__builtin_ia32_vminmaxph256_round_mask(                            \
    135       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (C),                       \
    136       (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))
    137 
    138 #define _mm256_maskz_minmax_round_ph(U, A, B, C, R)                            \
    139   ((__m256h)__builtin_ia32_vminmaxph256_round_mask(                            \
    140       (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C),                  \
    141       (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
    142 
    143 #define _mm_minmax_ps(A, B, C)                                                 \
    144   ((__m128)__builtin_ia32_vminmaxps128_mask(                                   \
    145       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
    146       (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
    147 
    148 #define _mm_mask_minmax_ps(W, U, A, B, C)                                      \
    149   ((__m128)__builtin_ia32_vminmaxps128_mask(                                   \
    150       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
    151       (__mmask8)(U)))
    152 
    153 #define _mm_maskz_minmax_ps(U, A, B, C)                                        \
    154   ((__m128)__builtin_ia32_vminmaxps128_mask(                                   \
    155       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
    156       (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
    157 
    158 #define _mm256_minmax_ps(A, B, C)                                              \
    159   ((__m256)__builtin_ia32_vminmaxps256_round_mask(                             \
    160       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
    161       (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_NO_EXC))
    162 
    163 #define _mm256_mask_minmax_ps(W, U, A, B, C)                                   \
    164   ((__m256)__builtin_ia32_vminmaxps256_round_mask(                             \
    165       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
    166       (__mmask8)(U), _MM_FROUND_NO_EXC))
    167 
    168 #define _mm256_maskz_minmax_ps(U, A, B, C)                                     \
    169   ((__m256)__builtin_ia32_vminmaxps256_round_mask(                             \
    170       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
    171       (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), _MM_FROUND_NO_EXC))
    172 
    173 #define _mm256_minmax_round_ps(A, B, C, R)                                     \
    174   ((__m256)__builtin_ia32_vminmaxps256_round_mask(                             \
    175       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
    176       (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
    177 
    178 #define _mm256_mask_minmax_round_ps(W, U, A, B, C, R)                          \
    179   ((__m256)__builtin_ia32_vminmaxps256_round_mask(                             \
    180       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
    181       (__mmask8)(U), (int)(R)))
    182 
    183 #define _mm256_maskz_minmax_round_ps(U, A, B, C, R)                            \
    184   ((__m256)__builtin_ia32_vminmaxps256_round_mask(                             \
    185       (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C),                      \
    186       (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
    187 
    188 #define _mm_minmax_sd(A, B, C)                                                 \
    189   ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
    190       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
    191       (__v2df)_mm_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
    192 
    193 #define _mm_mask_minmax_sd(W, U, A, B, C)                                      \
    194   ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
    195       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
    196       (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
    197 
    198 #define _mm_maskz_minmax_sd(U, A, B, C)                                        \
    199   ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
    200       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
    201       (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
    202 
    203 #define _mm_minmax_round_sd(A, B, C, R)                                        \
    204   ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
    205       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
    206       (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R)))
    207 
    208 #define _mm_mask_minmax_round_sd(W, U, A, B, C, R)                             \
    209   ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
    210       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
    211       (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
    212 
    213 #define _mm_maskz_minmax_round_sd(U, A, B, C, R)                               \
    214   ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
    215       (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
    216       (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
    217 
    218 #define _mm_minmax_sh(A, B, C)                                                 \
    219   ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
    220       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    221       (__v8hf)_mm_undefined_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
    222 
    223 #define _mm_mask_minmax_sh(W, U, A, B, C)                                      \
    224   ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
    225       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    226       (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
    227 
    228 #define _mm_maskz_minmax_sh(U, A, B, C)                                        \
    229   ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
    230       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    231       (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
    232 
    233 #define _mm_minmax_round_sh(A, B, C, R)                                        \
    234   ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
    235       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    236       (__v8hf)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
    237 
    238 #define _mm_mask_minmax_round_sh(W, U, A, B, C, R)                             \
    239   ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
    240       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    241       (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
    242 
    243 #define _mm_maskz_minmax_round_sh(U, A, B, C, R)                               \
    244   ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
    245       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
    246       (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
    247 
    248 #define _mm_minmax_ss(A, B, C)                                                 \
    249   ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
    250       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
    251       (__v4sf)_mm_undefined_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
    252 
    253 #define _mm_mask_minmax_ss(W, U, A, B, C)                                      \
    254   ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
    255       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W),         \
    256       (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
    257 
    258 #define _mm_maskz_minmax_ss(U, A, B, C)                                        \
    259   ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
    260       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
    261       (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
    262 
    263 #define _mm_minmax_round_ss(A, B, C, R)                                        \
    264   ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
    265       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
    266       (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R)))
    267 
    268 #define _mm_mask_minmax_round_ss(W, U, A, B, C, R)                             \
    269   ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
    270       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W),         \
    271       (__mmask8)(U), (int)(R)))
    272 
    273 #define _mm_maskz_minmax_round_ss(U, A, B, C, R)                               \
    274   ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
    275       (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
    276       (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
    277 #endif // __AVX10_2MINMAXINTRIN_H