/* avx10_2bf16intrin.h (44674B) - Raw */
1 /*===-------------- avx10_2bf16intrin.h - AVX10-BF16 intrinsics ------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error \ 11 "Never use <avx10_2bf16intrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifdef __SSE2__ 15 16 #ifndef __AVX10_2BF16INTRIN_H 17 #define __AVX10_2BF16INTRIN_H 18 19 typedef __bf16 __m128bh_u __attribute__((__vector_size__(16), __aligned__(1))); 20 typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); 21 22 /* Define the default attributes for the functions in this file. */ 23 #define __DEFAULT_FN_ATTRS256 \ 24 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ 25 __min_vector_width__(256))) 26 #define __DEFAULT_FN_ATTRS128 \ 27 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ 28 __min_vector_width__(128))) 29 30 static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { 31 return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); 32 } 33 34 static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_setzero_pbh(void) { 35 return __builtin_bit_cast(__m128bh, _mm_setzero_ps()); 36 } 37 38 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castbf16_ps(__m128bh __a) { 39 return (__m128)__a; 40 } 41 42 static __inline__ __m256 __DEFAULT_FN_ATTRS256 43 _mm256_castbf16_ps(__m256bh __a) { 44 return (__m256)__a; 45 } 46 47 static __inline__ __m256d __DEFAULT_FN_ATTRS256 48 _mm256_castbf16_pd(__m256bh __a) { 49 return (__m256d)__a; 50 } 51 52 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castbf16_pd(__m128bh __a) { 53 return (__m128d)__a; 54 } 55 56 static __inline__ __m128i __DEFAULT_FN_ATTRS128 57 _mm_castbf16_si128(__m128bh __a) { 58 return (__m128i)__a; 
59 } 60 61 static __inline__ __m256i __DEFAULT_FN_ATTRS256 62 _mm256_castbf16_si256(__m256bh __a) { 63 return (__m256i)__a; 64 } 65 66 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castps_pbh(__m128 __a) { 67 return (__m128bh)__a; 68 } 69 70 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_castps_pbh(__m256 __a) { 71 return (__m256bh)__a; 72 } 73 74 static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtsbh_bf16(__m128bh __a) { 75 return __a[0]; 76 } 77 78 static __inline__ __bf16 __DEFAULT_FN_ATTRS256 79 _mm256_cvtsbh_bf16(__m256bh __a) { 80 return __a[0]; 81 } 82 83 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castpd_pbh(__m128d __a) { 84 return (__m128bh)__a; 85 } 86 87 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 88 _mm256_castpd_pbh(__m256d __a) { 89 return (__m256bh)__a; 90 } 91 92 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 93 _mm_castsi128_pbh(__m128i __a) { 94 return (__m128bh)__a; 95 } 96 97 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 98 _mm256_castsi256_pbh(__m256i __a) { 99 return (__m256bh)__a; 100 } 101 102 static __inline__ __m128bh __DEFAULT_FN_ATTRS256 103 _mm256_castbf16256_pbh128(__m256bh __a) { 104 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); 105 } 106 107 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 108 _mm256_castbf16128_pbh256(__m128bh __a) { 109 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, 110 -1, -1, -1, -1, -1); 111 } 112 113 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 114 _mm256_zextbf16128_pbh256(__m128bh __a) { 115 return __builtin_shufflevector(__a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 116 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 117 } 118 119 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_undefined_pbh(void) { 120 return (__m256bh)__builtin_ia32_undef256(); 121 } 122 123 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 124 _mm_load_sbh(void const *__dp) { 125 __m128bh src = (__v8bf)_mm_setzero_pbh(); 126 return 
(__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__dp, src, 127 1); 128 } 129 130 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 131 _mm_mask_load_sbh(__m128bh __W, __mmask8 __U, const void *__A) { 132 __m128bh src = (__v8bf)__builtin_shufflevector( 133 (__v8bf)__W, (__v8bf)_mm_setzero_pbh(), 0, 8, 8, 8, 8, 8, 8, 8); 134 135 return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__A, src, 136 __U & 1); 137 } 138 139 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 140 _mm_maskz_load_sbh(__mmask8 __U, const void *__A) { 141 return (__m128bh)__builtin_ia32_loadsbf16128_mask( 142 (const __v8bf *)__A, (__v8bf)_mm_setzero_pbh(), __U & 1); 143 } 144 145 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 146 _mm256_load_pbh(void const *__p) { 147 return *(const __m256bh *)__p; 148 } 149 150 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_load_pbh(void const *__p) { 151 return *(const __m128bh *)__p; 152 } 153 154 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 155 _mm256_loadu_pbh(void const *__p) { 156 struct __loadu_pbh { 157 __m256bh_u __v; 158 } __attribute__((__packed__, __may_alias__)); 159 return ((const struct __loadu_pbh *)__p)->__v; 160 } 161 162 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 163 _mm_loadu_pbh(void const *__p) { 164 struct __loadu_pbh { 165 __m128bh_u __v; 166 } __attribute__((__packed__, __may_alias__)); 167 return ((const struct __loadu_pbh *)__p)->__v; 168 } 169 170 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sbh(void *__dp, 171 __m128bh __a) { 172 struct __mm_store_sbh_struct { 173 __bf16 __u; 174 } __attribute__((__packed__, __may_alias__)); 175 ((struct __mm_store_sbh_struct *)__dp)->__u = __a[0]; 176 } 177 178 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sbh(void *__W, 179 __mmask8 __U, 180 __m128bh __A) { 181 __builtin_ia32_storesbf16128_mask((__v8bf *)__W, __A, __U & 1); 182 } 183 184 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_pbh(void *__P, 185 __m256bh __A) { 186 
*(__m256bh *)__P = __A; 187 } 188 189 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_pbh(void *__P, 190 __m128bh __A) { 191 *(__m128bh *)__P = __A; 192 } 193 194 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_pbh(void *__P, 195 __m256bh __A) { 196 struct __storeu_pbh { 197 __m256bh_u __v; 198 } __attribute__((__packed__, __may_alias__)); 199 ((struct __storeu_pbh *)__P)->__v = __A; 200 } 201 202 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_pbh(void *__P, 203 __m128bh __A) { 204 struct __storeu_pbh { 205 __m128bh_u __v; 206 } __attribute__((__packed__, __may_alias__)); 207 ((struct __storeu_pbh *)__P)->__v = __A; 208 } 209 210 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, 211 __m128bh __b) { 212 __a[0] = __b[0]; 213 return __a; 214 } 215 216 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 217 _mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 218 return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); 219 } 220 221 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 222 _mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 223 return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), 224 _mm_setzero_pbh()); 225 } 226 227 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_undefined_pbh(void) { 228 return (__m128bh)__builtin_ia32_undef128(); 229 } 230 231 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_set_sbh(__bf16 bf) { 232 return (__v8bf)__builtin_shufflevector( 233 (__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}, (__v8bf)_mm_setzero_pbh(), 0, 8, 234 8, 8, 8, 8, 8, 8); 235 } 236 237 static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_set1_pbh(__bf16 bf) { 238 return (__m128bh)(__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}; 239 } 240 241 static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set1_pbh(__bf16 bf) { 242 return (__m256bh)(__v16bf){bf, bf, bf, bf, bf, bf, bf, bf, 243 bf, bf, bf, bf, bf, bf, bf, bf}; 244 } 245 246 static __inline __m128bh 
__DEFAULT_FN_ATTRS128 247 _mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, 248 __bf16 bf6, __bf16 bf7, __bf16 bf8) { 249 return (__m128bh)(__v8bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8}; 250 } 251 252 static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set_pbh( 253 __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, 254 __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, 255 __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { 256 return (__m256bh)(__v16bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, 257 bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16}; 258 } 259 260 #define _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8) \ 261 _mm_set_pbh((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1)) 262 263 #define _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ 264 bf11, bf12, bf13, bf14, bf15, bf16) \ 265 _mm256_set_pbh((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10), \ 266 (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), \ 267 (bf1)) 268 269 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_abs_pbh(__m256bh __A) { 270 return (__m256bh)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), 271 (__m256i)__A); 272 } 273 274 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_abs_pbh(__m128bh __A) { 275 return (__m128bh)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); 276 } 277 278 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 279 _mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) { 280 return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, (__v8bf)__W, 281 (__v8bf)__A); 282 } 283 284 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 285 _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { 286 return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, (__v16bf)__W, 287 (__v16bf)__A); 288 } 289 290 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 291 _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { 292 return 
(__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, 293 (__v8hi)__B); 294 } 295 296 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 297 _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { 298 return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, 299 (__v16hi)__B); 300 } 301 302 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 303 _mm_permutexvar_pbh(__m128i __A, __m128bh __B) { 304 return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); 305 } 306 307 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 308 _mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { 309 return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); 310 } 311 312 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_add_pbh(__m256bh __A, 313 __m256bh __B) { 314 return (__m256bh)((__v16bf)__A + (__v16bf)__B); 315 } 316 317 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 318 _mm256_mask_add_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { 319 return (__m256bh)__builtin_ia32_selectpbf_256( 320 (__mmask16)__U, (__v16bf)_mm256_add_pbh(__A, __B), (__v16bf)__W); 321 } 322 323 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 324 _mm256_maskz_add_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 325 return (__m256bh)__builtin_ia32_selectpbf_256( 326 (__mmask16)__U, (__v16bf)_mm256_add_pbh(__A, __B), 327 (__v16bf)_mm256_setzero_pbh()); 328 } 329 330 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_add_pbh(__m128bh __A, 331 __m128bh __B) { 332 return (__m128bh)((__v8bf)__A + (__v8bf)__B); 333 } 334 335 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 336 _mm_mask_add_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 337 return (__m128bh)__builtin_ia32_selectpbf_128( 338 (__mmask8)__U, (__v8bf)_mm_add_pbh(__A, __B), (__v8bf)__W); 339 } 340 341 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 342 _mm_maskz_add_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 343 return 
(__m128bh)__builtin_ia32_selectpbf_128( 344 (__mmask8)__U, (__v8bf)_mm_add_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); 345 } 346 347 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sub_pbh(__m256bh __A, 348 __m256bh __B) { 349 return (__m256bh)((__v16bf)__A - (__v16bf)__B); 350 } 351 352 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 353 _mm256_mask_sub_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { 354 return (__m256bh)__builtin_ia32_selectpbf_256( 355 (__mmask16)__U, (__v16bf)_mm256_sub_pbh(__A, __B), (__v16bf)__W); 356 } 357 358 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 359 _mm256_maskz_sub_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 360 return (__m256bh)__builtin_ia32_selectpbf_256( 361 (__mmask16)__U, (__v16bf)_mm256_sub_pbh(__A, __B), 362 (__v16bf)_mm256_setzero_pbh()); 363 } 364 365 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sub_pbh(__m128bh __A, 366 __m128bh __B) { 367 return (__m128bh)((__v8bf)__A - (__v8bf)__B); 368 } 369 370 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 371 _mm_mask_sub_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 372 return (__m128bh)__builtin_ia32_selectpbf_128( 373 (__mmask8)__U, (__v8bf)_mm_sub_pbh(__A, __B), (__v8bf)__W); 374 } 375 376 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 377 _mm_maskz_sub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 378 return (__m128bh)__builtin_ia32_selectpbf_128( 379 (__mmask8)__U, (__v8bf)_mm_sub_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); 380 } 381 382 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mul_pbh(__m256bh __A, 383 __m256bh __B) { 384 return (__m256bh)((__v16bf)__A * (__v16bf)__B); 385 } 386 387 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 388 _mm256_mask_mul_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { 389 return (__m256bh)__builtin_ia32_selectpbf_256( 390 (__mmask16)__U, (__v16bf)_mm256_mul_pbh(__A, __B), (__v16bf)__W); 391 } 392 393 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 394 
_mm256_maskz_mul_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 395 return (__m256bh)__builtin_ia32_selectpbf_256( 396 (__mmask16)__U, (__v16bf)_mm256_mul_pbh(__A, __B), 397 (__v16bf)_mm256_setzero_pbh()); 398 } 399 400 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mul_pbh(__m128bh __A, 401 __m128bh __B) { 402 return (__m128bh)((__v8bf)__A * (__v8bf)__B); 403 } 404 405 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 406 _mm_mask_mul_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 407 return (__m128bh)__builtin_ia32_selectpbf_128( 408 (__mmask8)__U, (__v8bf)_mm_mul_pbh(__A, __B), (__v8bf)__W); 409 } 410 411 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 412 _mm_maskz_mul_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 413 return (__m128bh)__builtin_ia32_selectpbf_128( 414 (__mmask8)__U, (__v8bf)_mm_mul_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); 415 } 416 417 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_div_pbh(__m256bh __A, 418 __m256bh __B) { 419 return (__m256bh)((__v16bf)__A / (__v16bf)__B); 420 } 421 422 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 423 _mm256_mask_div_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { 424 return (__m256bh)__builtin_ia32_selectpbf_256( 425 (__mmask16)__U, (__v16bf)_mm256_div_pbh(__A, __B), (__v16bf)__W); 426 } 427 428 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 429 _mm256_maskz_div_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 430 return (__m256bh)__builtin_ia32_selectpbf_256( 431 (__mmask16)__U, (__v16bf)_mm256_div_pbh(__A, __B), 432 (__v16bf)_mm256_setzero_pbh()); 433 } 434 435 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_div_pbh(__m128bh __A, 436 __m128bh __B) { 437 return (__m128bh)((__v8bf)__A / (__v8bf)__B); 438 } 439 440 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 441 _mm_mask_div_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 442 return (__m128bh)__builtin_ia32_selectpbf_128( 443 (__mmask8)__U, (__v8bf)_mm_div_pbh(__A, __B), 
(__v8bf)__W); 444 } 445 446 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 447 _mm_maskz_div_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 448 return (__m128bh)__builtin_ia32_selectpbf_128( 449 (__mmask8)__U, (__v8bf)_mm_div_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); 450 } 451 452 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_max_pbh(__m256bh __A, 453 __m256bh __B) { 454 return (__m256bh)__builtin_ia32_vmaxbf16256((__v16bf)__A, (__v16bf)__B); 455 } 456 457 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 458 _mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { 459 return (__m256bh)__builtin_ia32_selectpbf_256( 460 (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), (__v16bf)__W); 461 } 462 463 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 464 _mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 465 return (__m256bh)__builtin_ia32_selectpbf_256( 466 (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), 467 (__v16bf)_mm256_setzero_pbh()); 468 } 469 470 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_max_pbh(__m128bh __A, 471 __m128bh __B) { 472 return (__m128bh)__builtin_ia32_vmaxbf16128((__v8bf)__A, (__v8bf)__B); 473 } 474 475 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 476 _mm_mask_max_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 477 return (__m128bh)__builtin_ia32_selectpbf_128( 478 (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)__W); 479 } 480 481 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 482 _mm_maskz_max_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 483 return (__m128bh)__builtin_ia32_selectpbf_128( 484 (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); 485 } 486 487 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_min_pbh(__m256bh __A, 488 __m256bh __B) { 489 return (__m256bh)__builtin_ia32_vminbf16256((__v16bf)__A, (__v16bf)__B); 490 } 491 492 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 493 _mm256_mask_min_pbh(__m256bh __W, 
__mmask16 __U, __m256bh __A, __m256bh __B) { 494 return (__m256bh)__builtin_ia32_selectpbf_256( 495 (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), (__v16bf)__W); 496 } 497 498 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 499 _mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 500 return (__m256bh)__builtin_ia32_selectpbf_256( 501 (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), 502 (__v16bf)_mm256_setzero_pbh()); 503 } 504 505 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_min_pbh(__m128bh __A, 506 __m128bh __B) { 507 return (__m128bh)__builtin_ia32_vminbf16128((__v8bf)__A, (__v8bf)__B); 508 } 509 510 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 511 _mm_mask_min_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 512 return (__m128bh)__builtin_ia32_selectpbf_128( 513 (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)__W); 514 } 515 516 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 517 _mm_maskz_min_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 518 return (__m128bh)__builtin_ia32_selectpbf_128( 519 (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); 520 } 521 522 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sbh(__m128bh A, 523 __m128bh B) { 524 return __builtin_ia32_vcomisbf16eq((__v8bf)A, (__v8bf)B); 525 } 526 527 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sbh(__m128bh A, 528 __m128bh B) { 529 return __builtin_ia32_vcomisbf16lt((__v8bf)A, (__v8bf)B); 530 } 531 532 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sbh(__m128bh A, 533 __m128bh B) { 534 return __builtin_ia32_vcomisbf16le((__v8bf)A, (__v8bf)B); 535 } 536 537 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sbh(__m128bh A, 538 __m128bh B) { 539 return __builtin_ia32_vcomisbf16gt((__v8bf)A, (__v8bf)B); 540 } 541 542 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sbh(__m128bh A, 543 __m128bh B) { 544 return __builtin_ia32_vcomisbf16ge((__v8bf)A, (__v8bf)B); 545 } 546 547 static __inline__ 
int __DEFAULT_FN_ATTRS128 _mm_comineq_sbh(__m128bh A, 548 __m128bh B) { 549 return __builtin_ia32_vcomisbf16neq((__v8bf)A, (__v8bf)B); 550 } 551 552 #define _mm256_cmp_pbh_mask(__A, __B, __P) \ 553 ((__mmask16)__builtin_ia32_vcmpbf16256_mask((__v16bf)(__m256bh)(__A), \ 554 (__v16bf)(__m256bh)(__B), \ 555 (int)(__P), (__mmask16) - 1)) 556 557 #define _mm256_mask_cmp_pbh_mask(__U, __A, __B, __P) \ 558 ((__mmask16)__builtin_ia32_vcmpbf16256_mask((__v16bf)(__m256bh)(__A), \ 559 (__v16bf)(__m256bh)(__B), \ 560 (int)(__P), (__mmask16)(__U))) 561 562 #define _mm_cmp_pbh_mask(__A, __B, __P) \ 563 ((__mmask8)__builtin_ia32_vcmpbf16128_mask((__v8bf)(__m128bh)(__A), \ 564 (__v8bf)(__m128bh)(__B), \ 565 (int)(__P), (__mmask8) - 1)) 566 567 #define _mm_mask_cmp_pbh_mask(__U, __A, __B, __P) \ 568 ((__mmask8)__builtin_ia32_vcmpbf16128_mask((__v8bf)(__m128bh)(__A), \ 569 (__v8bf)(__m128bh)(__B), \ 570 (int)(__P), (__mmask8)(__U))) 571 572 #define _mm256_mask_fpclass_pbh_mask(__U, __A, imm) \ 573 ((__mmask16)__builtin_ia32_vfpclassbf16256_mask( \ 574 (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16)(__U))) 575 576 #define _mm256_fpclass_pbh_mask(__A, imm) \ 577 ((__mmask16)__builtin_ia32_vfpclassbf16256_mask( \ 578 (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16) - 1)) 579 580 #define _mm_mask_fpclass_pbh_mask(__U, __A, imm) \ 581 ((__mmask8)__builtin_ia32_vfpclassbf16128_mask((__v8bf)(__m128bh)(__A), \ 582 (int)(imm), (__mmask8)(__U))) 583 584 #define _mm_fpclass_pbh_mask(__A, imm) \ 585 ((__mmask8)__builtin_ia32_vfpclassbf16128_mask((__v8bf)(__m128bh)(__A), \ 586 (int)(imm), (__mmask8) - 1)) 587 588 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 589 _mm256_scalef_pbh(__m256bh __A, __m256bh __B) { 590 return (__m256bh)__builtin_ia32_vscalefbf16256_mask( 591 (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_undefined_pbh(), 592 (__mmask16)-1); 593 } 594 595 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_pbh( 596 __m256bh __W, __mmask16 __U, __m256bh __A, __m256bh 
__B) { 597 return (__m256bh)__builtin_ia32_vscalefbf16256_mask( 598 (__v16bf)__A, (__v16bf)__B, (__v16bf)__W, (__mmask16)__U); 599 } 600 601 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 602 _mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { 603 return (__m256bh)__builtin_ia32_vscalefbf16256_mask( 604 (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_setzero_pbh(), 605 (__mmask16)__U); 606 } 607 608 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_scalef_pbh(__m128bh __A, 609 __m128bh __B) { 610 return (__m128bh)__builtin_ia32_vscalefbf16128_mask( 611 (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); 612 } 613 614 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 615 _mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { 616 return (__m128bh)__builtin_ia32_vscalefbf16128_mask( 617 (__v8bf)__A, (__v8bf)__B, (__v8bf)__W, (__mmask8)__U); 618 } 619 620 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 621 _mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { 622 return (__m128bh)__builtin_ia32_vscalefbf16128_mask( 623 (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); 624 } 625 626 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_rcp_pbh(__m256bh __A) { 627 return (__m256bh)__builtin_ia32_vrcpbf16256_mask( 628 (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); 629 } 630 631 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 632 _mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { 633 return (__m256bh)__builtin_ia32_vrcpbf16256_mask((__v16bf)__A, (__v16bf)__W, 634 (__mmask16)__U); 635 } 636 637 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 638 _mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) { 639 return (__m256bh)__builtin_ia32_vrcpbf16256_mask( 640 (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); 641 } 642 643 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rcp_pbh(__m128bh __A) { 644 return 
(__m128bh)__builtin_ia32_vrcpbf16128_mask( 645 (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); 646 } 647 648 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 649 _mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { 650 return (__m128bh)__builtin_ia32_vrcpbf16128_mask((__v8bf)__A, (__v8bf)__W, 651 (__mmask8)__U); 652 } 653 654 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 655 _mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) { 656 return (__m128bh)__builtin_ia32_vrcpbf16128_mask( 657 (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); 658 } 659 660 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 661 _mm256_getexp_pbh(__m256bh __A) { 662 return (__m256bh)__builtin_ia32_vgetexpbf16256_mask( 663 (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); 664 } 665 666 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 667 _mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { 668 return (__m256bh)__builtin_ia32_vgetexpbf16256_mask( 669 (__v16bf)__A, (__v16bf)__W, (__mmask16)__U); 670 } 671 672 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 673 _mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) { 674 return (__m256bh)__builtin_ia32_vgetexpbf16256_mask( 675 (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); 676 } 677 678 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_getexp_pbh(__m128bh __A) { 679 return (__m128bh)__builtin_ia32_vgetexpbf16128_mask( 680 (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); 681 } 682 683 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 684 _mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { 685 return (__m128bh)__builtin_ia32_vgetexpbf16128_mask((__v8bf)__A, (__v8bf)__W, 686 (__mmask8)__U); 687 } 688 689 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 690 _mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) { 691 return (__m128bh)__builtin_ia32_vgetexpbf16128_mask( 692 (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); 693 } 694 695 static __inline__ __m256bh 
__DEFAULT_FN_ATTRS256 696 _mm256_rsqrt_pbh(__m256bh __A) { 697 return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask( 698 (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); 699 } 700 701 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 702 _mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { 703 return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask((__v16bf)__A, (__v16bf)__W, 704 (__mmask16)__U); 705 } 706 707 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 708 _mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) { 709 return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask( 710 (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); 711 } 712 713 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rsqrt_pbh(__m128bh __A) { 714 return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask( 715 (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); 716 } 717 718 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 719 _mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { 720 return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask((__v8bf)__A, (__v8bf)__W, 721 (__mmask8)__U); 722 } 723 724 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 725 _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { 726 return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask( 727 (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); 728 } 729 730 #define _mm256_reduce_pbh(__A, imm) \ 731 ((__m256bh)__builtin_ia32_vreducebf16256_mask( \ 732 (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_undefined_pbh(), \ 733 (__mmask16) - 1)) 734 735 #define _mm256_mask_reduce_pbh(__W, __U, __A, imm) \ 736 ((__m256bh)__builtin_ia32_vreducebf16256_mask( \ 737 (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ 738 (__mmask16)(__U))) 739 740 #define _mm256_maskz_reduce_pbh(__U, __A, imm) \ 741 ((__m256bh)__builtin_ia32_vreducebf16256_mask( \ 742 (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ 743 (__mmask16)(__U))) 744 745 #define _mm_reduce_pbh(__A, imm) \ 
746 ((__m128bh)__builtin_ia32_vreducebf16128_mask( \ 747 (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_undefined_pbh(), \ 748 (__mmask8) - 1)) 749 750 #define _mm_mask_reduce_pbh(__W, __U, __A, imm) \ 751 ((__m128bh)__builtin_ia32_vreducebf16128_mask( \ 752 (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ 753 (__mmask8)(__U))) 754 755 #define _mm_maskz_reduce_pbh(__U, __A, imm) \ 756 ((__m128bh)__builtin_ia32_vreducebf16128_mask( \ 757 (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ 758 (__mmask8)(__U))) 759 760 #define _mm256_roundscale_pbh(__A, imm) \ 761 ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask( \ 762 (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ 763 (__mmask16) - 1)) 764 765 #define _mm256_mask_roundscale_pbh(__W, __U, __A, imm) \ 766 ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask( \ 767 (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ 768 (__mmask16)(__U))) 769 770 #define _mm256_maskz_roundscale_pbh(__U, __A, imm) \ 771 ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask( \ 772 (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ 773 (__mmask16)(__U))) 774 775 #define _mm_roundscale_pbh(__A, imm) \ 776 ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask( \ 777 (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ 778 (__mmask8) - 1)) 779 780 #define _mm_mask_roundscale_pbh(__W, __U, __A, imm) \ 781 ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask( \ 782 (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ 783 (__mmask8)(__U))) 784 785 #define _mm_maskz_roundscale_pbh(__U, __A, imm) \ 786 ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask( \ 787 (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ 788 (__mmask8)(__U))) 789 790 #define _mm256_getmant_pbh(__A, __B, __C) \ 791 ((__m256bh)__builtin_ia32_vgetmantbf16256_mask( \ 792 (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ 793 (__v16bf)_mm256_undefined_pbh(), 
(__mmask16) - 1)) 794 795 #define _mm256_mask_getmant_pbh(__W, __U, __A, __B, __C) \ 796 ((__m256bh)__builtin_ia32_vgetmantbf16256_mask( \ 797 (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ 798 (__v16bf)(__m256bh)(__W), (__mmask16)(__U))) 799 800 #define _mm256_maskz_getmant_pbh(__U, __A, __B, __C) \ 801 ((__m256bh)__builtin_ia32_vgetmantbf16256_mask( \ 802 (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ 803 (__v16bf)_mm256_setzero_pbh(), (__mmask16)(__U))) 804 805 #define _mm_getmant_pbh(__A, __B, __C) \ 806 ((__m128bh)__builtin_ia32_vgetmantbf16128_mask( \ 807 (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ 808 (__v8bf)_mm_undefined_pbh(), (__mmask8) - 1)) 809 810 #define _mm_mask_getmant_pbh(__W, __U, __A, __B, __C) \ 811 ((__m128bh)__builtin_ia32_vgetmantbf16128_mask( \ 812 (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ 813 (__v8bf)(__m128bh)(__W), (__mmask8)(__U))) 814 815 #define _mm_maskz_getmant_pbh(__U, __A, __B, __C) \ 816 ((__m128bh)__builtin_ia32_vgetmantbf16128_mask( \ 817 (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ 818 (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U))) 819 820 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) { 821 return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A); 822 } 823 824 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 825 _mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { 826 return (__m256bh)__builtin_ia32_selectpbf_256( 827 (__mmask16)__U, (__v16bf)_mm256_sqrt_pbh(__A), (__v16bf)__W); 828 } 829 830 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 831 _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { 832 return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, 833 (__v16bf)_mm256_sqrt_pbh(__A), 834 (__v16bf)_mm256_setzero_pbh()); 835 } 836 837 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) { 838 return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A); 839 } 840 841 static __inline__ __m128bh 
__DEFAULT_FN_ATTRS128 842 _mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { 843 return (__m128bh)__builtin_ia32_selectpbf_128( 844 (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)__W); 845 } 846 847 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 848 _mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { 849 return (__m128bh)__builtin_ia32_selectpbf_128( 850 (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)_mm_setzero_pbh()); 851 } 852 853 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 854 _mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { 855 return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, 856 (__v16bf)__C); 857 } 858 859 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 860 _mm256_mask_fmadd_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { 861 return (__m256bh)__builtin_ia32_selectpbf_256( 862 (__mmask16)__U, 863 _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__A); 864 } 865 866 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pbh( 867 __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { 868 return (__m256bh)__builtin_ia32_selectpbf_256( 869 (__mmask16)__U, 870 _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__C); 871 } 872 873 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh( 874 __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { 875 return (__m256bh)__builtin_ia32_selectpbf_256( 876 (__mmask16)__U, 877 _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 878 (__v16bf)_mm256_setzero_pbh()); 879 } 880 881 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 882 _mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { 883 return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, 884 -(__v16bf)__C); 885 } 886 887 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 888 _mm256_mask_fmsub_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { 889 return 
(__m256bh)__builtin_ia32_selectpbf_256( 890 (__mmask16)__U, 891 _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__A); 892 } 893 894 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_pbh( 895 __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { 896 return (__m256bh)__builtin_ia32_selectpbf_256( 897 (__mmask16)__U, 898 _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__C); 899 } 900 901 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh( 902 __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { 903 return (__m256bh)__builtin_ia32_selectpbf_256( 904 (__mmask16)__U, 905 _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 906 (__v16bf)_mm256_setzero_pbh()); 907 } 908 909 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 910 _mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { 911 return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, 912 (__v16bf)__C); 913 } 914 915 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pbh( 916 __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { 917 return (__m256bh)__builtin_ia32_selectpbf_256( 918 (__mmask16)__U, 919 _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 920 (__v16bf)__A); 921 } 922 923 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_pbh( 924 __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { 925 return (__m256bh)__builtin_ia32_selectpbf_256( 926 (__mmask16)__U, 927 _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 928 (__v16bf)__C); 929 } 930 931 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh( 932 __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { 933 return (__m256bh)__builtin_ia32_selectpbf_256( 934 (__mmask16)__U, 935 _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 936 (__v16bf)_mm256_setzero_pbh()); 937 } 938 939 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 940 
_mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { 941 return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, 942 -(__v16bf)__C); 943 } 944 945 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pbh( 946 __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { 947 return (__m256bh)__builtin_ia32_selectpbf_256( 948 (__mmask16)__U, 949 _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 950 (__v16bf)__A); 951 } 952 953 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_pbh( 954 __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { 955 return (__m256bh)__builtin_ia32_selectpbf_256( 956 (__mmask16)__U, 957 _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 958 (__v16bf)__C); 959 } 960 961 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh( 962 __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { 963 return (__m256bh)__builtin_ia32_selectpbf_256( 964 (__mmask16)__U, 965 _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), 966 (__v16bf)_mm256_setzero_pbh()); 967 } 968 969 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A, 970 __m128bh __B, 971 __m128bh __C) { 972 return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, 973 (__v8bf)__C); 974 } 975 976 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 977 _mm_mask_fmadd_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { 978 return (__m128bh)__builtin_ia32_selectpbf_128( 979 (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 980 (__v8bf)__A); 981 } 982 983 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 984 _mm_mask3_fmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { 985 return (__m128bh)__builtin_ia32_selectpbf_128( 986 (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 987 (__v8bf)__C); 988 } 989 990 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 991 _mm_maskz_fmadd_pbh(__mmask8 __U, 
__m128bh __A, __m128bh __B, __m128bh __C) { 992 return (__m128bh)__builtin_ia32_selectpbf_128( 993 (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 994 (__v8bf)_mm_setzero_pbh()); 995 } 996 997 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A, 998 __m128bh __B, 999 __m128bh __C) { 1000 return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, 1001 -(__v8bf)__C); 1002 } 1003 1004 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1005 _mm_mask_fmsub_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { 1006 return (__m128bh)__builtin_ia32_selectpbf_128( 1007 (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1008 (__v8bf)__A); 1009 } 1010 1011 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1012 _mm_mask3_fmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { 1013 return (__m128bh)__builtin_ia32_selectpbf_128( 1014 (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1015 (__v8bf)__C); 1016 } 1017 1018 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1019 _mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { 1020 return (__m128bh)__builtin_ia32_selectpbf_128( 1021 (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1022 (__v8bf)_mm_setzero_pbh()); 1023 } 1024 1025 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A, 1026 __m128bh __B, 1027 __m128bh __C) { 1028 return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, 1029 (__v8bf)__C); 1030 } 1031 1032 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1033 _mm_mask_fnmadd_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { 1034 return (__m128bh)__builtin_ia32_selectpbf_128( 1035 (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1036 (__v8bf)__A); 1037 } 1038 1039 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1040 _mm_mask3_fnmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { 
1041 return (__m128bh)__builtin_ia32_selectpbf_128( 1042 (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1043 (__v8bf)__C); 1044 } 1045 1046 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1047 _mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { 1048 return (__m128bh)__builtin_ia32_selectpbf_128( 1049 (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1050 (__v8bf)_mm_setzero_pbh()); 1051 } 1052 1053 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A, 1054 __m128bh __B, 1055 __m128bh __C) { 1056 return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, 1057 -(__v8bf)__C); 1058 } 1059 1060 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1061 _mm_mask_fnmsub_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { 1062 return (__m128bh)__builtin_ia32_selectpbf_128( 1063 (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1064 (__v8bf)__A); 1065 } 1066 1067 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1068 _mm_mask3_fnmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { 1069 return (__m128bh)__builtin_ia32_selectpbf_128( 1070 (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1071 (__v8bf)__C); 1072 } 1073 1074 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 1075 _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { 1076 return (__m128bh)__builtin_ia32_selectpbf_128( 1077 (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), 1078 (__v8bf)_mm_setzero_pbh()); 1079 } 1080 1081 #undef __DEFAULT_FN_ATTRS128 1082 #undef __DEFAULT_FN_ATTRS256 1083 1084 #endif 1085 #endif