zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

fmaintrin.h (29337B) - Raw


      1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
     12 #endif
     13 
     14 #ifndef __FMAINTRIN_H
     15 #define __FMAINTRIN_H
     16 
     17 /* Define the default attributes for the functions in this file.
           Both variants request always-inline, nodebug and the "fma" target
           feature; they differ only in the minimum vector width they declare
           (128 bits for the __m128/__m128d intrinsics, 256 bits for the
           __m256/__m256d ones). */
     18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
     19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
     20 
     21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
     22 ///    For each element, computes <c> (__A * __B) + __C </c>.
     23 ///
     24 /// \headerfile <immintrin.h>
     25 ///
     26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
     27 ///
     28 /// \param __A
     29 ///    A 128-bit vector of [4 x float] containing the multiplicand.
     30 /// \param __B
     31 ///    A 128-bit vector of [4 x float] containing the multiplier.
     32 /// \param __C
     33 ///    A 128-bit vector of [4 x float] containing the addend.
     34 /// \returns A 128-bit vector of [4 x float] containing the result.
     35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
     36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
     37 {
          /* Plain pass-through: operands are cast to __v4sf for the builtin and
             the result is cast back to __m128. */
     38   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
     39 }
     40 
     41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
     42 ///    For each element, computes <c> (__A * __B) + __C </c>.
     43 ///
     44 /// \headerfile <immintrin.h>
     45 ///
     46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
     47 ///
     48 /// \param __A
     49 ///    A 128-bit vector of [2 x double] containing the multiplicand.
     50 /// \param __B
     51 ///    A 128-bit vector of [2 x double] containing the multiplier.
     52 /// \param __C
     53 ///    A 128-bit vector of [2 x double] containing the addend.
     54 /// \returns A 128-bit vector of [2 x double] containing the result.
     55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
     56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
     57 {
          /* Plain pass-through: operands are cast to __v2df for the builtin and
             the result is cast back to __m128d. */
     58   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
     59 }
     60 
     61 /// Computes a scalar multiply-add of the single-precision values in the
     62 ///    low 32 bits of 128-bit vectors of [4 x float].
     63 ///
     64 /// \code{.operation}
     65 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
     66 /// result[127:32] = __A[127:32]
     67 /// \endcode
     68 ///
     69 /// \headerfile <immintrin.h>
     70 ///
     71 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
     72 ///
     73 /// \param __A
     74 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
     75 ///    32 bits.
     76 /// \param __B
     77 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
     78 ///    32 bits.
     79 /// \param __C
     80 ///    A 128-bit vector of [4 x float] containing the addend in the low
     81 ///    32 bits.
     82 /// \returns A 128-bit vector of [4 x float] containing the result in the low
     83 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
     84 static __inline__ __m128 __DEFAULT_FN_ATTRS128
     85 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
     86 {
          /* Scalar (ss3) form of the builtin. Per the description above, only
             the low 32 bits are computed; the upper bits are taken from __A. */
     87   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
     88 }
     89 
     90 /// Computes a scalar multiply-add of the double-precision values in the
     91 ///    low 64 bits of 128-bit vectors of [2 x double].
     92 ///
     93 /// \code{.operation}
     94 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
     95 /// result[127:64] = __A[127:64]
     96 /// \endcode
     97 ///
     98 /// \headerfile <immintrin.h>
     99 ///
    100 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
    101 ///
    102 /// \param __A
    103 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
    104 ///    64 bits.
    105 /// \param __B
    106 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
    107 ///    64 bits.
    108 /// \param __C
    109 ///    A 128-bit vector of [2 x double] containing the addend in the low
    110 ///    64 bits.
    111 /// \returns A 128-bit vector of [2 x double] containing the result in the low
    112 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
    113 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    114 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
    115 {
          /* Scalar (sd3) form of the builtin. Per the description above, only
             the low 64 bits are computed; the upper bits are taken from __A. */
    116   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
    117 }
    118 
    119 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
    120 ///    For each element, computes <c> (__A * __B) - __C </c>.
    121 ///
    122 /// \headerfile <immintrin.h>
    123 ///
    124 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
    125 ///
    126 /// \param __A
    127 ///    A 128-bit vector of [4 x float] containing the multiplicand.
    128 /// \param __B
    129 ///    A 128-bit vector of [4 x float] containing the multiplier.
    130 /// \param __C
    131 ///    A 128-bit vector of [4 x float] containing the subtrahend.
    132 /// \returns A 128-bit vector of [4 x float] containing the result.
    133 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    134 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
    135 {
          /* Subtraction is expressed by negating the addend: the multiply-add
             builtin is called with -__C, giving (__A * __B) - __C. */
    136   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
    137 }
    138 
    139 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
    140 ///    For each element, computes <c> (__A * __B) - __C </c>.
    141 ///
    142 /// \headerfile <immintrin.h>
    143 ///
    144 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
    145 ///
    146 /// \param __A
    147 ///    A 128-bit vector of [2 x double] containing the multiplicand.
    148 /// \param __B
    149 ///    A 128-bit vector of [2 x double] containing the multiplier.
    150 /// \param __C
    151 ///    A 128-bit vector of [2 x double] containing the subtrahend.
    152 /// \returns A 128-bit vector of [2 x double] containing the result.
    153 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    154 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
    155 {
          /* Subtraction is expressed by negating the addend: the multiply-add
             builtin is called with -__C, giving (__A * __B) - __C. */
    156   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
    157 }
    158 
    159 /// Computes a scalar multiply-subtract of the single-precision values in
    160 ///    the low 32 bits of 128-bit vectors of [4 x float].
    161 ///
    162 /// \code{.operation}
    163 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
    164 /// result[127:32] = __A[127:32]
    165 /// \endcode
    166 ///
    167 /// \headerfile <immintrin.h>
    168 ///
    169 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
    170 ///
    171 /// \param __A
    172 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
    173 ///    32 bits.
    174 /// \param __B
    175 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
    176 ///    32 bits.
    177 /// \param __C
    178 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
    179 ///    32 bits.
    180 /// \returns A 128-bit vector of [4 x float] containing the result in the low
    181 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
    182 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    183 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
    184 {
          /* The whole __C vector is negated, but per the description above only
             __C[31:0] contributes to the result, so this yields
             (__A[31:0] * __B[31:0]) - __C[31:0] in the low element. */
    185   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
    186 }
    187 
    188 /// Computes a scalar multiply-subtract of the double-precision values in
    189 ///    the low 64 bits of 128-bit vectors of [2 x double].
    190 ///
    191 /// \code{.operation}
    192 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
    193 /// result[127:64] = __A[127:64]
    194 /// \endcode
    195 ///
    196 /// \headerfile <immintrin.h>
    197 ///
    198 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
    199 ///
    200 /// \param __A
    201 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
    202 ///    64 bits.
    203 /// \param __B
    204 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
    205 ///    64 bits.
    206 /// \param __C
    207 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
    208 ///    64 bits.
    209 /// \returns A 128-bit vector of [2 x double] containing the result in the low
    210 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
    211 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    212 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
    213 {
          /* The whole __C vector is negated, but per the description above only
             __C[63:0] contributes to the result, so this yields
             (__A[63:0] * __B[63:0]) - __C[63:0] in the low element. */
    214   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
    215 }
    216 
    217 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
    218 ///    For each element, computes <c> -(__A * __B) + __C </c>.
    219 ///
    220 /// \headerfile <immintrin.h>
    221 ///
    222 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
    223 ///
    224 /// \param __A
    225 ///    A 128-bit vector of [4 x float] containing the multiplicand.
    226 /// \param __B
    227 ///    A 128-bit vector of [4 x float] containing the multiplier.
    228 /// \param __C
    229 ///    A 128-bit vector of [4 x float] containing the addend.
    230 /// \returns A 128-bit vector of [4 x float] containing the result.
    231 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    232 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
    233 {
          /* Negating __A negates every product, so the multiply-add builtin
             yields -(__A * __B) + __C. */
    234   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
    235 }
    236 
    237 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
    238 ///    For each element, computes <c> -(__A * __B) + __C </c>.
    239 ///
    240 /// \headerfile <immintrin.h>
    241 ///
    242 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
    243 ///
    244 /// \param __A
    245 ///    A 128-bit vector of [2 x double] containing the multiplicand.
    246 /// \param __B
    247 ///    A 128-bit vector of [2 x double] containing the multiplier.
    248 /// \param __C
    249 ///    A 128-bit vector of [2 x double] containing the addend.
    250 /// \returns A 128-bit vector of [2 x double] containing the result.
    251 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    252 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
    253 {
          /* Negating __A negates every product, so the multiply-add builtin
             yields -(__A * __B) + __C. */
    254   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
    255 }
    256 
    257 /// Computes a scalar negated multiply-add of the single-precision values in
    258 ///    the low 32 bits of 128-bit vectors of [4 x float].
    259 ///
    260 /// \code{.operation}
    261 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
    262 /// result[127:32] = __A[127:32]
    263 /// \endcode
    264 ///
    265 /// \headerfile <immintrin.h>
    266 ///
    267 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
    268 ///
    269 /// \param __A
    270 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
    271 ///    32 bits.
    272 /// \param __B
    273 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
    274 ///    32 bits.
    275 /// \param __C
    276 ///    A 128-bit vector of [4 x float] containing the addend in the low
    277 ///    32 bits.
    278 /// \returns A 128-bit vector of [4 x float] containing the result in the low
    279 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
    280 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    281 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
    282 {
          /* Unlike the packed form, the product is negated via __B, not __A.
             The documented result copies __A[127:32] into the upper bits, so
             __A is presumably left untouched to keep those bits intact. */
    283   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
    284 }
    285 
    286 /// Computes a scalar negated multiply-add of the double-precision values
    287 ///    in the low 64 bits of 128-bit vectors of [2 x double].
    288 ///
    289 /// \code{.operation}
    290 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
    291 /// result[127:64] = __A[127:64]
    292 /// \endcode
    293 ///
    294 /// \headerfile <immintrin.h>
    295 ///
    296 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
    297 ///
    298 /// \param __A
    299 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
    300 ///    64 bits.
    301 /// \param __B
    302 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
    303 ///    64 bits.
    304 /// \param __C
    305 ///    A 128-bit vector of [2 x double] containing the addend in the low
    306 ///    64 bits.
    307 /// \returns A 128-bit vector of [2 x double] containing the result in the low
    308 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
    309 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    310 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
    311 {
          /* Unlike the packed form, the product is negated via __B, not __A.
             The documented result copies __A[127:64] into the upper bits, so
             __A is presumably left untouched to keep those bits intact. */
    312   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
    313 }
    314 
    315 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
    316 ///    For each element, computes <c> -(__A * __B) - __C </c>.
    317 ///
    318 /// \headerfile <immintrin.h>
    319 ///
    320 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
    321 ///
    322 /// \param __A
    323 ///    A 128-bit vector of [4 x float] containing the multiplicand.
    324 /// \param __B
    325 ///    A 128-bit vector of [4 x float] containing the multiplier.
    326 /// \param __C
    327 ///    A 128-bit vector of [4 x float] containing the subtrahend.
    328 /// \returns A 128-bit vector of [4 x float] containing the result.
    329 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    330 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
    331 {
          /* Both the product (via -__A) and the addend (-__C) are negated, so
             the multiply-add builtin yields -(__A * __B) - __C. */
    332   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
    333 }
    334 
    335 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
    336 ///    For each element, computes <c> -(__A * __B) - __C </c>.
    337 ///
    338 /// \headerfile <immintrin.h>
    339 ///
    340 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
    341 ///
    342 /// \param __A
    343 ///    A 128-bit vector of [2 x double] containing the multiplicand.
    344 /// \param __B
    345 ///    A 128-bit vector of [2 x double] containing the multiplier.
    346 /// \param __C
    347 ///    A 128-bit vector of [2 x double] containing the subtrahend.
    348 /// \returns A 128-bit vector of [2 x double] containing the result.
    349 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    350 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
    351 {
          /* Both the product (via -__A) and the addend (-__C) are negated, so
             the multiply-add builtin yields -(__A * __B) - __C. */
    352   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
    353 }
    354 
    355 /// Computes a scalar negated multiply-subtract of the single-precision
    356 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
    357 ///
    358 /// \code{.operation}
    359 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
    360 /// result[127:32] = __A[127:32]
    361 /// \endcode
    362 ///
    363 /// \headerfile <immintrin.h>
    364 ///
    365 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
    366 ///
    367 /// \param __A
    368 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
    369 ///    32 bits.
    370 /// \param __B
    371 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
    372 ///    32 bits.
    373 /// \param __C
    374 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
    375 ///    32 bits.
    376 /// \returns A 128-bit vector of [4 x float] containing the result in the low
    377 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
    378 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    379 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
    380 {
          /* The product is negated via __B (not __A, whose upper bits are
             documented to be copied into the result and are presumably kept
             intact for that reason); -__C turns the add into a subtract. */
    381   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
    382 }
    383 
    384 /// Computes a scalar negated multiply-subtract of the double-precision
    385 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
    386 ///
    387 /// \code{.operation}
    388 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
    389 /// result[127:64] = __A[127:64]
    390 /// \endcode
    391 ///
    392 /// \headerfile <immintrin.h>
    393 ///
    394 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
    395 ///
    396 /// \param __A
    397 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
    398 ///    64 bits.
    399 /// \param __B
    400 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
    401 ///    64 bits.
    402 /// \param __C
    403 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
    404 ///    64 bits.
    405 /// \returns A 128-bit vector of [2 x double] containing the result in the low
    406 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
    407 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    408 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
    409 {
          /* The product is negated via __B (not __A, whose upper bits are
             documented to be copied into the result and are presumably kept
             intact for that reason); -__C turns the add into a subtract. */
    410   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
    411 }
    412 
    413 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
    414 ///    [4 x float].
    415 ///
    416 /// \code{.operation}
    417 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
    418 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
    419 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
    420 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
    421 /// \endcode
    422 ///
    423 /// \headerfile <immintrin.h>
    424 ///
    425 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
    426 ///
    427 /// \param __A
    428 ///    A 128-bit vector of [4 x float] containing the multiplicand.
    429 /// \param __B
    430 ///    A 128-bit vector of [4 x float] containing the multiplier.
    431 /// \param __C
    432 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
    433 /// \returns A 128-bit vector of [4 x float] containing the result.
    434 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    435 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
    436 {
          /* Operands are passed through unchanged; the alternating
             subtract/add pattern described above comes from the builtin. */
    437   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
    438 }
    439 
    440 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
    441 ///    [2 x double].
    442 ///
    443 /// \code{.operation}
    444 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
    445 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
    446 /// \endcode
    447 ///
    448 /// \headerfile <immintrin.h>
    449 ///
    450 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
    451 ///
    452 /// \param __A
    453 ///    A 128-bit vector of [2 x double] containing the multiplicand.
    454 /// \param __B
    455 ///    A 128-bit vector of [2 x double] containing the multiplier.
    456 /// \param __C
    457 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
    458 /// \returns A 128-bit vector of [2 x double] containing the result.
    459 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    460 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
    461 {
          /* Operands are passed through unchanged; the alternating
             subtract/add pattern described above comes from the builtin. */
    462   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
    463 }
    464 
    465 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
    466 ///    [4 x float].
    467 ///
    468 /// \code{.operation}
    469 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
    470 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
    471 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
    472 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
    473 /// \endcode
    474 ///
    475 /// \headerfile <immintrin.h>
    476 ///
    477 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
    478 ///
    479 /// \param __A
    480 ///    A 128-bit vector of [4 x float] containing the multiplicand.
    481 /// \param __B
    482 ///    A 128-bit vector of [4 x float] containing the multiplier.
    483 /// \param __C
    484 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
    485 /// \returns A 128-bit vector of [4 x float] containing the result.
    486 static __inline__ __m128 __DEFAULT_FN_ATTRS128
    487 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
    488 {
          /* Reuses the same builtin as _mm_fmaddsub_ps with __C negated, which
             swaps the per-element add and subtract: even-numbered elements
             get + __C, odd-numbered elements get - __C. */
    489   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
    490 }
    491 
    492 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
    493 ///    [2 x double].
    494 ///
    495 /// \code{.operation}
    496 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
    497 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
    498 /// \endcode
    499 ///
    500 /// \headerfile <immintrin.h>
    501 ///
    502 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
    503 ///
    504 /// \param __A
    505 ///    A 128-bit vector of [2 x double] containing the multiplicand.
    506 /// \param __B
    507 ///    A 128-bit vector of [2 x double] containing the multiplier.
    508 /// \param __C
    509 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
    510 /// \returns A 128-bit vector of [2 x double] containing the result.
    511 static __inline__ __m128d __DEFAULT_FN_ATTRS128
    512 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
    513 {
          /* Reuses the same builtin as _mm_fmaddsub_pd with __C negated, which
             swaps the per-element add and subtract: the low element gets
             + __C, the high element gets - __C. */
    514   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
    515 }
    516 
    517 /// Computes a multiply-add of 256-bit vectors of [8 x float].
    518 ///    For each element, computes <c> (__A * __B) + __C </c>.
    519 ///
    520 /// \headerfile <immintrin.h>
    521 ///
    522 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
    523 ///
    524 /// \param __A
    525 ///    A 256-bit vector of [8 x float] containing the multiplicand.
    526 /// \param __B
    527 ///    A 256-bit vector of [8 x float] containing the multiplier.
    528 /// \param __C
    529 ///    A 256-bit vector of [8 x float] containing the addend.
    530 /// \returns A 256-bit vector of [8 x float] containing the result.
    531 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    532 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
    533 {
          /* Plain pass-through: operands are cast to __v8sf for the 256-bit
             builtin and the result is cast back to __m256. */
    534   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
    535 }
    536 
    537 /// Computes a multiply-add of 256-bit vectors of [4 x double].
    538 ///    For each element, computes <c> (__A * __B) + __C </c>.
    539 ///
    540 /// \headerfile <immintrin.h>
    541 ///
    542 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
    543 ///
    544 /// \param __A
    545 ///    A 256-bit vector of [4 x double] containing the multiplicand.
    546 /// \param __B
    547 ///    A 256-bit vector of [4 x double] containing the multiplier.
    548 /// \param __C
    549 ///    A 256-bit vector of [4 x double] containing the addend.
    550 /// \returns A 256-bit vector of [4 x double] containing the result.
    551 static __inline__ __m256d __DEFAULT_FN_ATTRS256
    552 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
    553 {
          /* Plain pass-through: operands are cast to __v4df for the 256-bit
             builtin and the result is cast back to __m256d. */
    554   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
    555 }
    556 
    557 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
    558 ///    For each element, computes <c> (__A * __B) - __C </c>.
    559 ///
    560 /// \headerfile <immintrin.h>
    561 ///
    562 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
    563 ///
    564 /// \param __A
    565 ///    A 256-bit vector of [8 x float] containing the multiplicand.
    566 /// \param __B
    567 ///    A 256-bit vector of [8 x float] containing the multiplier.
    568 /// \param __C
    569 ///    A 256-bit vector of [8 x float] containing the subtrahend.
    570 /// \returns A 256-bit vector of [8 x float] containing the result.
    571 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    572 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
    573 {
          /* Subtraction is expressed by negating the addend: the multiply-add
             builtin is called with -__C, giving (__A * __B) - __C. */
    574   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
    575 }
    576 
    577 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
    578 ///    For each element, computes <c> (__A * __B) - __C </c>.
    579 ///
    580 /// \headerfile <immintrin.h>
    581 ///
    582 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
    583 ///
    584 /// \param __A
    585 ///    A 256-bit vector of [4 x double] containing the multiplicand.
    586 /// \param __B
    587 ///    A 256-bit vector of [4 x double] containing the multiplier.
    588 /// \param __C
    589 ///    A 256-bit vector of [4 x double] containing the subtrahend.
    590 /// \returns A 256-bit vector of [4 x double] containing the result.
    591 static __inline__ __m256d __DEFAULT_FN_ATTRS256
    592 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
    593 {
          /* Subtraction is expressed by negating the addend: the multiply-add
             builtin is called with -__C, giving (__A * __B) - __C. */
    594   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
    595 }
    596 
    597 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
    598 ///    For each element, computes <c> -(__A * __B) + __C </c>.
    599 ///
    600 /// \headerfile <immintrin.h>
    601 ///
    602 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
    603 ///
    604 /// \param __A
    605 ///    A 256-bit vector of [8 x float] containing the multiplicand.
    606 /// \param __B
    607 ///    A 256-bit vector of [8 x float] containing the multiplier.
    608 /// \param __C
    609 ///    A 256-bit vector of [8 x float] containing the addend.
    610 /// \returns A 256-bit vector of [8 x float] containing the result.
    611 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    612 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
    613 {
          /* Negating __A negates every product, so the multiply-add builtin
             yields -(__A * __B) + __C. */
    614   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
    615 }
    616 
    617 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
    618 ///    For each element, computes <c> -(__A * __B) + __C </c>.
    619 ///
    620 /// \headerfile <immintrin.h>
    621 ///
    622 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
    623 ///
    624 /// \param __A
    625 ///    A 256-bit vector of [4 x double] containing the multiplicand.
    626 /// \param __B
    627 ///    A 256-bit vector of [4 x double] containing the multiplier.
    628 /// \param __C
    629 ///    A 256-bit vector of [4 x double] containing the addend.
    630 /// \returns A 256-bit vector of [4 x double] containing the result.
    631 static __inline__ __m256d __DEFAULT_FN_ATTRS256
    632 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
    633 {
          /* Negating __A negates every product, so the multiply-add builtin
             yields -(__A * __B) + __C. */
    634   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
    635 }
    636 
    637 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
    638 ///    For each element, computes <c> -(__A * __B) - __C </c>.
    639 ///
    640 /// \headerfile <immintrin.h>
    641 ///
    642 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
    643 ///
    644 /// \param __A
    645 ///    A 256-bit vector of [8 x float] containing the multiplicand.
    646 /// \param __B
    647 ///    A 256-bit vector of [8 x float] containing the multiplier.
    648 /// \param __C
    649 ///    A 256-bit vector of [8 x float] containing the subtrahend.
    650 /// \returns A 256-bit vector of [8 x float] containing the result.
    651 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    652 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
    653 {
          /* Both the product (via -__A) and the addend (-__C) are negated, so
             the multiply-add builtin yields -(__A * __B) - __C. */
    654   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
    655 }
    656 
    657 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
    658 ///    For each element, computes <c> -(__A * __B) - __C </c>.
    659 ///
    660 /// \headerfile <immintrin.h>
    661 ///
    662 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
    663 ///
    664 /// \param __A
    665 ///    A 256-bit vector of [4 x double] containing the multiplicand.
    666 /// \param __B
    667 ///    A 256-bit vector of [4 x double] containing the multiplier.
    668 /// \param __C
    669 ///    A 256-bit vector of [4 x double] containing the subtrahend.
    670 /// \returns A 256-bit vector of [4 x double] containing the result.
    671 static __inline__ __m256d __DEFAULT_FN_ATTRS256
    672 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
    673 {
          /* Both the product (via -__A) and the addend (-__C) are negated, so
             the multiply-add builtin yields -(__A * __B) - __C. */
    674   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
    675 }
    676 
    677 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
    678 ///    [8 x float].
    679 ///
    680 /// \code{.operation}
    681 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
    682 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
    683 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
    684 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
    685 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
    686 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
    687 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
    688 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
    689 /// \endcode
    690 ///
    691 /// \headerfile <immintrin.h>
    692 ///
    693 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
    694 ///
    695 /// \param __A
    696 ///    A 256-bit vector of [8 x float] containing the multiplicand.
    697 /// \param __B
    698 ///    A 256-bit vector of [8 x float] containing the multiplier.
    699 /// \param __C
    700 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
    701 /// \returns A 256-bit vector of [8 x float] containing the result.
    702 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    703 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
    704 {
          /* Operands are passed through unchanged; the alternating
             subtract/add pattern described above comes from the builtin. */
    705   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
    706 }
    707 
    708 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
    709 ///    [4 x double].
    710 ///
    711 /// \code{.operation}
    712 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
    713 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
    714 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
    715 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
    716 /// \endcode
    717 ///
    718 /// \headerfile <immintrin.h>
    719 ///
    720 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
    721 ///
    722 /// \param __A
    723 ///    A 256-bit vector of [4 x double] containing the multiplicand.
    724 /// \param __B
    725 ///    A 256-bit vector of [4 x double] containing the multiplier.
    726 /// \param __C
    727 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
    728 /// \returns A 256-bit vector of [4 x double] containing the result.
    729 static __inline__ __m256d __DEFAULT_FN_ATTRS256
    730 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
    731 {
          /* Operands are passed through unchanged; the alternating
             subtract/add pattern described above comes from the builtin. */
    732   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
    733 }
    734 
    735 /// Computes a vector multiply with alternating add/subtract of 256-bit
    736 ///    vectors of [8 x float].
    737 ///
    738 /// \code{.operation}
    739 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
    740 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
    741 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
    742 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
    743 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
    744 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
    745 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
    746 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
    747 /// \endcode
    748 ///
    749 /// \headerfile <immintrin.h>
    750 ///
    751 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
    752 ///
    753 /// \param __A
    754 ///    A 256-bit vector of [8 x float] containing the multiplicand.
    755 /// \param __B
    756 ///    A 256-bit vector of [8 x float] containing the multiplier.
    757 /// \param __C
    758 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
    759 /// \returns A 256-bit vector of [8 x float] containing the result.
    760 static __inline__ __m256 __DEFAULT_FN_ATTRS256
    761 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
    762 {
          /* Reuses the same builtin as _mm256_fmaddsub_ps with __C negated,
             which swaps the per-element add and subtract: even-numbered
             elements get + __C, odd-numbered elements get - __C. */
    763   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
    764 }
    765 
    766 /// Computes a vector multiply with alternating add/subtract of 256-bit
    767 ///    vectors of [4 x double].
    768 ///
    769 /// \code{.operation}
    770 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
    771 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
    772 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
    773 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
    774 /// \endcode
    775 ///
    776 /// \headerfile <immintrin.h>
    777 ///
    778 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
    779 ///
    780 /// \param __A
    781 ///    A 256-bit vector of [4 x double] containing the multiplicand.
    782 /// \param __B
    783 ///    A 256-bit vector of [4 x double] containing the multiplier.
    784 /// \param __C
    785 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
    786 /// \returns A 256-bit vector of [4 x double] containing the result.
    787 static __inline__ __m256d __DEFAULT_FN_ATTRS256
    788 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
    789 {
          /* Reuses the same builtin as _mm256_fmaddsub_pd with __C negated,
             which swaps the per-element add and subtract: even-numbered
             elements get + __C, odd-numbered elements get - __C. */
    790   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
    791 }
    792 
    793 #undef __DEFAULT_FN_ATTRS128
    794 #undef __DEFAULT_FN_ATTRS256
    795 
    796 #endif /* __FMAINTRIN_H */