zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

sm4intrin.h (8401B) - Raw


      1 /*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 #ifndef __IMMINTRIN_H
     11 #error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
     12 #endif // __IMMINTRIN_H
     13 
     14 #ifndef __SM4INTRIN_H
     15 #define __SM4INTRIN_H
     16 
     17 /// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
     18 ///    operates on independent 128-bit lanes. The calculated results are
     19 ///    returned.
     20 /// \headerfile <immintrin.h>
     21 ///
     22 /// \code
     23 /// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
     24 /// \endcode
     25 ///
     26 /// This intrinsic corresponds to the \c VSM4KEY4 instruction.
     27 ///
     28 /// \param __A
     29 ///    A 128-bit vector of [4 x int].
     30 /// \param __B
     31 ///    A 128-bit vector of [4 x int].
     32 /// \returns
     33 ///    A 128-bit vector of [4 x int].
     34 ///
     35 /// \code{.operation}
     36 /// DEFINE ROL32(dword, n) {
     37 /// 	count := n % 32
     38 /// 	dest := (dword << count) | (dword >> (32-count))
     39 /// 	RETURN dest
     40 /// }
     41 /// DEFINE SBOX_BYTE(dword, i) {
     42 /// 	RETURN sbox[dword.byte[i]]
     43 /// }
     44 /// DEFINE lower_t(dword) {
     45 /// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
     46 /// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
     47 /// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
     48 /// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
     49 /// 	RETURN tmp
     50 /// }
     51 /// DEFINE L_KEY(dword) {
     52 /// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
     53 /// }
     54 /// DEFINE T_KEY(dword) {
     55 /// 	RETURN L_KEY(lower_t(dword))
     56 /// }
     57 /// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
     58 /// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
     59 /// }
     60 /// FOR i:= 0 to 0
     61 /// 	P[0] := __B.xmm[i].dword[0]
     62 /// 	P[1] := __B.xmm[i].dword[1]
     63 /// 	P[2] := __B.xmm[i].dword[2]
     64 /// 	P[3] := __B.xmm[i].dword[3]
     65 /// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
     66 /// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
     67 /// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
     68 /// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
     69 /// 	DEST.xmm[i].dword[0] := C[0]
     70 /// 	DEST.xmm[i].dword[1] := C[1]
     71 /// 	DEST.xmm[i].dword[2] := C[2]
     72 /// 	DEST.xmm[i].dword[3] := C[3]
     73 /// ENDFOR
     74 /// DEST[MAX:128] := 0
     75 /// \endcode
     76 #define _mm_sm4key4_epi32(A, B)                                                \
     77   ((__m128i)__builtin_ia32_vsm4key4128((__v4su)(A), (__v4su)(B)))
     78 
     79 /// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
     80 ///    operates on independent 128-bit lanes. The calculated results are
     81 ///    returned.
     82 /// \headerfile <immintrin.h>
     83 ///
     84 /// \code
     85 /// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
     86 /// \endcode
     87 ///
     88 /// This intrinsic corresponds to the \c VSM4KEY4 instruction.
     89 ///
     90 /// \param __A
     91 ///    A 256-bit vector of [8 x int].
     92 /// \param __B
     93 ///    A 256-bit vector of [8 x int].
     94 /// \returns
     95 ///    A 256-bit vector of [8 x int].
     96 ///
     97 /// \code{.operation}
     98 /// DEFINE ROL32(dword, n) {
     99 /// 	count := n % 32
    100 /// 	dest := (dword << count) | (dword >> (32-count))
    101 /// 	RETURN dest
    102 /// }
    103 /// DEFINE SBOX_BYTE(dword, i) {
    104 /// 	RETURN sbox[dword.byte[i]]
    105 /// }
    106 /// DEFINE lower_t(dword) {
    107 /// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
    108 /// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
    109 /// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
    110 /// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
    111 /// 	RETURN tmp
    112 /// }
    113 /// DEFINE L_KEY(dword) {
    114 /// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
    115 /// }
    116 /// DEFINE T_KEY(dword) {
    117 /// 	RETURN L_KEY(lower_t(dword))
    118 /// }
    119 /// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
    120 /// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
    121 /// }
    122 /// FOR i:= 0 to 1
    123 /// 	P[0] := __B.xmm[i].dword[0]
    124 /// 	P[1] := __B.xmm[i].dword[1]
    125 /// 	P[2] := __B.xmm[i].dword[2]
    126 /// 	P[3] := __B.xmm[i].dword[3]
    127 /// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
    128 /// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
    129 /// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
    130 /// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
    131 /// 	DEST.xmm[i].dword[0] := C[0]
    132 /// 	DEST.xmm[i].dword[1] := C[1]
    133 /// 	DEST.xmm[i].dword[2] := C[2]
    134 /// 	DEST.xmm[i].dword[3] := C[3]
    135 /// ENDFOR
    136 /// DEST[MAX:256] := 0
    137 /// \endcode
    138 #define _mm256_sm4key4_epi32(A, B)                                             \
    139   ((__m256i)__builtin_ia32_vsm4key4256((__v8su)(A), (__v8su)(B)))
    140 
    141 /// This intrinsic performs four rounds of SM4 encryption. The intrinsic
    142 ///    operates on independent 128-bit lanes. The calculated results are
    143 ///    returned.
    144 /// \headerfile <immintrin.h>
    145 ///
    146 /// \code
    147 /// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
    148 /// \endcode
    149 ///
    150 /// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
    151 ///
    152 /// \param __A
    153 ///    A 128-bit vector of [4 x int].
    154 /// \param __B
    155 ///    A 128-bit vector of [4 x int].
    156 /// \returns
    157 ///    A 128-bit vector of [4 x int].
    158 ///
    159 /// \code{.operation}
    160 /// DEFINE ROL32(dword, n) {
    161 /// 	count := n % 32
    162 /// 	dest := (dword << count) | (dword >> (32-count))
    163 /// 	RETURN dest
    164 /// }
    165 /// DEFINE lower_t(dword) {
    166 /// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
    167 /// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
    168 /// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
    169 /// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
    170 /// 	RETURN tmp
    171 /// }
    172 /// DEFINE L_RND(dword) {
    173 /// 	tmp := dword
    174 /// 	tmp := tmp ^ ROL32(dword, 2)
    175 /// 	tmp := tmp ^ ROL32(dword, 10)
    176 /// 	tmp := tmp ^ ROL32(dword, 18)
    177 /// 	tmp := tmp ^ ROL32(dword, 24)
    178 /// 	RETURN tmp
    179 /// }
    180 /// DEFINE T_RND(dword) {
    181 /// 	RETURN L_RND(lower_t(dword))
    182 /// }
    183 /// DEFINE F_RND(X0, X1, X2, X3, round_key) {
    184 /// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
    185 /// }
    186 /// FOR i:= 0 to 0
    187 /// 	P[0] := __B.xmm[i].dword[0]
    188 /// 	P[1] := __B.xmm[i].dword[1]
    189 /// 	P[2] := __B.xmm[i].dword[2]
    190 /// 	P[3] := __B.xmm[i].dword[3]
    191 /// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
    192 /// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
    193 /// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
    194 /// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
    195 /// 	DEST.xmm[i].dword[0] := C[0]
    196 /// 	DEST.xmm[i].dword[1] := C[1]
    197 /// 	DEST.xmm[i].dword[2] := C[2]
    198 /// 	DEST.xmm[i].dword[3] := C[3]
    199 /// ENDFOR
    200 /// DEST[MAX:128] := 0
    201 /// \endcode
    202 #define _mm_sm4rnds4_epi32(A, B)                                               \
    203   ((__m128i)__builtin_ia32_vsm4rnds4128((__v4su)(A), (__v4su)(B)))
    204 
    205 /// This intrinsic performs four rounds of SM4 encryption. The intrinsic
    206 ///    operates on independent 128-bit lanes. The calculated results are
    207 ///    returned.
    208 /// \headerfile <immintrin.h>
    209 ///
    210 /// \code
    211 /// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
    212 /// \endcode
    213 ///
    214 /// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
    215 ///
    216 /// \param __A
    217 ///    A 256-bit vector of [8 x int].
    218 /// \param __B
    219 ///    A 256-bit vector of [8 x int].
    220 /// \returns
    221 ///    A 256-bit vector of [8 x int].
    222 ///
    223 /// \code{.operation}
    224 /// DEFINE ROL32(dword, n) {
    225 /// 	count := n % 32
    226 /// 	dest := (dword << count) | (dword >> (32-count))
    227 /// 	RETURN dest
    228 /// }
    229 /// DEFINE lower_t(dword) {
    230 /// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
    231 /// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
    232 /// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
    233 /// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
    234 /// 	RETURN tmp
    235 /// }
    236 /// DEFINE L_RND(dword) {
    237 /// 	tmp := dword
    238 /// 	tmp := tmp ^ ROL32(dword, 2)
    239 /// 	tmp := tmp ^ ROL32(dword, 10)
    240 /// 	tmp := tmp ^ ROL32(dword, 18)
    241 /// 	tmp := tmp ^ ROL32(dword, 24)
    242 /// 	RETURN tmp
    243 /// }
    244 /// DEFINE T_RND(dword) {
    245 /// 	RETURN L_RND(lower_t(dword))
    246 /// }
    247 /// DEFINE F_RND(X0, X1, X2, X3, round_key) {
    248 /// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
    249 /// }
    250 /// FOR i:= 0 to 1
    251 /// 	P[0] := __B.xmm[i].dword[0]
    252 /// 	P[1] := __B.xmm[i].dword[1]
    253 /// 	P[2] := __B.xmm[i].dword[2]
    254 /// 	P[3] := __B.xmm[i].dword[3]
    255 /// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
    256 /// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
    257 /// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
    258 /// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
    259 /// 	DEST.xmm[i].dword[0] := C[0]
    260 /// 	DEST.xmm[i].dword[1] := C[1]
    261 /// 	DEST.xmm[i].dword[2] := C[2]
    262 /// 	DEST.xmm[i].dword[3] := C[3]
    263 /// ENDFOR
    264 /// DEST[MAX:256] := 0
    265 /// \endcode
    266 #define _mm256_sm4rnds4_epi32(A, B)                                            \
    267   ((__m256i)__builtin_ia32_vsm4rnds4256((__v8su)(A), (__v8su)(B)))
    268 
    269 #endif // __SM4INTRIN_H