Crypto++ 8.8
Free C++ class library of cryptographic schemes
sm4_simd.cpp
// sm4_simd.cpp - written and placed in the public domain by
//                Markku-Juhani O. Saarinen and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AESNI, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
//
// The AES-NI implementation is based on Markku-Juhani O. Saarinen's work
// at https://github.com/mjosaarinen/sm4ni.
//
// ARMv8 support is upcoming.

#include "pch.h"
#include "config.h"

#include "sm4.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE.
// Do so in both sm4.cpp and sm4_simd.cpp.
// #undef CRYPTOPP_AESNI_AVAILABLE

#if (CRYPTOPP_AESNI_AVAILABLE)
# include "adv_simd.h"
# include <emmintrin.h>
# include <tmmintrin.h>
# include <wmmintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SM4_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;

#if (CRYPTOPP_AESNI_AVAILABLE)

template <unsigned int R>
inline __m128i ShiftLeft(const __m128i& val)
{
    return _mm_slli_epi32(val, R);
}

template <unsigned int R>
inline __m128i ShiftRight(const __m128i& val)
{
    return _mm_srli_epi32(val, R);
}

template <unsigned int R>
inline __m128i ShiftLeft64(const __m128i& val)
{
    return _mm_slli_epi64(val, R);
}

template <unsigned int R>
inline __m128i ShiftRight64(const __m128i& val)
{
    return _mm_srli_epi64(val, R);
}

template <unsigned int R>
inline __m128i RotateLeft(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
}

template <unsigned int R>
inline __m128i RotateRight(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
}

template <>
inline __m128i RotateLeft<8>(const __m128i& val)
{
    const __m128i r08 = _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003);
    return _mm_shuffle_epi8(val, r08);
}

template <>
inline __m128i RotateLeft<16>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi32(0x0D0C0F0E, 0x09080B0A, 0x05040706, 0x01000302);
    return _mm_shuffle_epi8(val, mask);
}

template <>
inline __m128i RotateLeft<24>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201);
    return _mm_shuffle_epi8(val, mask);
}

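// The specializations above replace two shifts and an OR with a single
// _mm_shuffle_epi8 byte permutation; each mask byte is the index of the
// source byte for that result position. A minimal sanity check
// (hypothetical test code, not part of this file):
//
//   __m128i x = _mm_set1_epi32(0x12345678);
//   __m128i a = RotateLeft<8>(x);   // PSHUFB specialization
//   __m128i b = _mm_or_si128(       // generic shift-and-OR form
//       _mm_slli_epi32(x, 8), _mm_srli_epi32(x, 24));
//   // both yield 0x34567812 in every 32-bit lane
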
/// \brief Unpack XMM words
/// \tparam IDX the element from each XMM word
/// \param a the first XMM word
/// \param b the second XMM word
/// \param c the third XMM word
/// \param d the fourth XMM word
/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
///   equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_unpacklo_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_unpackhi_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_unpacklo_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_unpackhi_epi64(r1, r2);
}

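// Taken together, UnpackXMM<0..3> over four blocks is the standard two-level
// unpack idiom for a 4x4 transpose of 32-bit words:
//
//   t0 = a[0] b[0] c[0] d[0]
//   t1 = a[1] b[1] c[1] d[1]
//   t2 = a[2] b[2] c[2] d[2]
//   t3 = a[3] b[3] c[3] d[3]
//
// SM4_Encrypt below works on this word-sliced layout, so each round step
// touches the same word of all four blocks at once.
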
/// \brief Unpack an XMM word
/// \tparam IDX the element from each XMM word
/// \param v the first XMM word
/// \details UnpackXMM selects the IDX element from v and returns a concatenation
///   equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
}

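// The single-operand overloads splat 32-bit word IDX of v into all four
// lanes, matching the concatenation described in the doc comment. They give
// RepackXMM below a uniform interface; note the single-block wrappers in
// this file instead zero-pad to four blocks before calling SM4_Encrypt.
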
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}

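// RepackXMM is the same word-level transpose; applying it a second time
// restores the original block layout. At the end of SM4_Encrypt the operands
// are passed in reverse order (t3, t2, t1, t0) because SM4 emits the last
// four state words in reverse.
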
inline void SM4_Encrypt(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys)
{
    // nibble mask
    const __m128i c0f = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

    // reverse the byte order of each 32-bit word
    const __m128i flp = _mm_set_epi32(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);

    // inverse shift rows
    const __m128i shr = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00);

    // Affine transform 1 (low and high nibbles)
    const __m128i m1l = _mm_set_epi32(0xC7C1B4B2, 0x22245157, 0x9197E2E4, 0x74720701);
    const __m128i m1h = _mm_set_epi32(0xF052B91B, 0xF95BB012, 0xE240AB09, 0xEB49A200);

    // Affine transform 2 (low and high nibbles)
    const __m128i m2l = _mm_set_epi32(0xEDD14478, 0x172BBE82, 0x5B67F2CE, 0xA19D0834);
    const __m128i m2h = _mm_set_epi32(0x11CDBE62, 0xCC1063BF, 0xAE7201DD, 0x73AFDC00);

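    // These constants come from Saarinen's sm4ni: the SM4 S-box is evaluated
    // as an affine transform of each nibble (m1l/m1h), the AES S-box applied
    // via AESENCLAST, and a second affine transform (m2l/m2h). The shr
    // permutation pre-applies the inverse of the ShiftRows step that
    // AESENCLAST performs internally.
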
    __m128i t0 = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i t1 = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i t2 = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i t3 = UnpackXMM<3>(block0, block1, block2, block3);

    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    const unsigned int ROUNDS = 32;
    for (unsigned int i = 0; i < ROUNDS; i++)
    {
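        // Broadcast 32-bit subkey i into all four lanes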
        const __m128i k = _mm_shuffle_epi32(_mm_castps_si128(
            _mm_load_ss((const float*)(subkeys+i))), _MM_SHUFFLE(0,0,0,0));

        __m128i x, y;
        x = _mm_xor_si128(t1, _mm_xor_si128(t2, _mm_xor_si128(t3, k)));

        y = _mm_and_si128(x, c0f); // inner affine
        y = _mm_shuffle_epi8(m1l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m1h, x), y);

        x = _mm_shuffle_epi8(x, shr); // inverse ShiftRows
        x = _mm_aesenclast_si128(x, c0f); // AES S-box via AESENCLAST

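        // AESENCLAST also XORs its round key (here c0f), flipping every low
        // nibble; using _mm_andnot_si128 below instead of _mm_and_si128
        // cancels that flip while masking.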
        y = _mm_andnot_si128(x, c0f); // outer affine
        y = _mm_shuffle_epi8(m2l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m2h, x), y);

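        // SM4's linear transform is L(x) = x ^ rol(x,2) ^ rol(x,10) ^
        // rol(x,18) ^ rol(x,24). Below, y = x ^ rol(x,8) ^ rol(x,16); the
        // paired shifts rotate y left by 2, giving rol(x,2) ^ rol(x,10) ^
        // rol(x,18); XORing in x and rol(x,24) completes L.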
        // 4 parallel L linear transforms
        y = _mm_xor_si128(x, RotateLeft<8>(x));
        y = _mm_xor_si128(y, RotateLeft<16>(x));
        y = _mm_xor_si128(ShiftLeft<2>(y), ShiftRight<30>(y));
        x = _mm_xor_si128(x, _mm_xor_si128(y, RotateLeft<24>(x)));

        // rotate the state registers
        x = _mm_xor_si128(x, t0);
        t0 = t1; t1 = t2;
        t2 = t3; t3 = x;
    }

    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    block0 = RepackXMM<0>(t3,t2,t1,t0);
    block1 = RepackXMM<1>(t3,t2,t1,t0);
    block2 = RepackXMM<2>(t3,t2,t1,t0);
    block3 = RepackXMM<3>(t3,t2,t1,t0);
}

inline void SM4_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    SM4_Encrypt(block0, block1, block2, block3, subkeys);
}

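// SM4 decryption is encryption with the subkeys applied in reverse order,
// so the decryption wrappers reuse SM4_Encrypt; the caller supplies the
// reversed key schedule.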
inline void SM4_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    SM4_Encrypt(block0, block1, block2, block3, subkeys);
}

inline void SM4_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int /*rounds*/)
{
    // Pad the single block out to four with zero blocks
    __m128i t1 = _mm_setzero_si128();
    __m128i t2 = _mm_setzero_si128();
    __m128i t3 = _mm_setzero_si128();

    SM4_Encrypt(block0, t1, t2, t3, subkeys);
}

inline void SM4_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int /*rounds*/)
{
    // Pad the single block out to four with zero blocks
    __m128i t1 = _mm_setzero_si128();
    __m128i t2 = _mm_setzero_si128();
    __m128i t3 = _mm_setzero_si128();

    SM4_Encrypt(block0, t1, t2, t3, subkeys);
}

#endif // CRYPTOPP_AESNI_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_AESNI_AVAILABLE)
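// AdvancedProcessBlocks128_4x1_SSE (adv_simd.h) drives the block wrappers
// above: it processes four blocks at a time with a single-block tail, and
// handles the xor/increment flags, so only the round function here needs to
// be SM4-specific.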
size_t SM4_Enc_AdvancedProcessBlocks_AESNI(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(SM4_Enc_Block, SM4_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_AESNI_AVAILABLE

NAMESPACE_END