Crypto++  8.8
Free C++ class library of cryptographic schemes
chacha_avx.cpp
// chacha_avx.cpp - written and placed in the public domain by
//                  Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AVX2 instructions. A separate source file is needed because
// additional CXXFLAGS are required to enable the appropriate
// instruction sets in some build configurations.
//
// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// Here are some relative numbers for ChaCha8:
// * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
// * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
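//
// Illustrative compiler invocations for this translation unit (a sketch
// only; the library's build files supply the real flags):
//   GCC/Clang: g++ -mavx2 -c chacha_avx.cpp
//   MSVC:      cl.exe /arch:AVX2 /c chacha_avx.cpp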

#include "pch.h"
#include "config.h"

#include "chacha.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_AVX_FNAME[] = __FILE__;

// Sun Studio 12.4 is OK; 12.5 and 12.6 produce a compile error.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif

// VS2017 and global optimization bug. Also see
// https://github.com/weidai11/cryptopp/issues/649 and
// https://github.com/weidai11/cryptopp/issues/735. Issue 649
// was reported against AES, but the same problem applies here.
// Issue 735 is the ChaCha AVX2 cut-in, where the bug surfaced again.
#if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)
# ifndef CRYPTOPP_DEBUG
#  pragma optimize("", off)
#  pragma optimize("ts", on)
# endif
#endif

// The data is aligned, but Clang issues a warning based on the type,
// not the actual alignment of the variable and data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
#endif

ANONYMOUS_NAMESPACE_BEGIN

#if (CRYPTOPP_AVX2_AVAILABLE)

template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}
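
// For reference, each 32-bit lane above performs the familiar scalar
// rotate (an equivalent C sketch, not part of the build):
//
//   word32 rotl32(word32 v, unsigned int r) {
//       return word32((v << r) | (v >> (32 - r)));
//   }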

template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
                                         14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm256_shuffle_epi8(val, mask);
}

template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
                                         13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm256_shuffle_epi8(val, mask);
}
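
// The specializations work because rotations by a multiple of 8 are pure
// byte permutations, which _mm256_shuffle_epi8 performs in one instruction.
// Worked example: rotating the lane value 0x01020304 left by 8 gives
// 0x02030401. In little-endian byte order, result byte i is source byte
// mask[i], which is exactly the permutation the RotateLeft<8> mask encodes.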

#endif // CRYPTOPP_AVX2_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_AVX2_AVAILABLE)

void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const __m256i state0 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
    const __m256i state1 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
    const __m256i state2 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
    const __m256i state3 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));

    const word32 C = 0xFFFFFFFFu - state[12];
    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);
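
    // The eight blocks produced per call use block counters state[12]+0
    // through state[12]+7. C counts how many increments remain before the
    // low counter word wraps, so each (C < n) term injects a carry into
    // the next counter word for lanes whose counter overflows 32 bits.
    // For example, if state[12] == 0xFFFFFFFE then C == 1 and only the
    // lanes adding 2 through 7 carry.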

    __m256i X0_0 = state0;
    __m256i X0_1 = state1;
    __m256i X0_2 = state2;
    __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

    __m256i X1_0 = state0;
    __m256i X1_1 = state1;
    __m256i X1_2 = state2;
    __m256i X1_3 = _mm256_add_epi32(state3, CTR1);

    __m256i X2_0 = state0;
    __m256i X2_1 = state1;
    __m256i X2_2 = state2;
    __m256i X2_3 = _mm256_add_epi32(state3, CTR2);

    __m256i X3_0 = state0;
    __m256i X3_1 = state1;
    __m256i X3_2 = state2;
    __m256i X3_3 = _mm256_add_epi32(state3, CTR3);

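    // Each loop iteration below performs one ChaCha "double round" (a
    // column round followed by a diagonal round) on eight blocks at once.
    // Per word the quarter round is the usual sequence (scalar sketch):
    //
    //   a += b; d ^= a; d = rotl(d, 16);
    //   c += d; b ^= c; b = rotl(b, 12);
    //   a += b; d ^= a; d = rotl(d,  8);
    //   c += d; b ^= c; b = rotl(b,  7);
    //
    // The _mm256_shuffle_epi32 calls rotate rows 1..3 into diagonal order
    // for the second half of the iteration and then back into column order.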
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }

    X0_0 = _mm256_add_epi32(X0_0, state0);
    X0_1 = _mm256_add_epi32(X0_1, state1);
    X0_2 = _mm256_add_epi32(X0_2, state2);
    X0_3 = _mm256_add_epi32(X0_3, state3);
    X0_3 = _mm256_add_epi32(X0_3, CTR0);

    X1_0 = _mm256_add_epi32(X1_0, state0);
    X1_1 = _mm256_add_epi32(X1_1, state1);
    X1_2 = _mm256_add_epi32(X1_2, state2);
    X1_3 = _mm256_add_epi32(X1_3, state3);
    X1_3 = _mm256_add_epi32(X1_3, CTR1);

    X2_0 = _mm256_add_epi32(X2_0, state0);
    X2_1 = _mm256_add_epi32(X2_1, state1);
    X2_2 = _mm256_add_epi32(X2_2, state2);
    X2_3 = _mm256_add_epi32(X2_3, state3);
    X2_3 = _mm256_add_epi32(X2_3, CTR2);

    X3_0 = _mm256_add_epi32(X3_0, state0);
    X3_1 = _mm256_add_epi32(X3_1, state1);
    X3_2 = _mm256_add_epi32(X3_2, state2);
    X3_3 = _mm256_add_epi32(X3_3, state3);
    X3_3 = _mm256_add_epi32(X3_3, CTR3);

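    // Each Xi_j register holds one ChaCha row for two different blocks, one
    // per 128-bit lane. The _mm256_permute2x128_si256 calls below gather the
    // matching lanes: control 1 + (3 << 4) selects the high lanes of both
    // sources (blocks with counter offsets 0..3), and 0 + (2 << 4) selects
    // the low lanes (offsets 4..7). When input is non-NULL the keystream is
    // XORed into it; otherwise the raw keystream is written to output.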
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
                _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
    }

    // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
    _mm256_zeroupper();
}
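
// Minimal usage sketch (hypothetical caller; the real dispatch lives in
// chacha.cpp). One call consumes a 16-word ChaCha state and writes
// 8 blocks x 64 bytes = 512 bytes. A NULL input yields raw keystream:
//
//   word32 state[16] = { /* constants, key, counter, nonce */ };
//   byte keystream[512];
//   ChaCha_OperateKeystream_AVX2(state, NULLPTR, keystream, 20);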

#endif // CRYPTOPP_AVX2_AVAILABLE

NAMESPACE_END