Crypto++  8.8
Free C++ class library of cryptographic schemes
adv_simd.h
Go to the documentation of this file.
1 // adv_simd.h - written and placed in the public domain by Jeffrey Walton
2 
3 /// \file adv_simd.h
4 /// \brief Template for AdvancedProcessBlocks and SIMD processing
5 
6 // The SIMD based implementations for ciphers that use SSE, NEON and Power7
7 // have a common pattern. Namely, they have a specialized implementation of
8 // AdvancedProcessBlocks which processes multiple blocks using hardware
9 // acceleration. After several implementations we noticed a lot of copy and
10 // paste occurring. adv_simd.h provides a template to avoid the copy and paste.
11 //
12 // There are 6 templates provided in this file. The number following the
13 // function name, 128, is the block size in bits. The name following the
14 // block size is the arrangement and acceleration. For example 4x1_SSE means
15 // Intel SSE using two encrypt (or decrypt) functions: one that operates on
16 // 4 SIMD words, and one that operates on 1 SIMD word.
17 //
18 // * AdvancedProcessBlocks128_4x1_SSE
19 // * AdvancedProcessBlocks128_6x2_SSE
20 // * AdvancedProcessBlocks128_4x1_NEON
21 // * AdvancedProcessBlocks128_6x1_NEON
22 // * AdvancedProcessBlocks128_4x1_ALTIVEC
23 // * AdvancedProcessBlocks128_6x1_ALTIVEC
24 //
25 // If an arrangement ends in 2, like 6x2, then the template will handle the
26 // single block case by padding with 0's and using the two SIMD word
27 // function. This happens at most one time when processing multiple blocks.
28 // The extra processing of a zero block is trivial and worth the tradeoff.
29 //
30 // The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
31 // of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
32 // results in a failed link due to the const/non-const mismatch.
33 //
34 // In July 2020 the library stopped using 64-bit block version of
35 // AdvancedProcessBlocks. Testing showed unreliable results and failed
36 // self tests on occasion. Also see Issue 945 and
37 // https://github.com/weidai11/cryptopp/commit/dd7598e638bb.
38 
39 #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
40 #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
41 
42 #include "config.h"
43 #include "misc.h"
44 #include "stdcpp.h"
45 
46 #if (CRYPTOPP_ARM_NEON_HEADER)
47 # include <arm_neon.h>
48 #endif
49 
50 #if (CRYPTOPP_ARM_ACLE_HEADER)
51 # include <stdint.h>
52 # include <arm_acle.h>
53 #endif
54 
55 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
56 # include <emmintrin.h>
57 # include <xmmintrin.h>
58 #endif
59 
60 // SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
61 #if (CRYPTOPP_SSSE3_AVAILABLE)
62 # include <emmintrin.h>
63 # include <pmmintrin.h>
64 # include <xmmintrin.h>
65 #endif
66 
67 #if defined(__ALTIVEC__)
68 # include "ppc_simd.h"
69 #endif
70 
71 // ************************ All block ciphers *********************** //
72 
73 ANONYMOUS_NAMESPACE_BEGIN
74 
75 using CryptoPP::BlockTransformation;
76 
77 CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
78 CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
79 CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
80 CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
81 CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);
82 
83 ANONYMOUS_NAMESPACE_END
84 
85 // *************************** ARM NEON ************************** //
86 
87 #if (CRYPTOPP_ARM_NEON_AVAILABLE) || (CRYPTOPP_ARM_ASIMD_AVAILABLE) || \
88  defined(CRYPTOPP_DOXYGEN_PROCESSING)
89 NAMESPACE_BEGIN(CryptoPP)
90 
91 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
92 /// \tparam F1 function to process 1 128-bit block
93 /// \tparam F6 function to process 6 128-bit blocks
94 /// \tparam W word type of the subkey table
95 /// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 2 NEON SIMD words
96 /// at a time.
97 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
98 /// same word type.
99 template <typename F1, typename F6, typename W>
100 inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
101  const W *subKeys, size_t rounds, const byte *inBlocks,
102  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
103 {
104  CRYPTOPP_ASSERT(subKeys);
105  CRYPTOPP_ASSERT(inBlocks);
106  CRYPTOPP_ASSERT(outBlocks);
107  CRYPTOPP_ASSERT(length >= 16);
108 
109  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
110  const uint32x4_t s_one = vld1q_u32(w_one);
111 
112  const size_t blockSize = 16;
113  // const size_t neonBlockSize = 16;
114 
115  size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
116  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
117  size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
118 
119  // Clang and Coverity are generating findings using xorBlocks as a flag.
120  const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
121  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
122 
123  if (flags & BT_ReverseDirection)
124  {
125  inBlocks = PtrAdd(inBlocks, length - blockSize);
126  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
127  outBlocks = PtrAdd(outBlocks, length - blockSize);
128  inIncrement = 0-inIncrement;
129  xorIncrement = 0-xorIncrement;
130  outIncrement = 0-outIncrement;
131  }
132 
133  if (flags & BT_AllowParallel)
134  {
135  while (length >= 6*blockSize)
136  {
137  uint64x2_t block0, block1, block2, block3, block4, block5;
138  if (flags & BT_InBlockIsCounter)
139  {
140  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
141  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
142  block1 = vaddq_u64(block0, one);
143  block2 = vaddq_u64(block1, one);
144  block3 = vaddq_u64(block2, one);
145  block4 = vaddq_u64(block3, one);
146  block5 = vaddq_u64(block4, one);
147  vst1q_u8(const_cast<byte*>(inBlocks),
148  vreinterpretq_u8_u64(vaddq_u64(block5, one)));
149  }
150  else
151  {
152  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
153  inBlocks = PtrAdd(inBlocks, inIncrement);
154  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
155  inBlocks = PtrAdd(inBlocks, inIncrement);
156  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
157  inBlocks = PtrAdd(inBlocks, inIncrement);
158  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
159  inBlocks = PtrAdd(inBlocks, inIncrement);
160  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
161  inBlocks = PtrAdd(inBlocks, inIncrement);
162  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
163  inBlocks = PtrAdd(inBlocks, inIncrement);
164  }
165 
166  if (xorInput)
167  {
168  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
169  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
170  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
171  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
172  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
173  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
174  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
175  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
176  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
177  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
178  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
179  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
180  }
181 
182  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
183 
184  if (xorOutput)
185  {
186  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
187  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
188  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
189  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
190  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
191  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
192  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
193  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
194  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
195  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
196  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
197  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
198  }
199 
200  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
201  outBlocks = PtrAdd(outBlocks, outIncrement);
202  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
203  outBlocks = PtrAdd(outBlocks, outIncrement);
204  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
205  outBlocks = PtrAdd(outBlocks, outIncrement);
206  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
207  outBlocks = PtrAdd(outBlocks, outIncrement);
208  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
209  outBlocks = PtrAdd(outBlocks, outIncrement);
210  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
211  outBlocks = PtrAdd(outBlocks, outIncrement);
212 
213  length -= 6*blockSize;
214  }
215  }
216 
217  while (length >= blockSize)
218  {
219  uint64x2_t block;
220  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
221 
222  if (xorInput)
223  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
224 
225  if (flags & BT_InBlockIsCounter)
226  const_cast<byte *>(inBlocks)[15]++;
227 
228  func1(block, subKeys, static_cast<unsigned int>(rounds));
229 
230  if (xorOutput)
231  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
232 
233  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
234 
235  inBlocks = PtrAdd(inBlocks, inIncrement);
236  outBlocks = PtrAdd(outBlocks, outIncrement);
237  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
238  length -= blockSize;
239  }
240 
241  return length;
242 }
243 
244 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
245 /// \tparam F1 function to process 1 128-bit block
246 /// \tparam F4 function to process 4 128-bit blocks
247 /// \tparam W word type of the subkey table
248 /// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
249 /// at a time.
250 /// \details The subkey type is usually word32 or word64. V is the vector type and it is
251 /// usually uint32x4_t or uint32x4_t. F1, F4, and W must use the same word and
252 /// vector type.
253 template <typename F1, typename F4, typename W>
254 inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
255  const W *subKeys, size_t rounds, const byte *inBlocks,
256  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
257 {
258  CRYPTOPP_ASSERT(subKeys);
259  CRYPTOPP_ASSERT(inBlocks);
260  CRYPTOPP_ASSERT(outBlocks);
261  CRYPTOPP_ASSERT(length >= 16);
262 
263  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
264  const uint32x4_t s_one = vld1q_u32(w_one);
265 
266  const size_t blockSize = 16;
267  // const size_t neonBlockSize = 16;
268 
269  size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
270  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
271  size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
272 
273  // Clang and Coverity are generating findings using xorBlocks as a flag.
274  const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
275  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
276 
277  if (flags & BT_ReverseDirection)
278  {
279  inBlocks = PtrAdd(inBlocks, length - blockSize);
280  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
281  outBlocks = PtrAdd(outBlocks, length - blockSize);
282  inIncrement = 0-inIncrement;
283  xorIncrement = 0-xorIncrement;
284  outIncrement = 0-outIncrement;
285  }
286 
287  if (flags & BT_AllowParallel)
288  {
289  while (length >= 4*blockSize)
290  {
291  uint32x4_t block0, block1, block2, block3;
292  if (flags & BT_InBlockIsCounter)
293  {
294  const uint32x4_t one = s_one;
295  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
296  block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
297  block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
298  block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
299  vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
300  vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
301  }
302  else
303  {
304  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
305  inBlocks = PtrAdd(inBlocks, inIncrement);
306  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
307  inBlocks = PtrAdd(inBlocks, inIncrement);
308  block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
309  inBlocks = PtrAdd(inBlocks, inIncrement);
310  block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
311  inBlocks = PtrAdd(inBlocks, inIncrement);
312  }
313 
314  if (xorInput)
315  {
316  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
317  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
318  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
319  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
320  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
321  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
322  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
323  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
324  }
325 
326  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
327 
328  if (xorOutput)
329  {
330  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
331  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
332  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
333  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
334  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
335  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
336  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
337  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
338  }
339 
340  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
341  outBlocks = PtrAdd(outBlocks, outIncrement);
342  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
343  outBlocks = PtrAdd(outBlocks, outIncrement);
344  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
345  outBlocks = PtrAdd(outBlocks, outIncrement);
346  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
347  outBlocks = PtrAdd(outBlocks, outIncrement);
348 
349  length -= 4*blockSize;
350  }
351  }
352 
353  while (length >= blockSize)
354  {
355  uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
356 
357  if (xorInput)
358  block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
359 
360  if (flags & BT_InBlockIsCounter)
361  const_cast<byte *>(inBlocks)[15]++;
362 
363  func1(block, subKeys, static_cast<unsigned int>(rounds));
364 
365  if (xorOutput)
366  block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
367 
368  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));
369 
370  inBlocks = PtrAdd(inBlocks, inIncrement);
371  outBlocks = PtrAdd(outBlocks, outIncrement);
372  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
373  length -= blockSize;
374  }
375 
376  return length;
377 }
378 
379 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
380 /// \tparam F2 function to process 2 128-bit blocks
381 /// \tparam F6 function to process 6 128-bit blocks
382 /// \tparam W word type of the subkey table
383 /// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
384 /// at a time. For a single block the template uses F2 with a zero block.
385 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
386 /// same word type.
387 template <typename F2, typename F6, typename W>
388 inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
389  const W *subKeys, size_t rounds, const byte *inBlocks,
390  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
391 {
392  CRYPTOPP_ASSERT(subKeys);
393  CRYPTOPP_ASSERT(inBlocks);
394  CRYPTOPP_ASSERT(outBlocks);
395  CRYPTOPP_ASSERT(length >= 16);
396 
397  const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
398  const uint32x4_t s_one = vld1q_u32(w_one);
399 
400  const size_t blockSize = 16;
401  // const size_t neonBlockSize = 16;
402 
403  size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
404  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
405  size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
406 
407  // Clang and Coverity are generating findings using xorBlocks as a flag.
408  const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
409  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
410 
411  if (flags & BT_ReverseDirection)
412  {
413  inBlocks = PtrAdd(inBlocks, length - blockSize);
414  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
415  outBlocks = PtrAdd(outBlocks, length - blockSize);
416  inIncrement = 0-inIncrement;
417  xorIncrement = 0-xorIncrement;
418  outIncrement = 0-outIncrement;
419  }
420 
421  if (flags & BT_AllowParallel)
422  {
423  while (length >= 6*blockSize)
424  {
425  uint64x2_t block0, block1, block2, block3, block4, block5;
426  if (flags & BT_InBlockIsCounter)
427  {
428  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
429  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
430  block1 = vaddq_u64(block0, one);
431  block2 = vaddq_u64(block1, one);
432  block3 = vaddq_u64(block2, one);
433  block4 = vaddq_u64(block3, one);
434  block5 = vaddq_u64(block4, one);
435  vst1q_u8(const_cast<byte*>(inBlocks),
436  vreinterpretq_u8_u64(vaddq_u64(block5, one)));
437  }
438  else
439  {
440  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
441  inBlocks = PtrAdd(inBlocks, inIncrement);
442  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
443  inBlocks = PtrAdd(inBlocks, inIncrement);
444  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
445  inBlocks = PtrAdd(inBlocks, inIncrement);
446  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
447  inBlocks = PtrAdd(inBlocks, inIncrement);
448  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
449  inBlocks = PtrAdd(inBlocks, inIncrement);
450  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
451  inBlocks = PtrAdd(inBlocks, inIncrement);
452  }
453 
454  if (xorInput)
455  {
456  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
457  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
458  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
459  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
460  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
461  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
462  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
463  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
464  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
465  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
466  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
467  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
468  }
469 
470  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
471 
472  if (xorOutput)
473  {
474  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
475  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
476  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
477  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
478  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
479  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
480  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
481  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
482  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
483  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
484  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
485  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
486  }
487 
488  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
489  outBlocks = PtrAdd(outBlocks, outIncrement);
490  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
491  outBlocks = PtrAdd(outBlocks, outIncrement);
492  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
493  outBlocks = PtrAdd(outBlocks, outIncrement);
494  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
495  outBlocks = PtrAdd(outBlocks, outIncrement);
496  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
497  outBlocks = PtrAdd(outBlocks, outIncrement);
498  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
499  outBlocks = PtrAdd(outBlocks, outIncrement);
500 
501  length -= 6*blockSize;
502  }
503 
504  while (length >= 2*blockSize)
505  {
506  uint64x2_t block0, block1;
507  if (flags & BT_InBlockIsCounter)
508  {
509  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
510  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
511  block1 = vaddq_u64(block0, one);
512  vst1q_u8(const_cast<byte*>(inBlocks),
513  vreinterpretq_u8_u64(vaddq_u64(block1, one)));
514  }
515  else
516  {
517  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
518  inBlocks = PtrAdd(inBlocks, inIncrement);
519  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
520  inBlocks = PtrAdd(inBlocks, inIncrement);
521  }
522 
523  if (xorInput)
524  {
525  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
526  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
527  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
528  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
529  }
530 
531  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
532 
533  if (xorOutput)
534  {
535  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
536  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
537  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
538  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
539  }
540 
541  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
542  outBlocks = PtrAdd(outBlocks, outIncrement);
543  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
544  outBlocks = PtrAdd(outBlocks, outIncrement);
545 
546  length -= 2*blockSize;
547  }
548  }
549 
550  while (length >= blockSize)
551  {
552  uint64x2_t block, zero = {0,0};
553  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
554 
555  if (xorInput)
556  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
557 
558  if (flags & BT_InBlockIsCounter)
559  const_cast<byte *>(inBlocks)[15]++;
560 
561  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
562 
563  if (xorOutput)
564  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
565 
566  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
567 
568  inBlocks = PtrAdd(inBlocks, inIncrement);
569  outBlocks = PtrAdd(outBlocks, outIncrement);
570  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
571  length -= blockSize;
572  }
573 
574  return length;
575 }
576 
577 NAMESPACE_END // CryptoPP
578 
579 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
580 
581 // *************************** Intel SSE ************************** //
582 
583 #if defined(CRYPTOPP_SSSE3_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
584 
585 #if defined(CRYPTOPP_DOXYGEN_PROCESSING)
586 /// \brief SunCC workaround
587 /// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
588 /// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
589 /// 224, SunCC and failed compile for rijndael.cpp</A>
590 # define MAYBE_CONST const
591 /// \brief SunCC workaround
592 /// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
593 /// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
594 /// 224, SunCC and failed compile for rijndael.cpp</A>
595 # define MAYBE_UNCONST_CAST(T, x) (x)
596 #elif (__SUNPRO_CC >= 0x5130)
597 # define MAYBE_CONST
598 # define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
599 #else
600 # define MAYBE_CONST const
601 # define MAYBE_UNCONST_CAST(T, x) (x)
602 #endif
603 
604 #if defined(CRYPTOPP_DOXYGEN_PROCESSING)
605 /// \brief Clang workaround
606 /// \details Clang issues spurious alignment warnings
607 /// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
608 /// 20670, _mm_loadu_si128 parameter has wrong type</A>
609 # define M128_CAST(x) ((__m128i *)(void *)(x))
610 /// \brief Clang workaround
611 /// \details Clang issues spurious alignment warnings
612 /// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
613 /// 20670, _mm_loadu_si128 parameter has wrong type</A>
614 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
615 #else
616 # ifndef M128_CAST
617 # define M128_CAST(x) ((__m128i *)(void *)(x))
618 # endif
619 # ifndef CONST_M128_CAST
620 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
621 # endif
622 #endif
623 
624 NAMESPACE_BEGIN(CryptoPP)
625 
626 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
627 /// \tparam F2 function to process 2 128-bit blocks
628 /// \tparam F6 function to process 6 128-bit blocks
629 /// \tparam W word type of the subkey table
630 /// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
631 /// at a time. For a single block the template uses F2 with a zero block.
632 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
633 /// same word type.
634 template <typename F2, typename F6, typename W>
635 inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
636  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
637  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
638 {
639  CRYPTOPP_ASSERT(subKeys);
640  CRYPTOPP_ASSERT(inBlocks);
641  CRYPTOPP_ASSERT(outBlocks);
642  CRYPTOPP_ASSERT(length >= 16);
643 
644  const size_t blockSize = 16;
645  // const size_t xmmBlockSize = 16;
646 
647  size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
648  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
649  size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
650 
651  // Clang and Coverity are generating findings using xorBlocks as a flag.
652  const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
653  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
654 
655  if (flags & BT_ReverseDirection)
656  {
657  inBlocks = PtrAdd(inBlocks, length - blockSize);
658  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
659  outBlocks = PtrAdd(outBlocks, length - blockSize);
660  inIncrement = 0-inIncrement;
661  xorIncrement = 0-xorIncrement;
662  outIncrement = 0-outIncrement;
663  }
664 
665  if (flags & BT_AllowParallel)
666  {
667  while (length >= 6*blockSize)
668  {
669  __m128i block0, block1, block2, block3, block4, block5;
670  if (flags & BT_InBlockIsCounter)
671  {
672  // Increment of 1 in big-endian compatible with the ctr byte array.
673  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
674  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
675  block1 = _mm_add_epi32(block0, s_one);
676  block2 = _mm_add_epi32(block1, s_one);
677  block3 = _mm_add_epi32(block2, s_one);
678  block4 = _mm_add_epi32(block3, s_one);
679  block5 = _mm_add_epi32(block4, s_one);
680  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
681  }
682  else
683  {
684  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
685  inBlocks = PtrAdd(inBlocks, inIncrement);
686  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
687  inBlocks = PtrAdd(inBlocks, inIncrement);
688  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
689  inBlocks = PtrAdd(inBlocks, inIncrement);
690  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
691  inBlocks = PtrAdd(inBlocks, inIncrement);
692  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
693  inBlocks = PtrAdd(inBlocks, inIncrement);
694  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
695  inBlocks = PtrAdd(inBlocks, inIncrement);
696  }
697 
698  if (xorInput)
699  {
700  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
701  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
702  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
703  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
704  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
705  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
706  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
707  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
708  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
709  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
710  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
711  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
712  }
713 
714  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
715 
716  if (xorOutput)
717  {
718  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
719  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
720  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
721  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
722  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
723  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
724  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
725  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
726  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
727  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
728  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
729  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
730  }
731 
732  _mm_storeu_si128(M128_CAST(outBlocks), block0);
733  outBlocks = PtrAdd(outBlocks, outIncrement);
734  _mm_storeu_si128(M128_CAST(outBlocks), block1);
735  outBlocks = PtrAdd(outBlocks, outIncrement);
736  _mm_storeu_si128(M128_CAST(outBlocks), block2);
737  outBlocks = PtrAdd(outBlocks, outIncrement);
738  _mm_storeu_si128(M128_CAST(outBlocks), block3);
739  outBlocks = PtrAdd(outBlocks, outIncrement);
740  _mm_storeu_si128(M128_CAST(outBlocks), block4);
741  outBlocks = PtrAdd(outBlocks, outIncrement);
742  _mm_storeu_si128(M128_CAST(outBlocks), block5);
743  outBlocks = PtrAdd(outBlocks, outIncrement);
744 
745  length -= 6*blockSize;
746  }
747 
748  while (length >= 2*blockSize)
749  {
750  __m128i block0, block1;
751  if (flags & BT_InBlockIsCounter)
752  {
753  // Increment of 1 in big-endian compatible with the ctr byte array.
754  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
755  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
756  block1 = _mm_add_epi32(block0, s_one);
757  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
758  }
759  else
760  {
761  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
762  inBlocks = PtrAdd(inBlocks, inIncrement);
763  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
764  inBlocks = PtrAdd(inBlocks, inIncrement);
765  }
766 
767  if (xorInput)
768  {
769  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
770  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
771  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
772  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
773  }
774 
775  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
776 
777  if (xorOutput)
778  {
779  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
780  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
781  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
782  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
783  }
784 
785  _mm_storeu_si128(M128_CAST(outBlocks), block0);
786  outBlocks = PtrAdd(outBlocks, outIncrement);
787  _mm_storeu_si128(M128_CAST(outBlocks), block1);
788  outBlocks = PtrAdd(outBlocks, outIncrement);
789 
790  length -= 2*blockSize;
791  }
792  }
793 
794  while (length >= blockSize)
795  {
796  __m128i block, zero = _mm_setzero_si128();
797  block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
798 
799  if (xorInput)
800  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
801 
802  if (flags & BT_InBlockIsCounter)
803  const_cast<byte *>(inBlocks)[15]++;
804 
805  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
806 
807  if (xorOutput)
808  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
809 
810  _mm_storeu_si128(M128_CAST(outBlocks), block);
811 
812  inBlocks = PtrAdd(inBlocks, inIncrement);
813  outBlocks = PtrAdd(outBlocks, outIncrement);
814  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
815  length -= blockSize;
816  }
817 
818  return length;
819 }
820 
821 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
822 /// \tparam F1 function to process 1 128-bit block
823 /// \tparam F4 function to process 4 128-bit blocks
824 /// \tparam W word type of the subkey table
825 /// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
826 /// at a time.
827 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
828 /// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    // Dispatch bulk 128-bit block processing to the SIMD workers: func4
    // transforms 4 blocks per call while enough input remains, func1 handles
    // the remaining blocks one at a time. Returns the number of bytes left
    // unprocessed (length is consumed in whole 16-byte blocks).
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    // In counter mode the input block is a counter that is updated in place,
    // so the input pointer must not advance. BT_DontIncrementInOutPointers
    // pins the input and output pointers as well.
    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        // Walk the buffers from the last block towards the first. The
        // increments are size_t, so "0-increment" produces the two's
        // complement value that steps the pointers backwards via PtrAdd.
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                // (1<<24 places the 1 in the most significant byte of the
                // 32-bit lane, which is the low-order counter byte in memory.)
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                // Write the next counter (original + 4) back in place for the
                // following iteration. inIncrement is 0 in counter mode.
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            // XOR the mask blocks into the input before the transformation
            // (e.g. CBC encryption style chaining).
            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            // XOR the mask blocks into the output after the transformation.
            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    // Single-block tail (also the whole path when BT_AllowParallel is clear).
    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        // Bump the low-order counter byte in place. Carry out of the byte is
        // handled by the caller after this function returns (see the
        // CTR_ModePolicy notes in the ALTIVEC templates in this file).
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
951 
952 NAMESPACE_END // CryptoPP
953 
954 #endif // CRYPTOPP_SSSE3_AVAILABLE
955 
956 // ************************** Altivec/Power 4 ************************** //
957 
958 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
959 
960 NAMESPACE_BEGIN(CryptoPP)
961 
962 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
963 /// \tparam F1 function to process 1 128-bit block
964 /// \tparam F4 function to process 4 128-bit blocks
965 /// \tparam W word type of the subkey table
966 /// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
967 /// at a time.
968 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
969 /// same word type.
970 template <typename F1, typename F4, typename W>
971 inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
972  const W *subKeys, size_t rounds, const byte *inBlocks,
973  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
974 {
975  CRYPTOPP_ASSERT(subKeys);
976  CRYPTOPP_ASSERT(inBlocks);
977  CRYPTOPP_ASSERT(outBlocks);
978  CRYPTOPP_ASSERT(length >= 16);
979 
980 #if (CRYPTOPP_LITTLE_ENDIAN)
981  const uint32x4_p s_one = {1,0,0,0};
982 #else
983  const uint32x4_p s_one = {0,0,0,1};
984 #endif
985 
986  const size_t blockSize = 16;
987  // const size_t simdBlockSize = 16;
988 
989  size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
990  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
991  size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
992 
993  // Clang and Coverity are generating findings using xorBlocks as a flag.
994  const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
995  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));
996 
997  if (flags & BT_ReverseDirection)
998  {
999  inBlocks = PtrAdd(inBlocks, length - blockSize);
1000  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1001  outBlocks = PtrAdd(outBlocks, length - blockSize);
1002  inIncrement = 0-inIncrement;
1003  xorIncrement = 0-xorIncrement;
1004  outIncrement = 0-outIncrement;
1005  }
1006 
1007  if (flags & BT_AllowParallel)
1008  {
1009  while (length >= 4*blockSize)
1010  {
1011  uint32x4_p block0, block1, block2, block3;
1012 
1013  if (flags & BT_InBlockIsCounter)
1014  {
1015  block0 = VecLoadBE(inBlocks);
1016  block1 = VecAdd(block0, s_one);
1017  block2 = VecAdd(block1, s_one);
1018  block3 = VecAdd(block2, s_one);
1019 
1020  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
1021  // CTR_ModePolicy::OperateKeystream is wired such that after
1022  // returning from this function CTR_ModePolicy will detect wrap on
1023  // on the last counter byte and increment the next to last byte.
1024  // The problem is, with a big-endian load, inBlocks[15] is really
1025  // located at index 15. The vector addition using a 32-bit element
1026  // generates a carry into inBlocks[14] and then CTR_ModePolicy
1027  // increments inBlocks[14] too.
1028  const_cast<byte*>(inBlocks)[15] += 6;
1029  }
1030  else
1031  {
1032  block0 = VecLoadBE(inBlocks);
1033  inBlocks = PtrAdd(inBlocks, inIncrement);
1034  block1 = VecLoadBE(inBlocks);
1035  inBlocks = PtrAdd(inBlocks, inIncrement);
1036  block2 = VecLoadBE(inBlocks);
1037  inBlocks = PtrAdd(inBlocks, inIncrement);
1038  block3 = VecLoadBE(inBlocks);
1039  inBlocks = PtrAdd(inBlocks, inIncrement);
1040  }
1041 
1042  if (xorInput)
1043  {
1044  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1045  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1046  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1047  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1048  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1049  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1050  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1051  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1052  }
1053 
1054  func4(block0, block1, block2, block3, subKeys, rounds);
1055 
1056  if (xorOutput)
1057  {
1058  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1059  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1060  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1061  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1062  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1063  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1064  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1065  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1066  }
1067 
1068  VecStoreBE(block0, outBlocks);
1069  outBlocks = PtrAdd(outBlocks, outIncrement);
1070  VecStoreBE(block1, outBlocks);
1071  outBlocks = PtrAdd(outBlocks, outIncrement);
1072  VecStoreBE(block2, outBlocks);
1073  outBlocks = PtrAdd(outBlocks, outIncrement);
1074  VecStoreBE(block3, outBlocks);
1075  outBlocks = PtrAdd(outBlocks, outIncrement);
1076 
1077  length -= 4*blockSize;
1078  }
1079  }
1080 
1081  while (length >= blockSize)
1082  {
1083  uint32x4_p block = VecLoadBE(inBlocks);
1084 
1085  if (xorInput)
1086  block = VecXor(block, VecLoadBE(xorBlocks));
1087 
1088  if (flags & BT_InBlockIsCounter)
1089  const_cast<byte *>(inBlocks)[15]++;
1090 
1091  func1(block, subKeys, rounds);
1092 
1093  if (xorOutput)
1094  block = VecXor(block, VecLoadBE(xorBlocks));
1095 
1096  VecStoreBE(block, outBlocks);
1097 
1098  inBlocks = PtrAdd(inBlocks, inIncrement);
1099  outBlocks = PtrAdd(outBlocks, outIncrement);
1100  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1101  length -= blockSize;
1102  }
1103 
1104  return length;
1105 }
1106 
1107 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
1108 /// \tparam F1 function to process 1 128-bit block
1109 /// \tparam F6 function to process 6 128-bit blocks
1110 /// \tparam W word type of the subkey table
1111 /// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
1112 /// at a time.
1113 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
1114 /// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    // Dispatch bulk 128-bit block processing to the Altivec SIMD workers:
    // func6 transforms 6 blocks per call, func1 handles the single-block
    // tail. Returns the number of bytes left unprocessed.
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    // Increment of 1 in the low-order counter word. On little-endian the low
    // word is element 0; on big-endian it is element 3.
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    // In counter mode the input block is a counter that is updated in place,
    // so the input pointer must not advance. BT_DontIncrementInOutPointers
    // pins the input and output pointers as well.
    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        // Walk the buffers from the last block towards the first. The size_t
        // expression "0-increment" steps the pointers backwards via PtrAdd.
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap on
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                //
                // To find this bug we needed a test case with a ctr of 0xNN...FA.
                // The last octet is 0xFA and adding 6 creates the wrap to trigger
                // the issue. If the last octet was 0xFC then 4 would trigger it.
                // We dumb-lucked into the test with SPECK-128. The test case of
                // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
                //
                // The add below is performed on 8-bit elements, so adding s_one
                // (a single 1 in the low counter byte) bumps that byte without
                // propagating a carry into inBlocks[14]; the caller supplies the
                // carry, as described above. Net effect: counter advances by 6,
                // one per block consumed this iteration.
                uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            // XOR the mask blocks into the input before the transformation.
            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            // XOR the mask blocks into the output after the transformation.
            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    // Single-block tail (also the whole path when BT_AllowParallel is clear).
    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        // Bump the low-order counter byte in place; carry out of the byte is
        // handled by the caller after this function returns.
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
1276 
1277 NAMESPACE_END // CryptoPP
1278 
1279 #endif // __ALTIVEC__
1280 
1281 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES
size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 2 and 6 blocks.
Definition: adv_simd.h:388
size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Definition: adv_simd.h:254
size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
Definition: adv_simd.h:1116
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:609
size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 2 and 6 blocks.
Definition: adv_simd.h:635
size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Definition: adv_simd.h:971
size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
Definition: adv_simd.h:100
#define MAYBE_CONST
SunCC workaround.
Definition: adv_simd.h:590
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:614
size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Definition: adv_simd.h:830
@ BT_InBlockIsCounter
inBlock is a counter
Definition: cryptlib.h:922
@ BT_ReverseDirection
perform the transformation in reverse
Definition: cryptlib.h:928
@ BT_XorInput
Xor inputs before transformation.
Definition: cryptlib.h:926
@ BT_AllowParallel
Allow parallel transformations.
Definition: cryptlib.h:930
@ BT_DontIncrementInOutPointers
should not modify block pointers
Definition: cryptlib.h:924
Library configuration file.
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:72
Utility functions for the Crypto++ library.
PTR PtrAdd(PTR pointer, OFF offset)
Create a pointer with an offset.
Definition: misc.h:388
#define EnumToInt(v)
Integer value.
Definition: misc.h:504
Crypto++ library namespace.
Support functions for PowerPC and vector operations.
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:742
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:202
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:1231
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:192
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1414
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:1438
Common C++ header files.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68