Crypto++ 8.8
Free C++ class library of cryptographic schemes
sha_simd.cpp
1 // sha_simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to SHA-NI and
5 // ARMv8a SHA instructions. A separate source file is needed
6 // because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
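// For example, a typical build compiles just this file with extra flags
// such as -msse4.2 -msha (x86 SHA-NI), -march=armv8-a+crypto (ARMv8) or
// -mcpu=power8 (POWER8); see the project's makefiles for the flags that
// are actually used.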
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "sha.h"
12 #include "misc.h"
13 
14 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
15 # undef CRYPTOPP_X86_ASM_AVAILABLE
16 # undef CRYPTOPP_X32_ASM_AVAILABLE
17 # undef CRYPTOPP_X64_ASM_AVAILABLE
18 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
19 #endif
20 
21 #if (CRYPTOPP_SHANI_AVAILABLE)
22 # include <nmmintrin.h>
23 # include <immintrin.h>
24 #endif
25 
26 // Android makes <arm_acle.h> available with ARMv7-a
27 #if (CRYPTOPP_BOOL_ARMV8)
28 # if (CRYPTOPP_ARM_NEON_HEADER)
29 # include <arm_neon.h>
30 # endif
31 # if (CRYPTOPP_ARM_ACLE_HEADER)
32 # include <stdint.h>
33 # include <arm_acle.h>
34 # endif
35 #endif
36 
37 #if CRYPTOPP_POWER8_SHA_AVAILABLE
38 # include "ppc_simd.h"
39 #endif
40 
41 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
42 # include <signal.h>
43 # include <setjmp.h>
44 #endif
45 
46 #ifndef EXCEPTION_EXECUTE_HANDLER
47 # define EXCEPTION_EXECUTE_HANDLER 1
48 #endif
49 
50 // Squash MS LNK4221 and libtool warnings
51 extern const char SHA_SIMD_FNAME[] = __FILE__;
52 
53 NAMESPACE_BEGIN(CryptoPP)
54 
55 // ***************** SHA key tables ********************
56 
57 extern const word32 SHA256_K[64];
58 extern const word64 SHA512_K[80];
59 
60 // ***************** SIGILL probes ********************
61 
62 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
63 extern "C" {
64  typedef void (*SigHandler)(int);
65 
66  static jmp_buf s_jmpSIGILL;
67  static void SigIllHandler(int)
68  {
69  longjmp(s_jmpSIGILL, 1);
70  }
71 }
72 #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
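
// How the probes below work: install SigIllHandler for SIGILL, mark a
// return point with setjmp, then execute a few of the instructions being
// probed. If the CPU lacks the extension the kernel raises SIGILL, the
// handler longjmps back and the probe reports false. Builds that use
// MS-style inline assembly express the same idea with __try/__except.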
73 
74 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
75 bool CPU_ProbeSHA1()
76 {
77 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
78  return false;
79 #elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
80 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
81  volatile bool result = true;
82  __try
83  {
84  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
85  uint32x4_t data1 = vld1q_u32(w+0);
86  uint32x4_t data2 = vld1q_u32(w+4);
87  uint32x4_t data3 = vld1q_u32(w+8);
88 
89  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
90  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
91  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
92  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
93  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
94 
95  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
96  }
97  __except (EXCEPTION_EXECUTE_HANDLER)
98  {
99  return false;
100  }
101  return result;
102 # else
103 
104  // longjmp and clobber warnings. Volatile is required.
105  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
106  volatile bool result = true;
107 
108  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
109  if (oldHandler == SIG_ERR)
110  return false;
111 
112  volatile sigset_t oldMask;
113  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
114  {
115  signal(SIGILL, oldHandler);
116  return false;
117  }
118 
119  if (setjmp(s_jmpSIGILL))
120  result = false;
121  else
122  {
123  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
124  uint32x4_t data1 = vld1q_u32(w+0);
125  uint32x4_t data2 = vld1q_u32(w+4);
126  uint32x4_t data3 = vld1q_u32(w+8);
127 
128  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
129  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
130  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
131  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
132  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
133 
134  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
135  }
136 
137  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
138  signal(SIGILL, oldHandler);
139  return result;
140 # endif
141 #else
142  return false;
143 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
144 }
145 
146 bool CPU_ProbeSHA256()
147 {
148 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
149  return false;
150 #elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
151 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
152  volatile bool result = true;
153  __try
154  {
155  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
156  uint32x4_t data1 = vld1q_u32(w+0);
157  uint32x4_t data2 = vld1q_u32(w+4);
158  uint32x4_t data3 = vld1q_u32(w+8);
159 
160  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
161  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
162  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
163  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
164 
165  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
166  }
167  __except (EXCEPTION_EXECUTE_HANDLER)
168  {
169  return false;
170  }
171  return result;
172 # else
173 
174  // longjmp and clobber warnings. Volatile is required.
175  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
176  volatile bool result = true;
177 
178  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
179  if (oldHandler == SIG_ERR)
180  return false;
181 
182  volatile sigset_t oldMask;
183  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
184  {
185  signal(SIGILL, oldHandler);
186  return false;
187  }
188 
189  if (setjmp(s_jmpSIGILL))
190  result = false;
191  else
192  {
193  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
194  uint32x4_t data1 = vld1q_u32(w+0);
195  uint32x4_t data2 = vld1q_u32(w+4);
196  uint32x4_t data3 = vld1q_u32(w+8);
197 
198  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
199  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
200  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
201  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
202 
203  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
204  }
205 
206  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
207  signal(SIGILL, oldHandler);
208  return result;
209 # endif
210 #else
211  return false;
212 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
213 }
214 #endif // ARM32 or ARM64
215 
216 // ***************** Intel x86 SHA ********************
217 
218 /////////////////////////////////////
219 // start of Walton and Gulley code //
220 /////////////////////////////////////
221 
222 #if CRYPTOPP_SHANI_AVAILABLE
223 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
224 void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
225 {
226  CRYPTOPP_ASSERT(state);
227  CRYPTOPP_ASSERT(data);
228  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
229 
230  __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
231  __m128i MASK, MSG0, MSG1, MSG2, MSG3;
232 
233  // Load initial values
234  ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
235  E0 = _mm_set_epi32(state[4], 0, 0, 0);
236  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
237 
238  // IA-32 SHA is little endian, SHA::Transform is big endian,
239  // and SHA::HashMultipleBlocks can be either. ByteOrder
240  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
241  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
242  _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
243  _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
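 // Note: _mm_set_epi8 takes its arguments from the most significant byte
 // (e15) down to the least (e0). The BIG_ENDIAN mask therefore reverses
 // all 16 bytes, while the LITTLE_ENDIAN mask only reverses the order of
 // the four 32-bit words; either way the message words end up arranged
 // as the SHA instructions below expect.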
244 
245  while (length >= SHA1::BLOCKSIZE)
246  {
247  // Save current hash
248  ABCD_SAVE = ABCD;
249  E0_SAVE = E0;
250 
251  // Rounds 0-3
252  MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
253  MSG0 = _mm_shuffle_epi8(MSG0, MASK);
254  E0 = _mm_add_epi32(E0, MSG0);
255  E1 = ABCD;
256  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
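 // _mm_sha1rnds4_epu32 performs four SHA-1 rounds; its immediate selects
 // the round function and constant for the current group of 20 rounds
 // (0: Ch/K1, 1: Parity/K2, 2: Maj/K3, 3: Parity/K4), which is why the
 // last argument steps through 0,1,2,3 once every five calls below.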
257 
258  // Rounds 4-7
259  MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
260  MSG1 = _mm_shuffle_epi8(MSG1, MASK);
261  E1 = _mm_sha1nexte_epu32(E1, MSG1);
262  E0 = ABCD;
263  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
264  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
265 
266  // Rounds 8-11
267  MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
268  MSG2 = _mm_shuffle_epi8(MSG2, MASK);
269  E0 = _mm_sha1nexte_epu32(E0, MSG2);
270  E1 = ABCD;
271  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
272  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
273  MSG0 = _mm_xor_si128(MSG0, MSG2);
274 
275  // Rounds 12-15
276  MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
277  MSG3 = _mm_shuffle_epi8(MSG3, MASK);
278  E1 = _mm_sha1nexte_epu32(E1, MSG3);
279  E0 = ABCD;
280  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
281  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
282  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
283  MSG1 = _mm_xor_si128(MSG1, MSG3);
284 
285  // Rounds 16-19
286  E0 = _mm_sha1nexte_epu32(E0, MSG0);
287  E1 = ABCD;
288  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
289  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
290  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
291  MSG2 = _mm_xor_si128(MSG2, MSG0);
292 
293  // Rounds 20-23
294  E1 = _mm_sha1nexte_epu32(E1, MSG1);
295  E0 = ABCD;
296  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
297  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
298  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
299  MSG3 = _mm_xor_si128(MSG3, MSG1);
300 
301  // Rounds 24-27
302  E0 = _mm_sha1nexte_epu32(E0, MSG2);
303  E1 = ABCD;
304  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
305  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
306  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
307  MSG0 = _mm_xor_si128(MSG0, MSG2);
308 
309  // Rounds 28-31
310  E1 = _mm_sha1nexte_epu32(E1, MSG3);
311  E0 = ABCD;
312  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
313  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
314  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
315  MSG1 = _mm_xor_si128(MSG1, MSG3);
316 
317  // Rounds 32-35
318  E0 = _mm_sha1nexte_epu32(E0, MSG0);
319  E1 = ABCD;
320  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
321  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
322  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
323  MSG2 = _mm_xor_si128(MSG2, MSG0);
324 
325  // Rounds 36-39
326  E1 = _mm_sha1nexte_epu32(E1, MSG1);
327  E0 = ABCD;
328  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
329  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
330  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
331  MSG3 = _mm_xor_si128(MSG3, MSG1);
332 
333  // Rounds 40-43
334  E0 = _mm_sha1nexte_epu32(E0, MSG2);
335  E1 = ABCD;
336  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
337  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
338  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
339  MSG0 = _mm_xor_si128(MSG0, MSG2);
340 
341  // Rounds 44-47
342  E1 = _mm_sha1nexte_epu32(E1, MSG3);
343  E0 = ABCD;
344  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
345  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
346  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
347  MSG1 = _mm_xor_si128(MSG1, MSG3);
348 
349  // Rounds 48-51
350  E0 = _mm_sha1nexte_epu32(E0, MSG0);
351  E1 = ABCD;
352  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
353  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
354  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
355  MSG2 = _mm_xor_si128(MSG2, MSG0);
356 
357  // Rounds 52-55
358  E1 = _mm_sha1nexte_epu32(E1, MSG1);
359  E0 = ABCD;
360  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
361  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
362  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
363  MSG3 = _mm_xor_si128(MSG3, MSG1);
364 
365  // Rounds 56-59
366  E0 = _mm_sha1nexte_epu32(E0, MSG2);
367  E1 = ABCD;
368  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
369  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
370  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
371  MSG0 = _mm_xor_si128(MSG0, MSG2);
372 
373  // Rounds 60-63
374  E1 = _mm_sha1nexte_epu32(E1, MSG3);
375  E0 = ABCD;
376  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
377  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
378  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
379  MSG1 = _mm_xor_si128(MSG1, MSG3);
380 
381  // Rounds 64-67
382  E0 = _mm_sha1nexte_epu32(E0, MSG0);
383  E1 = ABCD;
384  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
385  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
386  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
387  MSG2 = _mm_xor_si128(MSG2, MSG0);
388 
389  // Rounds 68-71
390  E1 = _mm_sha1nexte_epu32(E1, MSG1);
391  E0 = ABCD;
392  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
393  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
394  MSG3 = _mm_xor_si128(MSG3, MSG1);
395 
396  // Rounds 72-75
397  E0 = _mm_sha1nexte_epu32(E0, MSG2);
398  E1 = ABCD;
399  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
400  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
401 
402  // Rounds 76-79
403  E1 = _mm_sha1nexte_epu32(E1, MSG3);
404  E0 = ABCD;
405  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
406 
407  // Add values back to state
408  E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
409  ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
410 
411  data += SHA1::BLOCKSIZE/sizeof(word32);
412  length -= SHA1::BLOCKSIZE;
413  }
414 
415  // Save state
416  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
417  _mm_storeu_si128(M128_CAST(state), ABCD);
418  state[4] = _mm_extract_epi32(E0, 3);
419 }
420 
421 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
422 void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
423 {
424  CRYPTOPP_ASSERT(state);
425  CRYPTOPP_ASSERT(data);
426  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
427 
428  __m128i STATE0, STATE1;
429  __m128i MSG, TMP, MASK;
430  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
431  __m128i ABEF_SAVE, CDGH_SAVE;
432 
433  // Load initial values
434  TMP = _mm_loadu_si128(M128_CAST(&state[0]));
435  STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
436 
437  // IA-32 SHA is little endian, SHA::Transform is big endian,
438  // and SHA::HashMultipleBlocks can be either. ByteOrder
439  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
440  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
441  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
442  _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
443 
444  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
445  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
446  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
447  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
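 // The SHA-NI round instruction operates on the state packed as the
 // pairs ABEF and CDGH rather than ABCD and EFGH, hence the shuffle,
 // alignr and blend above; the inverse repacking is done after the loop
 // before the digest is stored.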
448 
449  while (length >= SHA256::BLOCKSIZE)
450  {
451  // Save current hash
452  ABEF_SAVE = STATE0;
453  CDGH_SAVE = STATE1;
454 
455  // Rounds 0-3
456  MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
457  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
458  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
459  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
460  MSG = _mm_shuffle_epi32(MSG, 0x0E);
461  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
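 // Each _mm_sha256rnds2_epu32 call performs two rounds using the two
 // W+K values in the low 64 bits of MSG; the shuffle by 0x0E moves the
 // upper two W+K values into the low half for the next two rounds. The
 // same pattern repeats for every four-round group below.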
462 
463  // Rounds 4-7
464  TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
465  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
466  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
467  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
468  MSG = _mm_shuffle_epi32(MSG, 0x0E);
469  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
470  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
471 
472  // Rounds 8-11
473  TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
474  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
475  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
476  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
477  MSG = _mm_shuffle_epi32(MSG, 0x0E);
478  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
479  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
480 
481  // Rounds 12-15
482  TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
483  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
484  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
485  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
486  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
487  TMSG0 = _mm_add_epi32(TMSG0, TMP);
488  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
489  MSG = _mm_shuffle_epi32(MSG, 0x0E);
490  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
491  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
492 
493  // Rounds 16-19
494  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
495  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
496  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
497  TMSG1 = _mm_add_epi32(TMSG1, TMP);
498  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
499  MSG = _mm_shuffle_epi32(MSG, 0x0E);
500  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
501  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
502 
503  // Rounds 20-23
504  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
505  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
506  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
507  TMSG2 = _mm_add_epi32(TMSG2, TMP);
508  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
509  MSG = _mm_shuffle_epi32(MSG, 0x0E);
510  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
511  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
512 
513  // Rounds 24-27
514  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
515  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
516  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
517  TMSG3 = _mm_add_epi32(TMSG3, TMP);
518  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
519  MSG = _mm_shuffle_epi32(MSG, 0x0E);
520  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
521  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
522 
523  // Rounds 28-31
524  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
525  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
526  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
527  TMSG0 = _mm_add_epi32(TMSG0, TMP);
528  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
529  MSG = _mm_shuffle_epi32(MSG, 0x0E);
530  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
531  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
532 
533  // Rounds 32-35
534  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
535  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
536  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
537  TMSG1 = _mm_add_epi32(TMSG1, TMP);
538  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
539  MSG = _mm_shuffle_epi32(MSG, 0x0E);
540  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
541  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
542 
543  // Rounds 36-39
544  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
545  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
546  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
547  TMSG2 = _mm_add_epi32(TMSG2, TMP);
548  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
549  MSG = _mm_shuffle_epi32(MSG, 0x0E);
550  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
551  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
552 
553  // Rounds 40-43
554  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
555  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
556  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
557  TMSG3 = _mm_add_epi32(TMSG3, TMP);
558  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
559  MSG = _mm_shuffle_epi32(MSG, 0x0E);
560  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
561  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
562 
563  // Rounds 44-47
564  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
565  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
566  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
567  TMSG0 = _mm_add_epi32(TMSG0, TMP);
568  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
569  MSG = _mm_shuffle_epi32(MSG, 0x0E);
570  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
571  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
572 
573  // Rounds 48-51
574  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
575  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
576  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
577  TMSG1 = _mm_add_epi32(TMSG1, TMP);
578  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
579  MSG = _mm_shuffle_epi32(MSG, 0x0E);
580  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
581  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
582 
583  // Rounds 52-55
584  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
585  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
586  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
587  TMSG2 = _mm_add_epi32(TMSG2, TMP);
588  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
589  MSG = _mm_shuffle_epi32(MSG, 0x0E);
590  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
591 
592  // Rounds 56-59
593  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
594  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
595  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
596  TMSG3 = _mm_add_epi32(TMSG3, TMP);
597  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
598  MSG = _mm_shuffle_epi32(MSG, 0x0E);
599  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
600 
601  // Rounds 60-63
602  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
603  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
604  MSG = _mm_shuffle_epi32(MSG, 0x0E);
605  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
606 
607  // Add values back to state
608  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
609  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
610 
611  data += SHA256::BLOCKSIZE/sizeof(word32);
612  length -= SHA256::BLOCKSIZE;
613  }
614 
615  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
616  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
617  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
 618  STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
619 
620  // Save state
621  _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
622  _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
623 }
624 #endif // CRYPTOPP_SHANI_AVAILABLE
625 
626 ///////////////////////////////////
627 // end of Walton and Gulley code //
628 ///////////////////////////////////
629 
630 // ***************** ARMV8 SHA ********************
631 
632 /////////////////////////////////////////////////////////////
633 // start of Walton, Schneiders, O'Rourke and Hovsmith code //
634 /////////////////////////////////////////////////////////////
635 
636 #if CRYPTOPP_ARM_SHA1_AVAILABLE
637 void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
638 {
639  CRYPTOPP_ASSERT(state);
640  CRYPTOPP_ASSERT(data);
641  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
642 
643  uint32x4_t C0, C1, C2, C3;
644  uint32x4_t ABCD, ABCD_SAVED;
645  uint32x4_t MSG0, MSG1, MSG2, MSG3;
646  uint32x4_t TMP0, TMP1;
647  uint32_t E0, E0_SAVED, E1;
648 
649  // Load initial values
650  C0 = vdupq_n_u32(0x5A827999);
651  C1 = vdupq_n_u32(0x6ED9EBA1);
652  C2 = vdupq_n_u32(0x8F1BBCDC);
653  C3 = vdupq_n_u32(0xCA62C1D6);
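 // C0-C3 are the four SHA-1 round constants, each used for 20 rounds.
 // The matching round functions are chosen by the intrinsic itself:
 // vsha1cq_u32 (Ch), vsha1pq_u32 (Parity) and vsha1mq_u32 (Maj).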
654 
655  ABCD = vld1q_u32(&state[0]);
656  E0 = state[4];
657 
658  while (length >= SHA1::BLOCKSIZE)
659  {
660  // Save current hash
661  ABCD_SAVED = ABCD;
662  E0_SAVED = E0;
663 
664  MSG0 = vld1q_u32(data + 0);
665  MSG1 = vld1q_u32(data + 4);
666  MSG2 = vld1q_u32(data + 8);
667  MSG3 = vld1q_u32(data + 12);
668 
669  if (order == BIG_ENDIAN_ORDER) // Data arrangement
670  {
671  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
672  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
673  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
674  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
675  }
676 
677  TMP0 = vaddq_u32(MSG0, C0);
678  TMP1 = vaddq_u32(MSG1, C0);
679 
680  // Rounds 0-3
681  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
682  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
683  TMP0 = vaddq_u32(MSG2, C0);
684  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
685 
686  // Rounds 4-7
687  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
688  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
689  TMP1 = vaddq_u32(MSG3, C0);
690  MSG0 = vsha1su1q_u32(MSG0, MSG3);
691  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
692 
693  // Rounds 8-11
694  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
695  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
696  TMP0 = vaddq_u32(MSG0, C0);
697  MSG1 = vsha1su1q_u32(MSG1, MSG0);
698  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
699 
700  // Rounds 12-15
701  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
702  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
703  TMP1 = vaddq_u32(MSG1, C1);
704  MSG2 = vsha1su1q_u32(MSG2, MSG1);
705  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
706 
707  // Rounds 16-19
708  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
709  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
710  TMP0 = vaddq_u32(MSG2, C1);
711  MSG3 = vsha1su1q_u32(MSG3, MSG2);
712  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
713 
714  // Rounds 20-23
715  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
716  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
717  TMP1 = vaddq_u32(MSG3, C1);
718  MSG0 = vsha1su1q_u32(MSG0, MSG3);
719  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
720 
721  // Rounds 24-27
722  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
723  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
724  TMP0 = vaddq_u32(MSG0, C1);
725  MSG1 = vsha1su1q_u32(MSG1, MSG0);
726  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
727 
728  // Rounds 28-31
729  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
730  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
731  TMP1 = vaddq_u32(MSG1, C1);
732  MSG2 = vsha1su1q_u32(MSG2, MSG1);
733  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
734 
735  // Rounds 32-35
736  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
737  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
738  TMP0 = vaddq_u32(MSG2, C2);
739  MSG3 = vsha1su1q_u32(MSG3, MSG2);
740  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
741 
742  // Rounds 36-39
743  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
744  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
745  TMP1 = vaddq_u32(MSG3, C2);
746  MSG0 = vsha1su1q_u32(MSG0, MSG3);
747  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
748 
749  // Rounds 40-43
750  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
751  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
752  TMP0 = vaddq_u32(MSG0, C2);
753  MSG1 = vsha1su1q_u32(MSG1, MSG0);
754  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
755 
756  // Rounds 44-47
757  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
758  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
759  TMP1 = vaddq_u32(MSG1, C2);
760  MSG2 = vsha1su1q_u32(MSG2, MSG1);
761  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
762 
763  // Rounds 48-51
764  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
765  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
766  TMP0 = vaddq_u32(MSG2, C2);
767  MSG3 = vsha1su1q_u32(MSG3, MSG2);
768  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
769 
770  // Rounds 52-55
771  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
772  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
773  TMP1 = vaddq_u32(MSG3, C3);
774  MSG0 = vsha1su1q_u32(MSG0, MSG3);
775  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
776 
777  // Rounds 56-59
778  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
779  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
780  TMP0 = vaddq_u32(MSG0, C3);
781  MSG1 = vsha1su1q_u32(MSG1, MSG0);
782  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
783 
784  // Rounds 60-63
785  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
786  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
787  TMP1 = vaddq_u32(MSG1, C3);
788  MSG2 = vsha1su1q_u32(MSG2, MSG1);
789  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
790 
791  // Rounds 64-67
792  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
793  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
794  TMP0 = vaddq_u32(MSG2, C3);
795  MSG3 = vsha1su1q_u32(MSG3, MSG2);
796  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
797 
798  // Rounds 68-71
799  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
800  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
801  TMP1 = vaddq_u32(MSG3, C3);
802  MSG0 = vsha1su1q_u32(MSG0, MSG3);
803 
804  // Rounds 72-75
805  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
806  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
807 
808  // Rounds 76-79
809  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
810  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
811 
812  E0 += E0_SAVED;
813  ABCD = vaddq_u32(ABCD_SAVED, ABCD);
814 
815  data += SHA1::BLOCKSIZE/sizeof(word32);
816  length -= SHA1::BLOCKSIZE;
817  }
818 
819  // Save state
820  vst1q_u32(&state[0], ABCD);
821  state[4] = E0;
822 }
823 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
824 
825 #if CRYPTOPP_ARM_SHA2_AVAILABLE
826 void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
827 {
828  CRYPTOPP_ASSERT(state);
829  CRYPTOPP_ASSERT(data);
830  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
831 
832  uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
833  uint32x4_t MSG0, MSG1, MSG2, MSG3;
834  uint32x4_t TMP0, TMP1, TMP2;
835 
836  // Load initial values
837  STATE0 = vld1q_u32(&state[0]);
838  STATE1 = vld1q_u32(&state[4]);
839 
840  while (length >= SHA256::BLOCKSIZE)
841  {
842  // Save current hash
843  ABEF_SAVE = STATE0;
844  CDGH_SAVE = STATE1;
845 
846  // Load message
847  MSG0 = vld1q_u32(data + 0);
848  MSG1 = vld1q_u32(data + 4);
849  MSG2 = vld1q_u32(data + 8);
850  MSG3 = vld1q_u32(data + 12);
851 
852  if (order == BIG_ENDIAN_ORDER) // Data arrangement
853  {
854  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
855  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
856  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
857  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
858  }
859 
860  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
861 
862  // Rounds 0-3
863  MSG0 = vsha256su0q_u32(MSG0, MSG1);
864  TMP2 = STATE0;
865  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
866  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
867  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
868  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
869 
870  // Rounds 4-7
871  MSG1 = vsha256su0q_u32(MSG1, MSG2);
872  TMP2 = STATE0;
873  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
874  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
875  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
876  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
877 
878  // Rounds 8-11
879  MSG2 = vsha256su0q_u32(MSG2, MSG3);
880  TMP2 = STATE0;
881  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
882  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
883  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
884  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
885 
886  // Rounds 12-15
887  MSG3 = vsha256su0q_u32(MSG3, MSG0);
888  TMP2 = STATE0;
889  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
890  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
891  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
892  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
893 
894  // Rounds 16-19
895  MSG0 = vsha256su0q_u32(MSG0, MSG1);
896  TMP2 = STATE0;
897  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
898  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
899  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
900  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
901 
902  // Rounds 20-23
903  MSG1 = vsha256su0q_u32(MSG1, MSG2);
904  TMP2 = STATE0;
905  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
906  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
907  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
908  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
909 
910  // Rounds 24-27
911  MSG2 = vsha256su0q_u32(MSG2, MSG3);
912  TMP2 = STATE0;
913  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
914  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
915  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
916  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
917 
918  // Rounds 28-31
919  MSG3 = vsha256su0q_u32(MSG3, MSG0);
920  TMP2 = STATE0;
921  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
922  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
923  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
924  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
925 
926  // Rounds 32-35
927  MSG0 = vsha256su0q_u32(MSG0, MSG1);
928  TMP2 = STATE0;
929  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
930  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
931  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
932  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
933 
934  // Rounds 36-39
935  MSG1 = vsha256su0q_u32(MSG1, MSG2);
936  TMP2 = STATE0;
937  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
938  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
939  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
940  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
941 
942  // Rounds 40-43
943  MSG2 = vsha256su0q_u32(MSG2, MSG3);
944  TMP2 = STATE0;
945  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
946  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
947  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
948  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
949 
950  // Rounds 44-47
951  MSG3 = vsha256su0q_u32(MSG3, MSG0);
952  TMP2 = STATE0;
953  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
954  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
955  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
956  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
957 
958  // Rounds 48-51
959  TMP2 = STATE0;
960  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
961  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
962  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
963 
964  // Rounds 52-55
965  TMP2 = STATE0;
966  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
967  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
968  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
969 
970  // Rounds 56-59
971  TMP2 = STATE0;
972  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
973  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
974  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
975 
976  // Rounds 60-63
977  TMP2 = STATE0;
978  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
979  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
980 
981  // Add back to state
982  STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
983  STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
984 
985  data += SHA256::BLOCKSIZE/sizeof(word32);
986  length -= SHA256::BLOCKSIZE;
987  }
988 
989  // Save state
990  vst1q_u32(&state[0], STATE0);
991  vst1q_u32(&state[4], STATE1);
992 }
993 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
994 
995 ///////////////////////////////////////////////////////////
996 // end of Walton, Schneiders, O'Rourke and Hovsmith code //
997 ///////////////////////////////////////////////////////////
998 
999 // ***************** Power8 SHA ********************
1000 
1001 //////////////////////////////////////////////////
1002 // start Gustavo, Serra, Scalet and Walton code //
1003 //////////////////////////////////////////////////
1004 
1005 #if CRYPTOPP_POWER8_SHA_AVAILABLE
1006 
1007 // Indexes into the S[] array
1008 enum {A=0, B=1, C, D, E, F, G, H};
1009 
1010 inline
1011 uint32x4_p VecLoad32(const word32* data, int offset)
1012 {
1013 #if (CRYPTOPP_LITTLE_ENDIAN)
1014  const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1015  const uint32x4_p val = VecLoad(offset, data);
1016  return (uint32x4_p)VecPermute(val, val, mask);
1017 #else
1018  return VecLoad(offset, data);
1019 #endif
1020 }
1021 
1022 template<class T> inline
1023 void VecStore32(const T data, word32 dest[4])
1024 {
1025  VecStore(data, dest);
1026 }
1027 
1028 inline
1029 uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1030 {
1031  // The trick below is due to Andy Polyakov and Jack Lloyd
1032  return vec_sel(z,y,x);
1033 }
1034 
1035 inline
1036 uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1037 {
1038  // The trick below is due to Andy Polyakov and Jack Lloyd
1039  return vec_sel(y, z, VecXor(x, y));
1040 }
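
// Why vec_sel works here: Ch(x,y,z) = (x&y)^(~x&z) picks a bit from y
// where x is 1 and from z where x is 0, which is exactly vec_sel(z,y,x).
// For Maj(x,y,z), wherever x and y agree the majority is that common bit,
// otherwise it is z; vec_sel(y, z, x^y) makes the same choice.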
1041 
1042 inline
1043 uint32x4_p Vector_sigma0(const uint32x4_p val)
1044 {
1045  return VecSHA256<0,0>(val);
1046 }
1047 
1048 inline
1049 uint32x4_p Vector_sigma1(const uint32x4_p val)
1050 {
1051  return VecSHA256<0,0xf>(val);
1052 }
1053 
1054 inline
1055 uint32x4_p VectorSigma0(const uint32x4_p val)
1056 {
1057  return VecSHA256<1,0>(val);
1058 }
1059 
1060 inline
1061 uint32x4_p VectorSigma1(const uint32x4_p val)
1062 {
1063  return VecSHA256<1,0xf>(val);
1064 }
1065 
1066 inline
1067 uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1068  const uint32x4_p c, const uint32x4_p d)
1069 {
1070  const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1071  const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1072  return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1073 }
1074 
1075 template <unsigned int R> inline
1076 void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1077 {
1078  uint32x4_p T1, T2;
1079 
1080  W[R] = M;
1081  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1082  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1083 
1084  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1085  S[E] = S[D] + T1;
1086  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1087  S[A] = T1 + T2;
1088 }
1089 
1090 template <unsigned int R> inline
1091 void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1092 {
1093  // Indexes into the W[] array
1094  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1095 
1096  const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1097  const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1098 
1099  uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1100  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1101  uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1102 
1103  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1104  S[E] = S[D] + T1;
1105  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1106  S[A] = T1 + T2;
1107 }
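
// Note on the data layout used below: each S[] entry is a full vector,
// but only its first 32-bit element carries the working variable (B, C
// and D are successive 4-byte slides of the loaded abcd vector, and
// F, G, H likewise of efgh). The vector arithmetic therefore behaves
// like scalar SHA-256 in element 0; the remaining lanes hold don't-care
// values that VectorPack discards when the state is folded back in.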
1108 
1109 void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1110 {
1111  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1112  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1113  CRYPTOPP_UNUSED(order);
1114 
1115  const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1116  const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1117 
1118  uint32x4_p abcd = VecLoad(state+0);
1119  uint32x4_p efgh = VecLoad(state+4);
1120  uint32x4_p W[16], S[8], vm, vk;
1121 
1122  size_t blocks = length / SHA256::BLOCKSIZE;
1123  while (blocks--)
1124  {
1125  unsigned int offset=0;
1126 
1127  S[A] = abcd; S[E] = efgh;
1128  S[B] = VecShiftLeftOctet<4>(S[A]);
1129  S[F] = VecShiftLeftOctet<4>(S[E]);
1130  S[C] = VecShiftLeftOctet<4>(S[B]);
1131  S[G] = VecShiftLeftOctet<4>(S[F]);
1132  S[D] = VecShiftLeftOctet<4>(S[C]);
1133  S[H] = VecShiftLeftOctet<4>(S[G]);
1134 
 1135  // Rounds 0-15
1136  vk = VecLoad(offset, k);
1137  vm = VecLoad32(m, offset);
1138  SHA256_ROUND1<0>(W,S, vk,vm);
1139  offset+=16;
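 // Each VecLoad above pulls four 32-bit constants (or message words) at
 // the given byte offset; VecShiftLeftOctet<4> then slides the next word
 // into the first element for the following round, so offset advances by
 // 16 bytes once per group of four rounds.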
1140 
1141  vk = VecShiftLeftOctet<4>(vk);
1142  vm = VecShiftLeftOctet<4>(vm);
1143  SHA256_ROUND1<1>(W,S, vk,vm);
1144 
1145  vk = VecShiftLeftOctet<4>(vk);
1146  vm = VecShiftLeftOctet<4>(vm);
1147  SHA256_ROUND1<2>(W,S, vk,vm);
1148 
1149  vk = VecShiftLeftOctet<4>(vk);
1150  vm = VecShiftLeftOctet<4>(vm);
1151  SHA256_ROUND1<3>(W,S, vk,vm);
1152 
1153  vk = VecLoad(offset, k);
1154  vm = VecLoad32(m, offset);
1155  SHA256_ROUND1<4>(W,S, vk,vm);
1156  offset+=16;
1157 
1158  vk = VecShiftLeftOctet<4>(vk);
1159  vm = VecShiftLeftOctet<4>(vm);
1160  SHA256_ROUND1<5>(W,S, vk,vm);
1161 
1162  vk = VecShiftLeftOctet<4>(vk);
1163  vm = VecShiftLeftOctet<4>(vm);
1164  SHA256_ROUND1<6>(W,S, vk,vm);
1165 
1166  vk = VecShiftLeftOctet<4>(vk);
1167  vm = VecShiftLeftOctet<4>(vm);
1168  SHA256_ROUND1<7>(W,S, vk,vm);
1169 
1170  vk = VecLoad(offset, k);
1171  vm = VecLoad32(m, offset);
1172  SHA256_ROUND1<8>(W,S, vk,vm);
1173  offset+=16;
1174 
1175  vk = VecShiftLeftOctet<4>(vk);
1176  vm = VecShiftLeftOctet<4>(vm);
1177  SHA256_ROUND1<9>(W,S, vk,vm);
1178 
1179  vk = VecShiftLeftOctet<4>(vk);
1180  vm = VecShiftLeftOctet<4>(vm);
1181  SHA256_ROUND1<10>(W,S, vk,vm);
1182 
1183  vk = VecShiftLeftOctet<4>(vk);
1184  vm = VecShiftLeftOctet<4>(vm);
1185  SHA256_ROUND1<11>(W,S, vk,vm);
1186 
1187  vk = VecLoad(offset, k);
1188  vm = VecLoad32(m, offset);
1189  SHA256_ROUND1<12>(W,S, vk,vm);
1190  offset+=16;
1191 
1192  vk = VecShiftLeftOctet<4>(vk);
1193  vm = VecShiftLeftOctet<4>(vm);
1194  SHA256_ROUND1<13>(W,S, vk,vm);
1195 
1196  vk = VecShiftLeftOctet<4>(vk);
1197  vm = VecShiftLeftOctet<4>(vm);
1198  SHA256_ROUND1<14>(W,S, vk,vm);
1199 
1200  vk = VecShiftLeftOctet<4>(vk);
1201  vm = VecShiftLeftOctet<4>(vm);
1202  SHA256_ROUND1<15>(W,S, vk,vm);
1203 
1204  m += 16; // 32-bit words, not bytes
1205 
 1206  // Rounds 16-63
1207  for (unsigned int i=16; i<64; i+=16)
1208  {
1209  vk = VecLoad(offset, k);
1210  SHA256_ROUND2<0>(W,S, vk);
1211  SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1212  SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1213  SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1214  offset+=16;
1215 
1216  vk = VecLoad(offset, k);
1217  SHA256_ROUND2<4>(W,S, vk);
1218  SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1219  SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1220  SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1221  offset+=16;
1222 
1223  vk = VecLoad(offset, k);
1224  SHA256_ROUND2<8>(W,S, vk);
1225  SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1226  SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1227  SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1228  offset+=16;
1229 
1230  vk = VecLoad(offset, k);
1231  SHA256_ROUND2<12>(W,S, vk);
1232  SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1233  SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1234  SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1235  offset+=16;
1236  }
1237 
1238  abcd += VectorPack(S[A],S[B],S[C],S[D]);
1239  efgh += VectorPack(S[E],S[F],S[G],S[H]);
1240  }
1241 
1242  VecStore32(abcd, state+0);
1243  VecStore32(efgh, state+4);
1244 }
1245 
1246 inline
1247 void VecStore64(const uint64x2_p val, word64* data)
1248 {
1249  VecStore(val, data);
1250 }
1251 
1252 inline
1253 uint64x2_p VecLoad64(const word64* data, int offset)
1254 {
1255 #if (CRYPTOPP_LITTLE_ENDIAN)
1256  const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1257  return VecPermute(VecLoad(offset, data), mask);
1258 #else
1259  return VecLoad(offset, data);
1260 #endif
1261 }
1262 
1263 inline
1264 uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1265 {
1266  // The trick below is due to Andy Polyakov and Jack Lloyd
1267  return vec_sel(z,y,x);
1268 }
1269 
1270 inline
1271 uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1272 {
1273  // The trick below is due to Andy Polyakov and Jack Lloyd
1274  return vec_sel(y, z, VecXor(x, y));
1275 }
1276 
1277 inline
1278 uint64x2_p Vector_sigma0(const uint64x2_p val)
1279 {
1280  return VecSHA512<0,0>(val);
1281 }
1282 
1283 inline
1284 uint64x2_p Vector_sigma1(const uint64x2_p val)
1285 {
1286  return VecSHA512<0,0xf>(val);
1287 }
1288 
1289 inline
1290 uint64x2_p VectorSigma0(const uint64x2_p val)
1291 {
1292  return VecSHA512<1,0>(val);
1293 }
1294 
1295 inline
1296 uint64x2_p VectorSigma1(const uint64x2_p val)
1297 {
1298  return VecSHA512<1,0xf>(val);
1299 }
1300 
1301 inline
1302 uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1303 {
1304  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1305  return VecPermute(x,y,m);
1306 }
1307 
1308 template <unsigned int R> inline
1309 void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1310 {
1311  uint64x2_p T1, T2;
1312 
1313  W[R] = M;
1314  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1315  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1316 
1317  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1318  S[E] = S[D] + T1;
1319  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1320  S[A] = T1 + T2;
1321 }
1322 
1323 template <unsigned int R> inline
1324 void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1325 {
1326  // Indexes into the W[] array
1327  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1328 
1329  const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1330  const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1331 
1332  uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1333  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1334  uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1335 
1336  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1337  S[E] = S[D] + T1;
1338  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1339  S[A] = T1 + T2;
1340 }
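
// As in the SHA-256 code above, only the first 64-bit element of each
// S[] and W[] vector is meaningful; the state is carried as the pairs
// ab, cd, ef and gh and repacked with VectorPack after each block.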
1341 
1342 void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1343 {
1344  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1345  CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1346  CRYPTOPP_UNUSED(order);
1347 
1348  const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1349  const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1350 
1351  uint64x2_p ab = VecLoad(state+0);
1352  uint64x2_p cd = VecLoad(state+2);
1353  uint64x2_p ef = VecLoad(state+4);
1354  uint64x2_p gh = VecLoad(state+6);
1355  uint64x2_p W[16], S[8], vm, vk;
1356 
1357  size_t blocks = length / SHA512::BLOCKSIZE;
1358  while (blocks--)
1359  {
1360  unsigned int offset=0;
1361 
1362  S[A] = ab; S[C] = cd;
1363  S[E] = ef; S[G] = gh;
1364  S[B] = VecShiftLeftOctet<8>(S[A]);
1365  S[D] = VecShiftLeftOctet<8>(S[C]);
1366  S[F] = VecShiftLeftOctet<8>(S[E]);
1367  S[H] = VecShiftLeftOctet<8>(S[G]);
1368 
 1369  // Rounds 0-15
1370  vk = VecLoad(offset, k);
1371  vm = VecLoad64(m, offset);
1372  SHA512_ROUND1<0>(W,S, vk,vm);
1373  offset+=16;
1374 
1375  vk = VecShiftLeftOctet<8>(vk);
1376  vm = VecShiftLeftOctet<8>(vm);
1377  SHA512_ROUND1<1>(W,S, vk,vm);
1378 
1379  vk = VecLoad(offset, k);
1380  vm = VecLoad64(m, offset);
1381  SHA512_ROUND1<2>(W,S, vk,vm);
1382  offset+=16;
1383 
1384  vk = VecShiftLeftOctet<8>(vk);
1385  vm = VecShiftLeftOctet<8>(vm);
1386  SHA512_ROUND1<3>(W,S, vk,vm);
1387 
1388  vk = VecLoad(offset, k);
1389  vm = VecLoad64(m, offset);
1390  SHA512_ROUND1<4>(W,S, vk,vm);
1391  offset+=16;
1392 
1393  vk = VecShiftLeftOctet<8>(vk);
1394  vm = VecShiftLeftOctet<8>(vm);
1395  SHA512_ROUND1<5>(W,S, vk,vm);
1396 
1397  vk = VecLoad(offset, k);
1398  vm = VecLoad64(m, offset);
1399  SHA512_ROUND1<6>(W,S, vk,vm);
1400  offset+=16;
1401 
1402  vk = VecShiftLeftOctet<8>(vk);
1403  vm = VecShiftLeftOctet<8>(vm);
1404  SHA512_ROUND1<7>(W,S, vk,vm);
1405 
1406  vk = VecLoad(offset, k);
1407  vm = VecLoad64(m, offset);
1408  SHA512_ROUND1<8>(W,S, vk,vm);
1409  offset+=16;
1410 
1411  vk = VecShiftLeftOctet<8>(vk);
1412  vm = VecShiftLeftOctet<8>(vm);
1413  SHA512_ROUND1<9>(W,S, vk,vm);
1414 
1415  vk = VecLoad(offset, k);
1416  vm = VecLoad64(m, offset);
1417  SHA512_ROUND1<10>(W,S, vk,vm);
1418  offset+=16;
1419 
1420  vk = VecShiftLeftOctet<8>(vk);
1421  vm = VecShiftLeftOctet<8>(vm);
1422  SHA512_ROUND1<11>(W,S, vk,vm);
1423 
1424  vk = VecLoad(offset, k);
1425  vm = VecLoad64(m, offset);
1426  SHA512_ROUND1<12>(W,S, vk,vm);
1427  offset+=16;
1428 
1429  vk = VecShiftLeftOctet<8>(vk);
1430  vm = VecShiftLeftOctet<8>(vm);
1431  SHA512_ROUND1<13>(W,S, vk,vm);
1432 
1433  vk = VecLoad(offset, k);
1434  vm = VecLoad64(m, offset);
1435  SHA512_ROUND1<14>(W,S, vk,vm);
1436  offset+=16;
1437 
1438  vk = VecShiftLeftOctet<8>(vk);
1439  vm = VecShiftLeftOctet<8>(vm);
1440  SHA512_ROUND1<15>(W,S, vk,vm);
1441 
1442  m += 16; // 64-bit words, not bytes
1443 
 1444  // Rounds 16-79
1445  for (unsigned int i=16; i<80; i+=16)
1446  {
1447  vk = VecLoad(offset, k);
1448  SHA512_ROUND2<0>(W,S, vk);
1449  SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1450  offset+=16;
1451 
1452  vk = VecLoad(offset, k);
1453  SHA512_ROUND2<2>(W,S, vk);
1454  SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1455  offset+=16;
1456 
1457  vk = VecLoad(offset, k);
1458  SHA512_ROUND2<4>(W,S, vk);
1459  SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1460  offset+=16;
1461 
1462  vk = VecLoad(offset, k);
1463  SHA512_ROUND2<6>(W,S, vk);
1464  SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1465  offset+=16;
1466 
1467  vk = VecLoad(offset, k);
1468  SHA512_ROUND2<8>(W,S, vk);
1469  SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1470  offset+=16;
1471 
1472  vk = VecLoad(offset, k);
1473  SHA512_ROUND2<10>(W,S, vk);
1474  SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1475  offset+=16;
1476 
1477  vk = VecLoad(offset, k);
1478  SHA512_ROUND2<12>(W,S, vk);
1479  SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1480  offset+=16;
1481 
1482  vk = VecLoad(offset, k);
1483  SHA512_ROUND2<14>(W,S, vk);
1484  SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1485  offset+=16;
1486  }
1487 
1488  ab += VectorPack(S[A],S[B]);
1489  cd += VectorPack(S[C],S[D]);
1490  ef += VectorPack(S[E],S[F]);
1491  gh += VectorPack(S[G],S[H]);
1492  }
1493 
1494  VecStore64(ab, state+0);
1495  VecStore64(cd, state+2);
1496  VecStore64(ef, state+4);
1497  VecStore64(gh, state+6);
1498 }
1499 
1500 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1501 
1502 ////////////////////////////////////////////////
1503 // end Gustavo, Serra, Scalet and Walton code //
1504 ////////////////////////////////////////////////
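
// Editor's note (illustrative only, not part of sha_simd.cpp): the
// routines above are block-level back ends; a caller is expected to do
// runtime feature detection and otherwise fall back to portable C++.
// A minimal sketch of that pattern, assuming the cpu.h feature queries
// HasSHA() and HasSHA2() and a hypothetical portable fallback:
#if 0
static void HashBlocks(word32 *state, const word32 *data, size_t length)
{
#if CRYPTOPP_SHANI_AVAILABLE
    if (HasSHA())    // x86 SHA extensions present at runtime
        return SHA256_HashMultipleBlocks_SHANI(state, data, length, BIG_ENDIAN_ORDER);
#endif
#if CRYPTOPP_ARM_SHA2_AVAILABLE
    if (HasSHA2())   // ARMv8 SHA-256 instructions present at runtime
        return SHA256_HashMultipleBlocks_ARMV8(state, data, length, BIG_ENDIAN_ORDER);
#endif
    SHA256_HashMultipleBlocks_CXX(state, data, length);  // hypothetical portable fallback
}
#endif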
1505 
1506 NAMESPACE_END