Crypto++  8.8
Free C++ class library of cryptographic schemes
lsh512_avx.cpp
// lsh512_avx.cpp - written and placed in the public domain by Jeffrey Walton
2 // Based on the specification and source code provided by
3 // Korea Internet & Security Agency (KISA) website. Also
4 // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
5 // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
6 
7 // We are hitting some sort of GCC bug in the LSH AVX2 code path.
8 // Clang is OK on the AVX2 code path. We believe it is GCC Issue
9 // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
10 // makes using zeroupper a little tricky.
11 
12 #include "pch.h"
13 #include "config.h"
14 
15 #include "lsh.h"
16 #include "misc.h"
17 
18 // Squash MS LNK4221 and libtool warnings
19 extern const char LSH512_AVX_FNAME[] = __FILE__;
20 
21 #if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
22 
23 #if defined(CRYPTOPP_AVX2_AVAILABLE)
24 # include <emmintrin.h>
25 # include <immintrin.h>
26 #endif
27 
28 #if defined(CRYPTOPP_GCC_COMPATIBLE)
29 # include <x86intrin.h>
30 #endif
31 
32 ANONYMOUS_NAMESPACE_BEGIN
33 
34 /* LSH Constants */
35 
36 const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
37 // const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
38 // const unsigned int LSH512_CV_BYTE_LEN = 128;
39 const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
40 
41 // const unsigned int MSG_BLK_WORD_LEN = 32;
42 const unsigned int CV_WORD_LEN = 16;
43 const unsigned int CONST_WORD_LEN = 8;
44 // const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
45 const unsigned int NUM_STEPS = 28;
46 
47 const unsigned int ROT_EVEN_ALPHA = 23;
48 const unsigned int ROT_EVEN_BETA = 59;
49 const unsigned int ROT_ODD_ALPHA = 7;
50 const unsigned int ROT_ODD_BETA = 3;
51 
52 const unsigned int LSH_TYPE_512_512 = 0x0010040;
53 const unsigned int LSH_TYPE_512_384 = 0x0010030;
54 const unsigned int LSH_TYPE_512_256 = 0x0010020;
55 const unsigned int LSH_TYPE_512_224 = 0x001001C;
56 
57 // const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
58 // const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
59 
60 /* Error Code */
61 
62 const unsigned int LSH_SUCCESS = 0x0;
63 // const unsigned int LSH_ERR_NULL_PTR = 0x2401;
64 // const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
65 const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
66 const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
67 
68 /* Index into our state array */
69 
70 const unsigned int AlgorithmType = 80;
71 const unsigned int RemainingBits = 81;
72 
73 NAMESPACE_END
74 
75 NAMESPACE_BEGIN(CryptoPP)
76 NAMESPACE_BEGIN(LSH)
77 
78 // lsh512.cpp
79 extern const word64 LSH512_IV224[CV_WORD_LEN];
80 extern const word64 LSH512_IV256[CV_WORD_LEN];
81 extern const word64 LSH512_IV384[CV_WORD_LEN];
82 extern const word64 LSH512_IV512[CV_WORD_LEN];
83 extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
84 
85 NAMESPACE_END // LSH
86 NAMESPACE_END // Crypto++
87 
88 ANONYMOUS_NAMESPACE_BEGIN
89 
90 using CryptoPP::byte;
91 using CryptoPP::word32;
92 using CryptoPP::word64;
95 
96 using CryptoPP::GetBlock;
100 
101 using CryptoPP::LSH::LSH512_IV224;
102 using CryptoPP::LSH::LSH512_IV256;
103 using CryptoPP::LSH::LSH512_IV384;
104 using CryptoPP::LSH::LSH512_IV512;
105 using CryptoPP::LSH::LSH512_StepConstants;
106 
107 typedef byte lsh_u8;
108 typedef word32 lsh_u32;
109 typedef word64 lsh_u64;
110 typedef word32 lsh_uint;
111 typedef word32 lsh_err;
112 typedef word32 lsh_type;
113 
114 struct LSH512_AVX2_Context
115 {
116  LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) :
117  cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
118  last_block(reinterpret_cast<byte*>(state+48)),
119  remain_databitlen(remainingBitLength),
120  alg_type(static_cast<lsh_type>(algType)) {}
121 
122  lsh_u64* cv_l; // start of our state block
123  lsh_u64* cv_r;
124  lsh_u64* sub_msgs;
125  lsh_u8* last_block;
126  lsh_u64& remain_databitlen;
127  lsh_type alg_type;
128 };
129 
130 struct LSH512_AVX2_Internal
131 {
132  LSH512_AVX2_Internal(word64* state) :
133  submsg_e_l(state+16), submsg_e_r(state+24),
134  submsg_o_l(state+32), submsg_o_r(state+40) { }
135 
136  lsh_u64* submsg_e_l; /* even left sub-message */
137  lsh_u64* submsg_e_r; /* even right sub-message */
138  lsh_u64* submsg_o_l; /* odd left sub-message */
139  lsh_u64* submsg_o_r; /* odd right sub-message */
140 };
141 
// Zero the upper 128 bits of all YMM registers on exit.
// It avoids AVX state transition penalties when saving state.
// GCC Issue 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735,
// makes using zeroupper a little tricky.
146 
// Scope guard: issues _mm256_zeroupper() when the object is destroyed,
// so it runs on every exit path of the function that declares it.
struct AVX_Cleanup
{
    ~AVX_Cleanup()
    {
        _mm256_zeroupper();
    }
};
153 
154 // const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
155 
156 /* LSH AlgType Macro */
157 
158 inline bool LSH_IS_LSH512(lsh_uint val) {
159  return (val & 0xf0000) == 0x10000;
160 }
161 
162 inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
163  return val >> 24;
164 }
165 
166 inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
167  return val & 0xffff;
168 }
169 
170 inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
171  return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
172 }
173 
174 inline lsh_u64 loadLE64(lsh_u64 v) {
176 }
177 
178 lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
179  return rotlFixed(x, r);
180 }
181 
182 // Original code relied upon unaligned lsh_u64 buffer
183 inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
184 {
185  lsh_u64* submsg_e_l = i_state->submsg_e_l;
186  lsh_u64* submsg_e_r = i_state->submsg_e_r;
187  lsh_u64* submsg_o_l = i_state->submsg_o_l;
188  lsh_u64* submsg_o_r = i_state->submsg_o_r;
189 
190  _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
191  _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
192  _mm256_storeu_si256(M256_CAST(submsg_e_l+4),
193  _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
194 
195  _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
196  _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
197  _mm256_storeu_si256(M256_CAST(submsg_e_r+4),
198  _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
199 
200  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
201  _mm256_loadu_si256(CONST_M256_CAST(msgblk+128)));
202  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
203  _mm256_loadu_si256(CONST_M256_CAST(msgblk+160)));
204 
205  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
206  _mm256_loadu_si256(CONST_M256_CAST(msgblk+192)));
207  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
208  _mm256_loadu_si256(CONST_M256_CAST(msgblk+224)));
209 }
210 
211 inline void msg_exp_even(LSH512_AVX2_Internal* i_state)
212 {
213  CRYPTOPP_ASSERT(i_state != NULLPTR);
214 
215  lsh_u64* submsg_e_l = i_state->submsg_e_l;
216  lsh_u64* submsg_e_r = i_state->submsg_e_r;
217  lsh_u64* submsg_o_l = i_state->submsg_o_l;
218  lsh_u64* submsg_o_r = i_state->submsg_o_r;
219 
220  _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64(
221  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
222  _mm256_permute4x64_epi64(
223  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
224  _MM_SHUFFLE(1,0,2,3))));
225  _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64(
226  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
227  _mm256_permute4x64_epi64(
228  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
229  _MM_SHUFFLE(2,1,0,3))));
230 
231  _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64(
232  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
233  _mm256_permute4x64_epi64(
234  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
235  _MM_SHUFFLE(1,0,2,3))));
236  _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64(
237  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
238  _mm256_permute4x64_epi64(
239  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
240  _MM_SHUFFLE(2,1,0,3))));
241 }
242 
243 inline void msg_exp_odd(LSH512_AVX2_Internal* i_state)
244 {
245  CRYPTOPP_ASSERT(i_state != NULLPTR);
246 
247  lsh_u64* submsg_e_l = i_state->submsg_e_l;
248  lsh_u64* submsg_e_r = i_state->submsg_e_r;
249  lsh_u64* submsg_o_l = i_state->submsg_o_l;
250  lsh_u64* submsg_o_r = i_state->submsg_o_r;
251 
252  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
253  _mm256_add_epi64(
254  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
255  _mm256_permute4x64_epi64(
256  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
257  _MM_SHUFFLE(1,0,2,3))));
258  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
259  _mm256_add_epi64(
260  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
261  _mm256_permute4x64_epi64(
262  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
263  _MM_SHUFFLE(2,1,0,3))));
264 
265  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
266  _mm256_add_epi64(
267  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
268  _mm256_permute4x64_epi64(
269  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
270  _MM_SHUFFLE(1,0,2,3))));
271  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
272  _mm256_add_epi64(
273  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
274  _mm256_permute4x64_epi64(
275  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
276  _MM_SHUFFLE(2,1,0,3))));
277 }
278 
279 inline void load_sc(const lsh_u64** p_const_v, size_t i)
280 {
281  *p_const_v = &LSH512_StepConstants[i];
282 }
283 
284 inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
285 {
286  CRYPTOPP_ASSERT(i_state != NULLPTR);
287 
288  lsh_u64* submsg_e_l = i_state->submsg_e_l;
289  lsh_u64* submsg_e_r = i_state->submsg_e_r;
290 
291  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
292  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
293  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l))));
294  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
295  _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
296  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r))));
297 
298  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
299  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
300  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4))));
301  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
302  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
303  _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4))));
304 }
305 
306 inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
307 {
308  CRYPTOPP_ASSERT(i_state != NULLPTR);
309 
310  lsh_u64* submsg_o_l = i_state->submsg_o_l;
311  lsh_u64* submsg_o_r = i_state->submsg_o_r;
312 
313  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
314  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
315  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
316  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
317  _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
318  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
319 
320  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
321  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
322  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4))));
323  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
324  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
325  _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4))));
326 }
327 
328 inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
329 {
330  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64(
331  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
332  _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
333  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64(
334  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
335  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))));
336 }
337 
338 template <unsigned int R>
339 inline void rotate_blk(lsh_u64 cv[8])
340 {
341  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
342  _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
343  _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
344  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
345  _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
346  _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
347 }
348 
349 inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
350 {
351  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
352  _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
353  _mm256_loadu_si256(CONST_M256_CAST(const_v))));
354  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
355  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
356  _mm256_loadu_si256(CONST_M256_CAST(const_v+4))));
357 }
358 
359 inline void rotate_msg_gamma(lsh_u64 cv_r[8])
360 {
361  // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
362  _mm256_storeu_si256(M256_CAST(cv_r+0),
363  _mm256_shuffle_epi8(
364  _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
365  _mm256_set_epi8(
366  /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4,
367  /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
368  _mm256_storeu_si256(M256_CAST(cv_r+4),
369  _mm256_shuffle_epi8(
370  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
371  _mm256_set_epi8(
372  /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3,
373  /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
374 }
375 
376 inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
377 {
378  __m256i temp[2];
379  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64(
380  _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
381  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64(
382  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
383  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64(
384  _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
385  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64(
386  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));
387 
388  temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0));
389  temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0));
390 
391  _mm256_storeu_si256(M256_CAST(cv_l+0),
392  _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)));
393  _mm256_storeu_si256(M256_CAST(cv_l+4),
394  _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)));
395 
396  _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]);
397  _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]);
398 }
399 
400 /* -------------------------------------------------------- *
401 * step function
402 * -------------------------------------------------------- */
403 
404 template <unsigned int Alpha, unsigned int Beta>
405 inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
406 {
407  add_blk(cv_l, cv_r);
408  rotate_blk<Alpha>(cv_l);
409  xor_with_const(cv_l, const_v);
410  add_blk(cv_r, cv_l);
411  rotate_blk<Beta>(cv_r);
412  add_blk(cv_l, cv_r);
413  rotate_msg_gamma(cv_r);
414 }
415 
416 /* -------------------------------------------------------- *
417 * compression function
418 * -------------------------------------------------------- */
419 
420 inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
421 {
422  CRYPTOPP_ASSERT(ctx != NULLPTR);
423 
424  LSH512_AVX2_Internal s_state(ctx->cv_l);
425  LSH512_AVX2_Internal* i_state = &s_state;
426 
427  const lsh_u64* const_v = NULL;
428  lsh_u64 *cv_l = ctx->cv_l;
429  lsh_u64 *cv_r = ctx->cv_r;
430 
431  load_msg_blk(i_state, pdMsgBlk);
432 
433  msg_add_even(cv_l, cv_r, i_state);
434  load_sc(&const_v, 0);
435  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
436  word_perm(cv_l, cv_r);
437 
438  msg_add_odd(cv_l, cv_r, i_state);
439  load_sc(&const_v, 8);
440  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
441  word_perm(cv_l, cv_r);
442 
443  for (size_t i = 1; i < NUM_STEPS / 2; i++)
444  {
445  msg_exp_even(i_state);
446  msg_add_even(cv_l, cv_r, i_state);
447  load_sc(&const_v, 16 * i);
448  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
449  word_perm(cv_l, cv_r);
450 
451  msg_exp_odd(i_state);
452  msg_add_odd(cv_l, cv_r, i_state);
453  load_sc(&const_v, 16 * i + 8);
454  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
455  word_perm(cv_l, cv_r);
456  }
457 
458  msg_exp_even(i_state);
459  msg_add_even(cv_l, cv_r, i_state);
460 }
461 
462 /* -------------------------------------------------------- */
463 
464 inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
465 {
466  // The IV's are 32-byte aligned so we can use aligned loads.
467  _mm256_storeu_si256(M256_CAST(cv_l+0),
468  _mm256_load_si256(CONST_M256_CAST(iv+0)));
469  _mm256_storeu_si256(M256_CAST(cv_l+4),
470  _mm256_load_si256(CONST_M256_CAST(iv+4)));
471 
472  _mm256_storeu_si256(M256_CAST(cv_r+0),
473  _mm256_load_si256(CONST_M256_CAST(iv+8)));
474  _mm256_storeu_si256(M256_CAST(cv_r+4),
475  _mm256_load_si256(CONST_M256_CAST(iv+12)));
476 }
477 
478 inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
479 {
480  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
481  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256());
482  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
483  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256());
484 }
485 
486 inline void zero_submsgs(LSH512_AVX2_Context* ctx)
487 {
488  lsh_u64* sub_msgs = ctx->sub_msgs;
489 
490  _mm256_storeu_si256(M256_CAST(sub_msgs+ 0),
491  _mm256_setzero_si256());
492  _mm256_storeu_si256(M256_CAST(sub_msgs+ 4),
493  _mm256_setzero_si256());
494 
495  _mm256_storeu_si256(M256_CAST(sub_msgs+ 8),
496  _mm256_setzero_si256());
497  _mm256_storeu_si256(M256_CAST(sub_msgs+12),
498  _mm256_setzero_si256());
499 }
500 
501 inline void init224(LSH512_AVX2_Context* ctx)
502 {
503  CRYPTOPP_ASSERT(ctx != NULLPTR);
504 
505  zero_submsgs(ctx);
506  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
507 }
508 
509 inline void init256(LSH512_AVX2_Context* ctx)
510 {
511  CRYPTOPP_ASSERT(ctx != NULLPTR);
512 
513  zero_submsgs(ctx);
514  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
515 }
516 
517 inline void init384(LSH512_AVX2_Context* ctx)
518 {
519  CRYPTOPP_ASSERT(ctx != NULLPTR);
520 
521  zero_submsgs(ctx);
522  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
523 }
524 
525 inline void init512(LSH512_AVX2_Context* ctx)
526 {
527  CRYPTOPP_ASSERT(ctx != NULLPTR);
528 
529  zero_submsgs(ctx);
530  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
531 }
532 
533 /* -------------------------------------------------------- */
534 
535 inline void fin(LSH512_AVX2_Context* ctx)
536 {
537  CRYPTOPP_ASSERT(ctx != NULLPTR);
538 
539  _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
540  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
541  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
542 
543  _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256(
544  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)),
545  _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4))));
546 }
547 
548 /* -------------------------------------------------------- */
549 
550 inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal)
551 {
552  CRYPTOPP_ASSERT(ctx != NULLPTR);
553  CRYPTOPP_ASSERT(ctx->alg_type != 0);
554  CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
555 
556  lsh_uint alg_type = ctx->alg_type;
557  lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
558  lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
559 
560  // Multiplying by sizeof(lsh_u8) looks odd...
561  std::memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
562  if (hash_val_bit_len){
563  pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
564  }
565 }
566 
567 /* -------------------------------------------------------- */
568 
569 lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx)
570 {
571  CRYPTOPP_ASSERT(ctx != NULLPTR);
572  CRYPTOPP_ASSERT(ctx->alg_type != 0);
573 
574  lsh_u32 alg_type = ctx->alg_type;
575  const lsh_u64* const_v = NULL;
576  ctx->remain_databitlen = 0;
577 
578  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
579  AVX_Cleanup cleanup;
580 
581  switch (alg_type){
582  case LSH_TYPE_512_512:
583  init512(ctx);
584  return LSH_SUCCESS;
585  case LSH_TYPE_512_384:
586  init384(ctx);
587  return LSH_SUCCESS;
588  case LSH_TYPE_512_256:
589  init256(ctx);
590  return LSH_SUCCESS;
591  case LSH_TYPE_512_224:
592  init224(ctx);
593  return LSH_SUCCESS;
594  default:
595  break;
596  }
597 
598  lsh_u64* cv_l = ctx->cv_l;
599  lsh_u64* cv_r = ctx->cv_r;
600 
601  zero_iv(cv_l, cv_r);
602  cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
603  cv_l[1] = LSH_GET_HASHBIT(alg_type);
604 
605  for (size_t i = 0; i < NUM_STEPS / 2; i++)
606  {
607  //Mix
608  load_sc(&const_v, i * 16);
609  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
610  word_perm(cv_l, cv_r);
611 
612  load_sc(&const_v, i * 16 + 8);
613  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
614  word_perm(cv_l, cv_r);
615  }
616 
617  return LSH_SUCCESS;
618 }
619 
620 lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
621 {
622  CRYPTOPP_ASSERT(ctx != NULLPTR);
623  CRYPTOPP_ASSERT(data != NULLPTR);
624  CRYPTOPP_ASSERT(databitlen % 8 == 0);
625  CRYPTOPP_ASSERT(ctx->alg_type != 0);
626 
627  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
628  AVX_Cleanup cleanup;
629 
630  if (databitlen == 0){
631  return LSH_SUCCESS;
632  }
633 
634  // We are byte oriented. tail bits will always be 0.
635  size_t databytelen = databitlen >> 3;
636  // lsh_uint pos2 = databitlen & 0x7;
637  const size_t pos2 = 0;
638 
639  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
640  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
641  const size_t remain_msg_bit = 0;
642 
643  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
644  return LSH_ERR_INVALID_STATE;
645  }
646  if (remain_msg_bit > 0){
647  return LSH_ERR_INVALID_DATABITLEN;
648  }
649 
650  if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
651  std::memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
652  ctx->remain_databitlen += (lsh_uint)databitlen;
653  remain_msg_byte += (lsh_uint)databytelen;
654  if (pos2){
655  ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
656  }
657  return LSH_SUCCESS;
658  }
659 
660  if (remain_msg_byte > 0){
661  size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
662  std::memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
663  compress(ctx, ctx->last_block);
664  data += more_byte;
665  databytelen -= more_byte;
666  remain_msg_byte = 0;
667  ctx->remain_databitlen = 0;
668  }
669 
670  while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
671  {
672  // This call to compress caused some trouble.
673  // The data pointer can become unaligned in the
674  // previous block.
675  compress(ctx, data);
676  data += LSH512_MSG_BLK_BYTE_LEN;
677  databytelen -= LSH512_MSG_BLK_BYTE_LEN;
678  }
679 
680  if (databytelen > 0){
681  std::memcpy(ctx->last_block, data, databytelen);
682  ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
683  }
684 
685  if (pos2){
686  ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
687  ctx->remain_databitlen += pos2;
688  }
689  return LSH_SUCCESS;
690 }
691 
692 lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval)
693 {
694  CRYPTOPP_ASSERT(ctx != NULLPTR);
695  CRYPTOPP_ASSERT(hashval != NULLPTR);
696 
697  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
698  AVX_Cleanup cleanup;
699 
700  // We are byte oriented. tail bits will always be 0.
701  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
702  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
703  const size_t remain_msg_bit = 0;
704 
705  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
706  return LSH_ERR_INVALID_STATE;
707  }
708 
709  if (remain_msg_bit){
710  ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
711  }
712  else{
713  ctx->last_block[remain_msg_byte] = 0x80;
714  }
715  std::memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
716 
717  compress(ctx, ctx->last_block);
718 
719  fin(ctx);
720  get_hash(ctx, hashval);
721 
722  return LSH_SUCCESS;
723 }
724 
725 ANONYMOUS_NAMESPACE_END
726 
727 NAMESPACE_BEGIN(CryptoPP)
728 
729 extern
730 void LSH512_Base_Restart_AVX2(word64* state)
731 {
732  state[RemainingBits] = 0;
733  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
734  lsh_err err = lsh512_init_avx2(&ctx);
735 
736  if (err != LSH_SUCCESS)
737  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
738 }
739 
740 extern
741 void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
742 {
743  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
744  lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);
745 
746  if (err != LSH_SUCCESS)
747  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
748 }
749 
750 extern
751 void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
752 {
753  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
754  lsh_err err = lsh512_final_avx2(&ctx, hash);
755 
756  if (err != LSH_SUCCESS)
757  throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
758 }
759 
760 NAMESPACE_END
761 
762 #endif // CRYPTOPP_AVX2_AVAILABLE
Base class for all exceptions thrown by the library.
Definition: cryptlib.h:164
@ OTHER_ERROR
Some other error occurred not belonging to other categories.
Definition: cryptlib.h:182
Library configuration file.
unsigned char byte
8-bit unsigned datatype
Definition: config_int.h:66
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:72
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:101
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:150
EnumToType< ByteOrder, LITTLE_ENDIAN_ORDER > LittleEndian
Provides a constant for LittleEndian.
Definition: cryptlib.h:155
Classes for the LSH hash functions.
Utility functions for the Crypto++ library.
T rotlConstant(T x)
Performs a left rotate.
Definition: misc.h:1757
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianness.
Definition: misc.h:2417
T rotlFixed(T x, unsigned int y)
Performs a left rotate.
Definition: misc.h:1808
Crypto++ library namespace.
Precompiled header file.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68