Crypto++  8.8
Free C++ class library of cryptographic schemes
aria_simd.cpp
1 // aria_simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to SSSE3 and to
5 // ARMv7a and ARMv8a NEON instructions. A separate source file is
6 // needed because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "misc.h"
12 
13 #if (CRYPTOPP_SSSE3_AVAILABLE)
14 # include <tmmintrin.h>
15 #endif
16 
17 #if (CRYPTOPP_ARM_NEON_HEADER)
18 # include <arm_neon.h>
19 #endif
20 
21 #if (CRYPTOPP_ARM_ACLE_HEADER)
22 # include <stdint.h>
23 # include <arm_acle.h>
24 #endif
25 
26 // Squash MS LNK4221 and libtool warnings
27 extern const char ARIA_SIMD_FNAME[] = __FILE__;
28 
29 NAMESPACE_BEGIN(CryptoPP)
30 NAMESPACE_BEGIN(ARIATab)
31 
32 extern const word32 S1[256];
33 extern const word32 S2[256];
34 extern const word32 X1[256];
35 extern const word32 X2[256];
36 extern const word32 KRK[3][4];
37 
38 NAMESPACE_END
39 NAMESPACE_END
40 
41 ANONYMOUS_NAMESPACE_BEGIN
42 
43 using CryptoPP::byte;
44 using CryptoPP::word32;
45 
46 inline byte ARIA_BRF(const word32 x, const int y) {
47  return static_cast<byte>(GETBYTE(x, y));
48 }
49 
50 ANONYMOUS_NAMESPACE_END
51 
52 NAMESPACE_BEGIN(CryptoPP)
53 
54 using CryptoPP::ARIATab::S1;
55 using CryptoPP::ARIATab::S2;
56 using CryptoPP::ARIATab::X1;
57 using CryptoPP::ARIATab::X2;
58 using CryptoPP::ARIATab::KRK;
59 
60 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
61 
62 template <unsigned int N>
63 inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
64 {
65  enum { Q1 = (4-(N/32)) % 4,
66  Q2 = (3-(N/32)) % 4,
67  R = N % 32
68  };
69 
70  vst1q_u8(RK, vreinterpretq_u8_u32(
71  veorq_u32(X, veorq_u32(
72  vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
73  vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))));
74 }
75 
76 void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen)
77 {
78  const uint32x4_t w0 = vld1q_u32(ws+ 0);
79  const uint32x4_t w1 = vld1q_u32(ws+ 8);
80  const uint32x4_t w2 = vld1q_u32(ws+12);
81  const uint32x4_t w3 = vld1q_u32(ws+16);
82 
83  ARIA_GSRK_NEON<19>(w0, w1, rk + 0);
84  ARIA_GSRK_NEON<19>(w1, w2, rk + 16);
85  ARIA_GSRK_NEON<19>(w2, w3, rk + 32);
86  ARIA_GSRK_NEON<19>(w3, w0, rk + 48);
87  ARIA_GSRK_NEON<31>(w0, w1, rk + 64);
88  ARIA_GSRK_NEON<31>(w1, w2, rk + 80);
89  ARIA_GSRK_NEON<31>(w2, w3, rk + 96);
90  ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
91  ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
92  ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
93  ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
94  ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
95  ARIA_GSRK_NEON<97>(w0, w1, rk + 192);
96 
97  if (keylen > 16)
98  {
99  ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
100  ARIA_GSRK_NEON<97>(w2, w3, rk + 224);
101 
102  if (keylen > 24)
103  {
104  ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
105  ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
106  }
107  }
108 }
109 
110 void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
111 {
112  outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
113  outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
114  outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
115  outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
116  outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
117  outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
118  outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
119  outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
120  outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
121  outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
122  outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
123  outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
124  outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
125  outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
126  outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
127  outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
128 
129  // 'outBlock' and 'xorBlock' may be unaligned.
130  if (xorBlock != NULLPTR)
131  {
132  vst1q_u8(outBlock,
133  veorq_u8(
134  vld1q_u8(xorBlock),
135  veorq_u8(
136  vld1q_u8(outBlock),
137  vrev32q_u8(vld1q_u8((rk))))));
138  }
139  else
140  {
141  vst1q_u8(outBlock,
142  veorq_u8(
143  vld1q_u8(outBlock),
144  vrev32q_u8(vld1q_u8(rk))));
145  }
146 }
147 
148 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
149 
150 #if (CRYPTOPP_SSSE3_AVAILABLE)
151 
152 void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
153 {
154  const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
155 
156  outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
157  outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
158  outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
159  outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
160  outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
161  outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
162  outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
163  outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
164  outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
165  outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
166  outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
167  outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
168  outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
169  outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
170  outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
171  outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
172 
173  // 'outBlock' and 'xorBlock' may be unaligned.
174  if (xorBlock != NULLPTR)
175  {
176  _mm_storeu_si128(M128_CAST(outBlock),
177  _mm_xor_si128(
178  _mm_loadu_si128(CONST_M128_CAST(xorBlock)),
179  _mm_xor_si128(
180  _mm_loadu_si128(CONST_M128_CAST(outBlock)),
181  _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)))
182  );
183  }
184  else
185  {
186  _mm_storeu_si128(M128_CAST(outBlock),
187  _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)),
188  _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)));
189  }
190 }
191 
192 #endif // CRYPTOPP_SSSE3_AVAILABLE
193 
194 NAMESPACE_END
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:609
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:614
Library configuration file.
unsigned char byte
8-bit unsigned datatype
Definition: config_int.h:66
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:72
Utility functions for the Crypto++ library.
Crypto++ library namespace.
Precompiled header file.