Crypto++  8.8
Free C++ class library of cryptographic schemes
arm_simd.h
Go to the documentation of this file.
1 // arm_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file arm_simd.h
4 /// \brief Support functions for ARM and vector operations
5 
6 #ifndef CRYPTOPP_ARM_SIMD_H
7 #define CRYPTOPP_ARM_SIMD_H
8 
9 #include "config.h"
10 
11 #if (CRYPTOPP_ARM_NEON_HEADER)
12 # include <stdint.h>
13 # include <arm_neon.h>
14 #endif
15 
16 #if (CRYPTOPP_ARM_ACLE_HEADER)
17 # include <stdint.h>
18 # include <arm_acle.h>
19 #endif
20 
21 #if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
22 /// \name CRC32 checksum
23 //@{
24 
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the byte to fold into the checksum
/// \return updated CRC32 value
/// \details Accumulates one byte into \p crc using the ARMv8 CRC32B
///   instruction.
/// \since Crypto++ 8.6
inline uint32_t CRC32B (uint32_t crc, uint8_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ACLE intrinsic (see <arm_acle.h> include above).
    return __crc32b(crc, val);
#else
    // GCC/Clang: inline asm. "+r" makes crc a read-write operand; the
    // %w modifier selects the 32-bit (w) register name on AArch64.
    __asm__ ("crc32b %w0, %w0, %w1 \n\t"
             :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
40 
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the 32-bit word to fold into the checksum
/// \return updated CRC32 value
/// \details Accumulates one 32-bit word into \p crc using the ARMv8
///   CRC32W instruction.
/// \since Crypto++ 8.6
inline uint32_t CRC32W (uint32_t crc, uint32_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ACLE intrinsic.
    return __crc32w(crc, val);
#else
    // GCC/Clang: inline asm; crc is both input and output ("+r").
    __asm__ ("crc32w %w0, %w0, %w1 \n\t"
             :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
56 
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param vals four 32-bit words to fold into the checksum, in order
/// \return updated CRC32 value
/// \details Accumulates vals[0] through vals[3] into \p crc, equivalent
///   to four successive CRC32W() calls but issued as one unit.
/// \since Crypto++ 8.6
inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: chain four intrinsic calls, innermost first (vals[0]).
    return __crc32w(__crc32w(__crc32w(__crc32w(
        crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    // GCC/Clang: four dependent crc32w instructions in one asm block;
    // each folds the next word into the running crc ("+r").
    __asm__ ("crc32w %w0, %w0, %w1 \n\t"
             "crc32w %w0, %w0, %w2 \n\t"
             "crc32w %w0, %w0, %w3 \n\t"
             "crc32w %w0, %w0, %w4 \n\t"
             :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                           "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
77 
78 //@}
/// \name CRC32-C checksum
//@{
80 
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the byte to fold into the checksum
/// \return updated CRC32-C value
/// \details Accumulates one byte into \p crc using the ARMv8 CRC32CB
///   (CRC32-C) instruction.
/// \since Crypto++ 8.6
inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ACLE intrinsic.
    return __crc32cb(crc, val);
#else
    // GCC/Clang: inline asm; crc is both input and output ("+r").
    __asm__ ("crc32cb %w0, %w0, %w1 \n\t"
             :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
96 
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the 32-bit word to fold into the checksum
/// \return updated CRC32-C value
/// \details Accumulates one 32-bit word into \p crc using the ARMv8
///   CRC32CW (CRC32-C) instruction.
/// \since Crypto++ 8.6
inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ACLE intrinsic.
    return __crc32cw(crc, val);
#else
    // GCC/Clang: inline asm; crc is both input and output ("+r").
    __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
             :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
112 
/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param vals four 32-bit words to fold into the checksum, in order
/// \return updated CRC32-C value
/// \details Accumulates vals[0] through vals[3] into \p crc, equivalent
///   to four successive CRC32CW() calls but issued as one unit.
/// \since Crypto++ 8.6
inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: chain four intrinsic calls, innermost first (vals[0]).
    return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
        crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    // GCC/Clang: four dependent crc32cw instructions in one asm block;
    // each folds the next word into the running crc ("+r").
    __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
             "crc32cw %w0, %w0, %w2 \n\t"
             "crc32cw %w0, %w0, %w3 \n\t"
             "crc32cw %w0, %w0, %w4 \n\t"
             :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                           "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
133 //@}
134 #endif // CRYPTOPP_ARM_CRC32_AVAILABLE
135 
136 #if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
137 /// \name Polynomial multiplication
138 //@{
139 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: build __n64 operands from the low lane (index 0) of each input.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: PMULL with .1d operands multiplies the low lanes.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
             :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback, low lane (0) of each input.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
169 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: low lane (0) of a, high lane (1) of b.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: vget_high_u64(b) moves b's high lane into the low
    // position so PMULL (.1d, low-lane form) sees the right operand.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
             :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    // Generic intrinsic fallback: low lane of a, high lane of b.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
199 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: high lane (1) of a, low lane (0) of b.
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: vget_high_u64(a) moves a's high lane into the low
    // position so PMULL (.1d, low-lane form) sees the right operand.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
             :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback: high lane of a, low lane of b.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
229 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: high lane (1) of each input.
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: PMULL2 multiplies the high lanes directly (.2d operands).
    uint64x2_t r;
    __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
             :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback: high lane (1) of each input.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
259 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL() performs vmull_p64() (low 64-bit lanes of each input).
/// PMULL is provided as GCC inline assembly due to Clang and lack of
/// support for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: low lane (0) of each input.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // GCC/Clang: PMULL with .1d operands multiplies the low lanes.
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
             :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback, low lane (0) of each input.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
284 
285 /// \brief Polynomial multiplication
286 /// \param a the first value
287 /// \param b the second value
288 /// \return vector product
289 /// \details PMULL_HIGH() performs vmull_high_p64(). PMULL_HIGH is provided as
290 /// GCC inline assembly due to Clang and lack of support for the intrinsic.
291 /// \since Crypto++ 8.0
292 inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
293 {
294 #if defined(CRYPTOPP_MSC_VERSION)
295  const __n64 x = { vgetq_lane_u64(a, 1) };
296  const __n64 y = { vgetq_lane_u64(b, 1) };
297  return vmull_p64(x, y);
298 #elif defined(__GNUC__)
299  uint64x2_t r;
300  __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
301  :"=w" (r) : "w" (a), "w" (b) );
302  return r;
303 #else
304  return (uint64x2_t)(vmull_p64(
305  vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
306  vgetq_lane_u64(vreinterpretq_u64_u8(b),1))));
307 #endif
308 }
309 
/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first value
/// \param b the second value
/// \return vector
/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
/// <tt>a</tt> and the remaining bytes in <tt>b</tt>. VEXT_U8 is provided
/// as GCC inline assembly due to Clang and lack of support for the intrinsic.
/// \since Crypto++ 8.0
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
    // https://github.com/weidai11/cryptopp/issues/366
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: vextq_u8 requires u8 vectors, hence the reinterpret casts.
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    // GCC/Clang: EXT instruction; the "I" constraint passes the
    // compile-time template constant C as the immediate operand.
    uint64x2_t r;
    __asm__ ("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
             :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
333 
334 //@}
335 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
336 
337 #if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
338 /// \name ARMv8.2 operations
339 //@{
340 
/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return three-way exclusive OR of the values
/// \details VEOR3() performs veor3q_u64(). VEOR3 is provided as GCC inline assembly due
/// to Clang and lack of support for the intrinsic.
/// \details VEOR3 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ARMv8.2 SHA-3 intrinsic directly.
    return veor3q_u64(a, b, c);
#else
    // GCC/Clang: EOR3 instruction (ARMv8.2 SHA-3 extension).
    uint64x2_t r;
    __asm__ ("eor3 %0.16b, %1.16b, %2.16b, %3.16b \n\t"
             :"=w" (r) : "w" (a), "w" (b), "w" (c));
    return r;
#endif
}
361 
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param c the rotate amount
/// \return two-way exclusive OR of the values, then rotated by c
/// \details VXAR() performs vxarq_u64(). VXAR is provided as GCC inline assembly due
/// to Clang and lack of support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \note The asm path uses the "I" constraint, which requires an
///   immediate — \p c must be a compile-time constant at each call site
///   despite being a runtime parameter; see the template overload.
/// \since Crypto++ 8.6
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ARMv8.2 SHA-3 intrinsic directly.
    return vxarq_u64(a, b, c);
#else
    // GCC/Clang: XAR instruction (ARMv8.2 SHA-3 extension).
    uint64x2_t r;
    __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
             :"=w" (r) : "w" (a), "w" (b), "I" (c));
    return r;
#endif
}
382 
/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated by C
/// \details VXAR() performs vxarq_u64(). VXAR is provided as GCC inline assembly due
/// to Clang and lack of support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ARMv8.2 SHA-3 intrinsic directly.
    return vxarq_u64(a, b, C);
#else
    // GCC/Clang: XAR instruction; the "I" constraint passes the
    // compile-time template constant C as the immediate rotate amount.
    uint64x2_t r;
    __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
             :"=w" (r) : "w" (a), "w" (b), "I" (C));
    return r;
#endif
}
404 
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated 1-bit
/// \details VRAX1() performs vrax1q_u64(). VRAX1 is provided as GCC inline assembly due
/// to Clang and lack of support for the intrinsic.
/// \details VRAX1 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    // MSVC: use the ARMv8.2 SHA-3 intrinsic directly.
    return vrax1q_u64(a, b);
#else
    // GCC/Clang: RAX1 instruction (ARMv8.2 SHA-3 extension).
    uint64x2_t r;
    __asm__ ("rax1 %0.2d, %1.2d, %2.2d \n\t"
             :"=w" (r) : "w" (a), "w" (b));
    return r;
#endif
}
424 //@}
425 #endif // CRYPTOPP_ARM_SHA3_AVAILABLE
426 
427 #endif // CRYPTOPP_ARM_SIMD_H
uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
XOR and rotate.
Definition: arm_simd.h:371
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:152
uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
XOR and rotate.
Definition: arm_simd.h:413
uint32_t CRC32CWx4(uint32_t crc, const uint32_t vals[4])
CRC32-C checksum.
Definition: arm_simd.h:118
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:242
uint32_t CRC32CB(uint32_t crc, uint8_t val)
CRC32-C checksum.
Definition: arm_simd.h:86
uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:292
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:182
uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
Three-way XOR.
Definition: arm_simd.h:350
uint32_t CRC32W(uint32_t crc, uint32_t val)
CRC32 checksum.
Definition: arm_simd.h:46
uint32_t CRC32B(uint32_t crc, uint8_t val)
CRC32 checksum.
Definition: arm_simd.h:30
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:212
uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:267
uint32_t CRC32CW(uint32_t crc, uint32_t val)
CRC32-C checksum.
Definition: arm_simd.h:102
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
Vector extraction.
Definition: arm_simd.h:320
uint32_t CRC32Wx4(uint32_t crc, const uint32_t vals[4])
CRC32 checksum.
Definition: arm_simd.h:62
Library configuration file.