Crypto++  8.8
Free C++ class library of cryptographic schemes
ppc_simd.h
Go to the documentation of this file.
1 // ppc_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file ppc_simd.h
4 /// \brief Support functions for PowerPC and vector operations
5 /// \details This header provides an agnostic interface into Clang, GCC
6 /// and IBM XL C/C++ compilers modulo their different built-in functions
7 /// for accessing vector instructions.
8 /// \details The abstractions are necessary to support back to GCC 4.8 and
9 /// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10 /// default compiler for GCC112, GCC119 and others on the compile farm.
11 /// Older IBM XL C/C++ compilers also have the need due to lack of
12 /// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13 /// compilers provide best support and don't need many of the hacks
14 /// below.
15 /// \details The library is tested with the following PowerPC machines and
16 /// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17 /// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18 /// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19 /// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20 /// - GCC110, Linux, POWER7, GCC 4.8.5
21 /// - GCC110, Linux, POWER7, XLC 12.01
22 /// - GCC111, AIX, POWER7, GCC 4.8.1
23 /// - GCC111, AIX, POWER7, XLC 12.01
24 /// - GCC112, Linux, POWER8, GCC 4.8.5
25 /// - GCC112, Linux, POWER8, XLC 13.01
26 /// - GCC112, Linux, POWER8, Clang 7.0
27 /// - GCC119, AIX, POWER8, GCC 7.2.0
28 /// - GCC119, AIX, POWER8, XLC 13.01
29 /// - GCC135, Linux, POWER9, GCC 7.0
30 /// \details 12 machines are used for testing because the three compilers form
31 /// five or six profiles. The profiles are listed below.
32 /// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33 /// - XLC 13.0 and earlier (all IBM components)
34 /// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35 /// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36 /// - early LLVM Clang (traditional Clang compiler)
37 /// - late LLVM Clang (traditional Clang compiler)
38 /// \details The LLVM front-end makes it tricky to write portable code because
39 /// LLVM pretends to be other compilers but cannot consume other compiler's
40 /// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
41 /// be GCC, Clang and XLC all at once but it can only consume its variety
42 /// of builtins.
43 /// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
44 /// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
45 /// changed to <tt>VecAnd</tt>. The name change helped consolidate two
46 /// slightly different implementations.
47 /// \details At Crypto++ 8.3 the library added select 64-bit functions for
48 /// 32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
49 /// take 32-bit vectors and add or subtract them as if they were vectors
50 /// with two 64-bit elements. The functions dramatically improve performance
51 /// for some algorithms on some platforms, like SIMON128 and SPECK128 on
52 /// Power6 and earlier. For example, SPECK128 improved from 70 cpb to
53 /// 10 cpb on an old PowerMac. Use the functions like shown below.
54 /// <pre>
55 /// \#if defined(_ARCH_PWR8)
56 /// \# define speck128_t uint64x2_p
57 /// \#else
58 /// \# define speck128_t uint32x4_p
59 /// \#endif
60 ///
61 /// speck128_t rk, x1, x2, y1, y2;
62 /// rk = (speck128_t)VecLoadAligned(ptr);
63 /// x1 = VecRotateRight64<8>(x1);
64 /// x1 = VecAdd64(x1, y1);
65 /// ...</pre>
66 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
67 
68 // Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
69 // actual availability of the feature for the source file being compiled.
70 // The preprocessor macros depend on compiler options like -maltivec; and
71 // not compiler versions.
72 
73 // For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74 // For XLC see the Compiler Reference manual. For Clang you have to experiment.
75 // Clang does not document the compiler options, does not reject options it does
76 // not understand, and pretends to be other compilers even though it cannot
77 // process the builtins and intrinsics. Clang will waste hours of your time.
78 
79 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80 // code paths guarded by preprocessor macros because XLC 12 generates
81 // bad code in some places. To verify the bad code generation test on
82 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83 //
84 // inline uint32x4_p VecLoad(const byte src[16])
85 // {
86 // #if defined(__VSX__) || defined(_ARCH_PWR8)
87 // return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88 // #else
89 // return VecLoad_ALTIVEC(src);
90 // #endif
91 // }
92 
93 // We should be able to perform the load using inline asm on Power7 with
94 // VSX or Power8. The inline asm will avoid C undefined behavior due to
95 // casting from byte* to word32*. We are safe because our byte* are
96 // 16-byte aligned for Altivec. Below is the big endian load. Little
97 // endian would need to follow with xxpermdi for the reversal.
98 //
99 // __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
100 
101 // GCC and XLC use integer math for the address (D-form or byte-offset
102 // in the ISA manual). LLVM uses pointer math for the address (DS-form
103 // or indexed in the ISA manual). To keep them consistent we calculate
104 // the address from the offset and pass to a load or store function
105 // using a 0 offset.
106 
107 #ifndef CRYPTOPP_PPC_CRYPTO_H
108 #define CRYPTOPP_PPC_CRYPTO_H
109 
110 #include "config.h"
111 #include "misc.h"
112 
113 #if defined(__ALTIVEC__)
114 # include <altivec.h>
115 # undef vector
116 # undef pixel
117 # undef bool
118 #endif
119 
120 // XL C++ on AIX does not define VSX and does not
121 // provide an option to set it. We have to set it
122 // for the code below. This define must stay in
123 // sync with the define in test_ppc_power7.cpp.
124 #ifndef CRYPTOPP_DISABLE_POWER7
125 # if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
126 # define __VSX__ 1
127 # endif
128 #endif
129 
130 // XL C++ on AIX does not define CRYPTO and does not
131 // provide an option to set it. We have to set it
132 // for the code below. This define must stay in
133 // sync with the define in test_ppc_power8.cpp
134 #ifndef CRYPTOPP_DISABLE_POWER8
135 # if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
136 # define __CRYPTO__ 1
137 # endif
138 #endif
139 
140 /// \brief Cast array to vector pointer
141 /// \details CONST_V8_CAST casts a const array to a vector
142 /// pointer for a byte array. The Power ABI says source arrays
143 /// are non-const, so this define removes the const. XLC++ will
144 /// fail the compile if the source array is const.
145 #define CONST_V8_CAST(x) ((unsigned char*)(x))
146 /// \brief Cast array to vector pointer
147 /// \details CONST_V32_CAST casts a const array to a vector
148 /// pointer for a word array. The Power ABI says source arrays
149 /// are non-const, so this define removes the const. XLC++ will
150 /// fail the compile if the source array is const.
151 #define CONST_V32_CAST(x) ((unsigned int*)(x))
152 /// \brief Cast array to vector pointer
153 /// \details CONST_V64_CAST casts a const array to a vector
154 /// pointer for a double word array. The Power ABI says source arrays
155 /// are non-const, so this define removes the const. XLC++ will
156 /// fail the compile if the source array is const.
157 #define CONST_V64_CAST(x) ((unsigned long long*)(x))
158 /// \brief Cast array to vector pointer
159 /// \details NCONST_V8_CAST casts an array to a vector
160 /// pointer for a byte array. The Power ABI says source arrays
161 /// are non-const, so this define removes the const. XLC++ will
162 /// fail the compile if the source array is const.
163 #define NCONST_V8_CAST(x) ((unsigned char*)(x))
164 /// \brief Cast array to vector pointer
165 /// \details NCONST_V32_CAST casts an array to a vector
166 /// pointer for a word array. The Power ABI says source arrays
167 /// are non-const, so this define removes the const. XLC++ will
168 /// fail the compile if the source array is const.
169 #define NCONST_V32_CAST(x) ((unsigned int*)(x))
170 /// \brief Cast array to vector pointer
171 /// \details NCONST_V64_CAST casts an array to a vector
172 /// pointer for a double word array. The Power ABI says source arrays
173 /// are non-const, so this define removes the const. XLC++ will
174 /// fail the compile if the source array is const.
175 #define NCONST_V64_CAST(x) ((unsigned long long*)(x))
176 
177 // VecLoad_ALTIVEC and VecStore_ALTIVEC are
178 // too noisy on modern compilers
179 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
180 # pragma GCC diagnostic push
181 # pragma GCC diagnostic ignored "-Wdeprecated"
182 #endif
183 
184 NAMESPACE_BEGIN(CryptoPP)
185 
186 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
187 
188 /// \brief Vector of 8-bit elements
189 /// \par Wraps
190 /// __vector unsigned char
191 /// \since Crypto++ 6.0
192 typedef __vector unsigned char uint8x16_p;
193 /// \brief Vector of 16-bit elements
194 /// \par Wraps
195 /// __vector unsigned short
196 /// \since Crypto++ 6.0
197 typedef __vector unsigned short uint16x8_p;
198 /// \brief Vector of 32-bit elements
199 /// \par Wraps
200 /// __vector unsigned int
201 /// \since Crypto++ 6.0
202 typedef __vector unsigned int uint32x4_p;
203 
204 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
205 /// \brief Vector of 64-bit elements
206 /// \details uint64x2_p is available on POWER7 with VSX and above. Most
207 /// supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
208 /// and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
209 /// \par Wraps
210 /// __vector unsigned long long
211 /// \since Crypto++ 6.0
212 typedef __vector unsigned long long uint64x2_p;
213 #endif // VSX or ARCH_PWR8
214 
215 /// \brief The 0 vector
216 /// \return a 32-bit vector of 0's
217 /// \since Crypto++ 8.0
219 {
220  const uint32x4_p v = {0,0,0,0};
221  return v;
222 }
223 
224 /// \brief The 1 vector
225 /// \return a 32-bit vector of 1's
226 /// \since Crypto++ 8.0
228 {
229  const uint32x4_p v = {1,1,1,1};
230  return v;
231 }
232 
233 /// \brief Reverse bytes in a vector
234 /// \tparam T vector type
235 /// \param data the vector
236 /// \return vector
237 /// \details VecReverse() reverses the bytes in a vector
238 /// \par Wraps
239 /// vec_perm
240 /// \since Crypto++ 6.0
241 template <class T>
242 inline T VecReverse(const T data)
243 {
244 #if defined(CRYPTOPP_BIG_ENDIAN)
245  const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
246  return (T)vec_perm(data, data, mask);
247 #else
248  const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
249  return (T)vec_perm(data, data, mask);
250 #endif
251 }
252 
253 /// \brief Reverse bytes in a vector
254 /// \tparam T vector type
255 /// \param data the vector
256 /// \return vector
257 /// \details VecReverseLE() reverses the bytes in a vector on
258 /// little-endian systems.
259 /// \par Wraps
260 /// vec_perm
261 /// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    // Not a little-endian build: hand the vector back untouched.
    return data;
#else
    // Little-endian build: permute with the descending 15..0 control.
    const uint8x16_p control = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, control);
#endif
}
272 
273 /// \brief Reverse bytes in a vector
274 /// \tparam T vector type
275 /// \param data the vector
276 /// \return vector
277 /// \details VecReverseBE() reverses the bytes in a vector on
278 /// big-endian systems.
279 /// \par Wraps
280 /// vec_perm
281 /// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Not a big-endian build: hand the vector back untouched.
    return data;
#else
    // Big-endian build: permute with the descending 15..0 control.
    const uint8x16_p control = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, control);
#endif
}
292 
293 /// \name LOAD OPERATIONS
294 //@{
295 
296 /// \brief Loads a vector from a byte array
297 /// \param src the byte array
298 /// \details Loads a vector in native endian format from a byte array.
299 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
300 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
301 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
302 /// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
303 /// you should provide aligned memory addresses.
304 /// \par Wraps
305 /// vec_ld, vec_lvsl, vec_perm
306 /// \sa VecLoad, VecLoadAligned
307 /// \since Crypto++ 6.0
308 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
309 {
310  // Avoid IsAlignedOn for convenience.
311  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
312  if (addr % 16 == 0)
313  {
314  return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
315  }
316  else
317  {
318  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
319  const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
320  const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
321  const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
322  return (uint32x4_p)vec_perm(low, high, perm);
323  }
324 }
325 
326 /// \brief Loads a vector from a byte array
327 /// \param src the byte array
328 /// \param off offset into the src byte array
329 /// \details Loads a vector in native endian format from a byte array.
330 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
331 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
332 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
333 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
334 /// relatively expensive so you should provide aligned memory addresses.
335 /// \par Wraps
336 /// vec_ld, vec_lvsl, vec_perm
337 /// \sa VecLoad, VecLoadAligned
338 /// \since Crypto++ 6.0
339 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
340 {
341  // Avoid IsAlignedOn for convenience.
342  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
343  if (addr % 16 == 0)
344  {
345  return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
346  }
347  else
348  {
349  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
350  const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
351  const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
352  const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
353  return (uint32x4_p)vec_perm(low, high, perm);
354  }
355 }
356 
357 /// \brief Loads a vector from a byte array
358 /// \param src the byte array
359 /// \details VecLoad() loads a vector from a byte array.
360 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
361 /// The instruction does not require aligned effective memory addresses.
362 /// VecLoad_ALTIVEC() is used if POWER9 is not available.
363 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
364 /// are required to fix up unaligned memory addresses.
365 /// \par Wraps
366 /// vec_xl on POWER9 and above, Altivec load on POWER8 and below
367 /// \sa VecLoad_ALTIVEC, VecLoadAligned
368 /// \since Crypto++ 6.0
369 inline uint32x4_p VecLoad(const byte src[16])
370 {
371  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
372  // word pointers. The ISA lacks loads for short* and char*.
373  // Power9/ISA 3.0 provides vec_xl for all datatypes.
374 
375  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
376  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
377  CRYPTOPP_UNUSED(addr);
378 
379 #if defined(_ARCH_PWR9)
380  return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
381 #else
383 #endif
384 }
385 
386 /// \brief Loads a vector from a byte array
387 /// \param src the byte array
388 /// \param off offset into the src byte array
389 /// \details VecLoad() loads a vector from a byte array.
390 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
391 /// The instruction does not require aligned effective memory addresses.
392 /// VecLoad_ALTIVEC() is used if POWER9 is not available.
393 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
394 /// are required to fix up unaligned memory addresses.
395 /// \par Wraps
396 /// vec_xl on POWER9 and above, Altivec load on POWER8 and below
397 /// \sa VecLoad_ALTIVEC, VecLoadAligned
398 /// \since Crypto++ 6.0
399 inline uint32x4_p VecLoad(int off, const byte src[16])
400 {
401  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
402  // word pointers. The ISA lacks loads for short* and char*.
403  // Power9/ISA 3.0 provides vec_xl for all datatypes.
404 
405  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
406  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
407  CRYPTOPP_UNUSED(addr);
408 
409 #if defined(_ARCH_PWR9)
410  return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
411 #else
413 #endif
414 }
415 
416 /// \brief Loads a vector from a word array
417 /// \param src the word array
418 /// \details VecLoad() loads a vector from a word array.
419 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
420 /// The instruction does not require aligned effective memory addresses.
421 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
422 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
423 /// are required to fix up unaligned memory addresses.
424 /// \par Wraps
425 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
426 /// \sa VecLoad_ALTIVEC, VecLoadAligned
427 /// \since Crypto++ 8.0
428 inline uint32x4_p VecLoad(const word32 src[4])
429 {
430  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
431  // word pointers. The ISA lacks loads for short* and char*.
432  // Power9/ISA 3.0 provides vec_xl for all datatypes.
433 
434  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
435  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
436  CRYPTOPP_UNUSED(addr);
437 
438 #if defined(_ARCH_PWR9)
439  return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
440 #elif defined(__VSX__) || defined(_ARCH_PWR8)
441  return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
442 #else
444 #endif
445 }
446 
447 /// \brief Loads a vector from a word array
448 /// \param src the word array
449 /// \param off offset into the word array
450 /// \details VecLoad() loads a vector from a word array.
451 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
452 /// The instruction does not require aligned effective memory addresses.
453 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
454 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
455 /// are required to fix up unaligned memory addresses.
456 /// \par Wraps
457 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
458 /// \sa VecLoad_ALTIVEC, VecLoadAligned
459 /// \since Crypto++ 8.0
460 inline uint32x4_p VecLoad(int off, const word32 src[4])
461 {
462  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
463  // word pointers. The ISA lacks loads for short* and char*.
464  // Power9/ISA 3.0 provides vec_xl for all datatypes.
465 
466  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
467  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
468  CRYPTOPP_UNUSED(addr);
469 
470 #if defined(_ARCH_PWR9)
471  return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
472 #elif defined(__VSX__) || defined(_ARCH_PWR8)
473  return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
474 #else
476 #endif
477 }
478 
479 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
480 
481 /// \brief Loads a vector from a double word array
482 /// \param src the double word array
483 /// \details VecLoad() loads a vector from a double word array.
484 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
485 /// The instruction does not require aligned effective memory addresses.
486 /// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
487 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
488 /// are required to fix up unaligned memory addresses.
489 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
490 /// \par Wraps
491 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
492 /// \sa VecLoad_ALTIVEC, VecLoadAligned
493 /// \since Crypto++ 8.0
494 inline uint64x2_p VecLoad(const word64 src[2])
495 {
496  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
497  // word pointers. The ISA lacks loads for short* and char*.
498  // Power9/ISA 3.0 provides vec_xl for all datatypes.
499 
500  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
501  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
502  CRYPTOPP_UNUSED(addr);
503 
504 #if defined(_ARCH_PWR9)
505  return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
506 #elif defined(__VSX__) || defined(_ARCH_PWR8)
507  // The 32-bit cast is not a typo. Compiler workaround.
508  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
509 #else
511 #endif
512 }
513 
514 /// \brief Loads a vector from a double word array
515 /// \param src the double word array
516 /// \param off offset into the double word array
517 /// \details VecLoad() loads a vector from a double word array.
518 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
519 /// The instruction does not require aligned effective memory addresses.
520 /// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
521 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
522 /// are required to fix up unaligned memory addresses.
523 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
524 /// \par Wraps
525 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
526 /// \sa VecLoad_ALTIVEC, VecLoadAligned
527 /// \since Crypto++ 8.0
528 inline uint64x2_p VecLoad(int off, const word64 src[2])
529 {
530  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
531  // word pointers. The ISA lacks loads for short* and char*.
532  // Power9/ISA 3.0 provides vec_xl for all datatypes.
533 
534  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
535  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
536  CRYPTOPP_UNUSED(addr);
537 
538 #if defined(_ARCH_PWR9)
539  return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
540 #elif defined(__VSX__) || defined(_ARCH_PWR8)
541  // The 32-bit cast is not a typo. Compiler workaround.
542  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
543 #else
545 #endif
546 }
547 
548 #endif // VSX or ARCH_PWR8
549 
550 /// \brief Loads a vector from an aligned byte array
551 /// \param src the byte array
552 /// \details VecLoadAligned() loads a vector from an aligned byte array.
553 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
554 /// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
555 /// address of <tt>src</tt> must be 16-byte aligned for Altivec.
556 /// \par Wraps
557 /// vec_xl on POWER9, vec_ld on POWER8 and below
558 /// \sa VecLoad_ALTIVEC, VecLoad
559 /// \since Crypto++ 8.0
560 inline uint32x4_p VecLoadAligned(const byte src[16])
561 {
562  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
563  // word pointers. The ISA lacks loads for short* and char*.
564  // Power9/ISA 3.0 provides vec_xl for all datatypes.
565 
566  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
567  CRYPTOPP_ASSERT(addr % 16 == 0);
568  CRYPTOPP_UNUSED(addr);
569 
570 #if defined(_ARCH_PWR9)
571  return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
572 #else
573  return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
574 #endif
575 }
576 
577 /// \brief Loads a vector from an aligned byte array
578 /// \param src the byte array
579 /// \param off offset into the src byte array
580 /// \details VecLoadAligned() loads a vector from an aligned byte array.
581 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
582 /// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
583 /// address of <tt>src</tt> must be 16-byte aligned for Altivec.
584 /// \par Wraps
585 /// vec_xl on POWER9, vec_ld on POWER8 and below
586 /// \sa VecLoad_ALTIVEC, VecLoad
587 /// \since Crypto++ 8.0
588 inline uint32x4_p VecLoadAligned(int off, const byte src[16])
589 {
590  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
591  // word pointers. The ISA lacks loads for short* and char*.
592  // Power9/ISA 3.0 provides vec_xl for all datatypes.
593 
594  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
595  CRYPTOPP_ASSERT(addr % 16 == 0);
596  CRYPTOPP_UNUSED(addr);
597 
598 #if defined(_ARCH_PWR9)
599  return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
600 #else
601  return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
602 #endif
603 }
604 
605 /// \brief Loads a vector from an aligned word array
606 /// \param src the word array
607 /// \details VecLoadAligned() loads a vector from an aligned word array.
608 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
609 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
610 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
611 /// \par Wraps
612 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
613 /// \sa VecLoad_ALTIVEC, VecLoad
614 /// \since Crypto++ 8.0
615 inline uint32x4_p VecLoadAligned(const word32 src[4])
616 {
617  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
618  // word pointers. The ISA lacks loads for short* and char*.
619  // Power9/ISA 3.0 provides vec_xl for all datatypes.
620 
621  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
622  CRYPTOPP_ASSERT(addr % 16 == 0);
623  CRYPTOPP_UNUSED(addr);
624 
625 #if defined(_ARCH_PWR9)
626  return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
627 #elif defined(__VSX__) || defined(_ARCH_PWR8)
628  return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
629 #else
630  return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
631 #endif
632 }
633 
634 /// \brief Loads a vector from an aligned word array
635 /// \param src the word array
636 /// \param off offset into the src word array
637 /// \details VecLoadAligned() loads a vector from an aligned word array.
638 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
639 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
640 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
641 /// \par Wraps
642 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
643 /// \sa VecLoad_ALTIVEC, VecLoad
644 /// \since Crypto++ 8.0
645 inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
646 {
647  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
648  // word pointers. The ISA lacks loads for short* and char*.
649  // Power9/ISA 3.0 provides vec_xl for all datatypes.
650 
651  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
652  CRYPTOPP_ASSERT(addr % 16 == 0);
653  CRYPTOPP_UNUSED(addr);
654 
655 #if defined(_ARCH_PWR9)
656  return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
657 #elif defined(__VSX__) || defined(_ARCH_PWR8)
658  return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
659 #else
660  return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
661 #endif
662 }
663 
664 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
665 
666 /// \brief Loads a vector from an aligned double word array
667 /// \param src the double word array
668 /// \details VecLoadAligned() loads a vector from an aligned double word array.
669 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
670 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
671 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
672 /// \par Wraps
673 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
674 /// \sa VecLoad_ALTIVEC, VecLoad
675 /// \since Crypto++ 8.0
676 inline uint64x2_p VecLoadAligned(const word64 src[4])
677 {
678  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
679  // word pointers. The ISA lacks loads for short* and char*.
680  // Power9/ISA 3.0 provides vec_xl for all datatypes.
681 
682  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
683  CRYPTOPP_ASSERT(addr % 16 == 0);
684  CRYPTOPP_UNUSED(addr);
685 
686 #if defined(_ARCH_PWR9)
687  return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
688 #elif defined(__VSX__) || defined(_ARCH_PWR8)
689  // The 32-bit cast is not a typo. Compiler workaround.
690  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
691 #else
692  return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
693 #endif
694 }
695 
696 /// \brief Loads a vector from an aligned double word array
697 /// \param src the double word array
698 /// \param off offset into the src double word array
699 /// \details VecLoadAligned() loads a vector from an aligned double word array.
700 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
701 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
702 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
703 /// \par Wraps
704 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
705 /// \sa VecLoad_ALTIVEC, VecLoad
706 /// \since Crypto++ 8.0
707 inline uint64x2_p VecLoadAligned(int off, const word64 src[4])
708 {
709  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
710  // word pointers. The ISA lacks loads for short* and char*.
711  // Power9/ISA 3.0 provides vec_xl for all datatypes.
712 
713  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
714  CRYPTOPP_ASSERT(addr % 16 == 0);
715  CRYPTOPP_UNUSED(addr);
716 
717 #if defined(_ARCH_PWR9)
718  return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
719 #elif defined(__VSX__) || defined(_ARCH_PWR8)
720  // The 32-bit cast is not a typo. Compiler workaround.
721  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
722 #else
723  return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
724 #endif
725 }
726 
727 #endif
728 
729 /// \brief Loads a vector from a byte array
730 /// \param src the byte array
731 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
732 /// will reverse all bytes in the array on a little endian system.
733 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
734 /// The instruction does not require aligned effective memory addresses.
735 /// VecLoad_ALTIVEC() is used if POWER7 or VSX are not available.
736 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
737 /// are required to fix up unaligned memory addresses.
738 /// \par Wraps
739 /// vec_xl on POWER8, Altivec load on POWER7 and below
740 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
741 /// \since Crypto++ 6.0
742 inline uint32x4_p VecLoadBE(const byte src[16])
743 {
744  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
745  // word pointers. The ISA lacks loads for short* and char*.
746  // Power9/ISA 3.0 provides vec_xl for all datatypes.
747 
748  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
749  // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750  CRYPTOPP_UNUSED(addr);
751 
752 #if defined(_ARCH_PWR9)
753  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
754  return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
755 #elif defined(CRYPTOPP_BIG_ENDIAN)
756  return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
757 #else
759 #endif
760 }
761 
762 /// \brief Loads a vector from a byte array
763 /// \param src the byte array
764 /// \param off offset into the src byte array
765 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
766 /// will reverse all bytes in the array on a little endian system.
767 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
768 /// The instruction does not require aligned effective memory addresses.
769 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
770 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
771 /// are required to fix up unaligned memory addresses.
772 /// \par Wraps
773 /// vec_xl on POWER8, Altivec load on POWER7 and below
774 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
775 /// \since Crypto++ 6.0
776 inline uint32x4_p VecLoadBE(int off, const byte src[16])
777 {
778  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
779  // word pointers. The ISA lacks loads for short* and char*.
780  // Power9/ISA 3.0 provides vec_xl for all datatypes.
781 
782  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
783  // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784  CRYPTOPP_UNUSED(addr);
785 
786 #if defined(_ARCH_PWR9)
787  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
788  return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
789 #elif defined(CRYPTOPP_BIG_ENDIAN)
791 #else
793 #endif
794 }
795 
796 //@}
797 
798 /// \name STORE OPERATIONS
799 //@{
800 
801 /// \brief Stores a vector to a byte array
802 /// \tparam T vector type
803 /// \param data the vector
804 /// \param dest the byte array
805 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
806 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
807 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
808 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
809 /// memory addresses.
810 /// \details VecStore_ALTIVEC() is used when POWER7 or above
811 /// and its unaligned stores are not available.
812 /// \par Wraps
813 /// vec_st, vec_ste, vec_lvsr, vec_perm
814 /// \sa VecStore, VecStoreAligned
815 /// \since Crypto++ 8.0
816 template<class T>
817 inline void VecStore_ALTIVEC(const T data, byte dest[16])
818 {
819  // Avoid IsAlignedOn for convenience.
820  uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
821  if (addr % 16 == 0)
822  {
823  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
824  }
825  else
826  {
827  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
828  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
829  vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
830  vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
831  vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
832  vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
833  vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
834  vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
835  vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
836  vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
837  }
838 }
839 
840 /// \brief Stores a vector to a byte array
841 /// \tparam T vector type
842 /// \param data the vector
843 /// \param off offset into the dest byte array
844 /// \param dest the byte array
845 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
846 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
847 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
848 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
849 /// memory addresses.
850 /// \details VecStore_ALTIVEC() is used when POWER7 or above
851 /// and its unaligned stores are not available.
852 /// \par Wraps
853 /// vec_st, vec_ste, vec_lvsr, vec_perm
854 /// \sa VecStore, VecStoreAligned
855 /// \since Crypto++ 8.0
856 template<class T>
857 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
858 {
859  // Avoid IsAlignedOn for convenience.
860  uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
861  if (addr % 16 == 0)
862  {
863  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
864  }
865  else
866  {
867  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
868  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
869  vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
870  vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
871  vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
872  vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
873  vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
874  vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
875  vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
876  vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
877  }
878 }
879 
880 /// \brief Stores a vector to a byte array
881 /// \tparam T vector type
882 /// \param data the vector
883 /// \param dest the byte array
884 /// \details VecStore() stores a vector to a byte array.
885 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
886 /// The instruction does not require aligned effective memory addresses.
887 /// VecStore_ALTIVEC() is used if POWER9 is not available.
888 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
889 /// are required to fix up unaligned memory addresses.
890 /// \par Wraps
891 /// vec_xst on POWER9 and above, Altivec store on POWER8 and below
892 /// \sa VecStore_ALTIVEC, VecStoreAligned
893 /// \since Crypto++ 6.0
894 template<class T>
895 inline void VecStore(const T data, byte dest[16])
896 {
897  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
898  // word pointers. The ISA lacks loads for short* and char*.
899  // Power9/ISA 3.0 provides vec_xl for all datatypes.
900 
901  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
902  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
903  CRYPTOPP_UNUSED(addr);
904 
905 #if defined(_ARCH_PWR9)
906  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
907 #else
909 #endif
910 }
911 
912 /// \brief Stores a vector to a byte array
913 /// \tparam T vector type
914 /// \param data the vector
915 /// \param off offset into the dest byte array
916 /// \param dest the byte array
917 /// \details VecStore() stores a vector to a byte array.
918 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
919 /// The instruction does not require aligned effective memory addresses.
920 /// VecStore_ALTIVEC() is used if POWER9 is not available.
921 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
922 /// are required to fix up unaligned memory addresses.
923 /// \par Wraps
924 /// vec_xst on POWER9 and above, Altivec store on POWER8 and below
925 /// \sa VecStore_ALTIVEC, VecStoreAligned
926 /// \since Crypto++ 6.0
927 template<class T>
928 inline void VecStore(const T data, int off, byte dest[16])
929 {
930  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
931  // word pointers. The ISA lacks loads for short* and char*.
932  // Power9/ISA 3.0 provides vec_xl for all datatypes.
933 
934  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
935  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
936  CRYPTOPP_UNUSED(addr);
937 
938 #if defined(_ARCH_PWR9)
939  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
940 #else
942 #endif
943 }
944 
945 /// \brief Stores a vector to a word array
946 /// \tparam T vector type
947 /// \param data the vector
948 /// \param dest the word array
949 /// \details VecStore() stores a vector to a word array.
950 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
951 /// The instruction does not require aligned effective memory addresses.
952 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
953 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
954 /// are required to fix up unaligned memory addresses.
955 /// \par Wraps
956 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
957 /// \sa VecStore_ALTIVEC, VecStoreAligned
958 /// \since Crypto++ 8.0
959 template<class T>
960 inline void VecStore(const T data, word32 dest[4])
961 {
962  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
963  // word pointers. The ISA lacks stores for short* and char*.
964  // Power9/ISA 3.0 provides vec_xst for all datatypes.
965 
966  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
967  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
968  CRYPTOPP_UNUSED(addr);
969 
970 #if defined(_ARCH_PWR9)
971  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
972 #elif defined(__VSX__) || defined(_ARCH_PWR8)
973  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
974 #else
976 #endif
977 }
978 
979 /// \brief Stores a vector to a word array
980 /// \tparam T vector type
981 /// \param data the vector
982 /// \param off offset into the dest word array
983 /// \param dest the word array
984 /// \details VecStore() stores a vector to a word array.
985 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
986 /// The instruction does not require aligned effective memory addresses.
987 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
988 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
989 /// are required to fix up unaligned memory addresses.
990 /// \par Wraps
991 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
992 /// \sa VecStore_ALTIVEC, VecStoreAligned
993 /// \since Crypto++ 8.0
994 template<class T>
995 inline void VecStore(const T data, int off, word32 dest[4])
996 {
997  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
998  // word pointers. The ISA lacks stores for short* and char*.
999  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1000 
1001  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003  CRYPTOPP_UNUSED(addr);
1004 
1005 #if defined(_ARCH_PWR9)
1006  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1008  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009 #else
1011 #endif
1012 }
1013 
1014 /// \brief Stores a vector to a word array
1015 /// \tparam T vector type
1016 /// \param data the vector
1017 /// \param dest the word array
1018 /// \details VecStore() stores a vector to a word array.
1019 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1020 /// The instruction does not require aligned effective memory addresses.
1021 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1022 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1023 /// are required to fix up unaligned memory addresses.
1024 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1025 /// \par Wraps
1026 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1027 /// \sa VecStore_ALTIVEC, VecStoreAligned
1028 /// \since Crypto++ 8.0
1029 template<class T>
1030 inline void VecStore(const T data, word64 dest[2])
1031 {
1032  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1033  // word pointers. The ISA lacks stores for short* and char*.
1034  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1035 
1036  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038  CRYPTOPP_UNUSED(addr);
1039 
1040 #if defined(_ARCH_PWR9)
1041  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1043  // 32-bit cast is not a typo. Compiler workaround.
1044  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045 #else
1047 #endif
1048 }
1049 
1050 /// \brief Stores a vector to a word array
1051 /// \tparam T vector type
1052 /// \param data the vector
1053 /// \param off offset into the dest word array
1054 /// \param dest the word array
1055 /// \details VecStore() stores a vector to a word array.
1056 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1057 /// The instruction does not require aligned effective memory addresses.
1058 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1059 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1060 /// are required to fix up unaligned memory addresses.
1061 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1062 /// \par Wraps
1063 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1064 /// \sa VecStore_ALTIVEC, VecStoreAligned
1065 /// \since Crypto++ 8.0
1066 template<class T>
1067 inline void VecStore(const T data, int off, word64 dest[2])
1068 {
1069  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1070  // word pointers. The ISA lacks stores for short* and char*.
1071  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1072 
1073  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075  CRYPTOPP_UNUSED(addr);
1076 
1077 #if defined(_ARCH_PWR9)
1078  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1080  // 32-bit cast is not a typo. Compiler workaround.
1081  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082 #else
1084 #endif
1085 }
1086 
1087 /// \brief Stores a vector to a byte array
1088 /// \tparam T vector type
1089 /// \param data the vector
1090 /// \param dest the byte array
1091 /// \details VecStoreAligned() stores a vector to an aligned byte array.
1092 /// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
1093 /// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1094 /// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1095 /// \par Wraps
1096 /// vec_xst on POWER9 or above, vec_st on POWER8 and below
1097 /// \sa VecStore_ALTIVEC, VecStore
1098 /// \since Crypto++ 8.0
1099 template<class T>
1100 inline void VecStoreAligned(const T data, byte dest[16])
1101 {
1102  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1103  // word pointers. The ISA lacks loads for short* and char*.
1104  // Power9/ISA 3.0 provides vec_xl for all datatypes.
1105 
1106  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108  CRYPTOPP_UNUSED(addr);
1109 
1110 #if defined(_ARCH_PWR9)
1111  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112 #else
1113  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114 #endif
1115 }
1116 
1117 /// \brief Stores a vector to a byte array
1118 /// \tparam T vector type
1119 /// \param data the vector
1120 /// \param off offset into the dest byte array
1121 /// \param dest the byte array
1122 /// \details VecStoreAligned() stores a vector to an aligned byte array.
1123 /// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
1124 /// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1125 /// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1126 /// \par Wraps
1127 /// vec_xst on POWER9 or above, vec_st on POWER8 and below
1128 /// \sa VecStore_ALTIVEC, VecStore
1129 /// \since Crypto++ 8.0
1130 template<class T>
1131 inline void VecStoreAligned(const T data, int off, byte dest[16])
1132 {
1133  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1134  // word pointers. The ISA lacks loads for short* and char*.
1135  // Power9/ISA 3.0 provides vec_xl for all datatypes.
1136 
1137  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139  CRYPTOPP_UNUSED(addr);
1140 
1141 #if defined(_ARCH_PWR9)
1142  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143 #else
1144  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145 #endif
1146 }
1147 
1148 /// \brief Stores a vector to a word array
1149 /// \tparam T vector type
1150 /// \param data the vector
1151 /// \param dest the word array
1152 /// \details VecStoreAligned() stores a vector to an aligned word array.
1153 /// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
1154 /// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1155 /// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1156 /// must be 16-byte aligned for Altivec.
1157 /// \par Wraps
1158 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1159 /// \sa VecStore_ALTIVEC, VecStore
1160 /// \since Crypto++ 8.0
1161 template<class T>
1162 inline void VecStoreAligned(const T data, word32 dest[4])
1163 {
1164  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1165  // word pointers. The ISA lacks stores for short* and char*.
1166  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1167 
1168  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170  CRYPTOPP_UNUSED(addr);
1171 
1172 #if defined(_ARCH_PWR9)
1173  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1175  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176 #else
1177  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178 #endif
1179 }
1180 
1181 /// \brief Stores a vector to a word array
1182 /// \tparam T vector type
1183 /// \param data the vector
1184 /// \param off offset into the dest word array
1185 /// \param dest the word array
1186 /// \details VecStoreAligned() stores a vector to an aligned word array.
1187 /// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
1188 /// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1189 /// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1190 /// must be 16-byte aligned for Altivec.
1191 /// \par Wraps
1192 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1193 /// \sa VecStore_ALTIVEC, VecStore
1194 /// \since Crypto++ 8.0
1195 template<class T>
1196 inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197 {
1198  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1199  // word pointers. The ISA lacks stores for short* and char*.
1200  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1201 
1202  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204  CRYPTOPP_UNUSED(addr);
1205 
1206 #if defined(_ARCH_PWR9)
1207  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1209  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210 #else
1211  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212 #endif
1213 }
1214 
1215 /// \brief Stores a vector to a byte array
1216 /// \tparam T vector type
1217 /// \param data the vector
1218 /// \param dest the byte array
1219 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1220 /// will reverse all bytes in the array on a little endian system.
1221 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1222 /// The instruction does not require aligned effective memory addresses.
1223 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1224 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1225 /// are required to fix up unaligned memory addresses.
1226 /// \par Wraps
1227 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1228 /// \sa VecStore_ALTIVEC, VecStoreAligned
1229 /// \since Crypto++ 6.0
1230 template <class T>
1231 inline void VecStoreBE(const T data, byte dest[16])
1232 {
1233  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1234  // word pointers. The ISA lacks stores for short* and char*.
1235  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1236 
1237  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239  CRYPTOPP_UNUSED(addr);
1240 
1241 #if defined(_ARCH_PWR9)
1242  vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243 #elif defined(CRYPTOPP_BIG_ENDIAN)
1244  VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245 #else
1247 #endif
1248 }
1249 
1250 /// \brief Stores a vector to a byte array
1251 /// \tparam T vector type
1252 /// \param data the vector
1253 /// \param off offset into the dest byte array
1254 /// \param dest the byte array
1255 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1256 /// will reverse all bytes in the array on a little endian system.
1257 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1258 /// The instruction does not require aligned effective memory addresses.
1259 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1260 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1261 /// are required to fix up unaligned memory addresses.
1262 /// \par Wraps
1263 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1264 /// \sa VecStore_ALTIVEC, VecStoreAligned
1265 /// \since Crypto++ 6.0
1266 template <class T>
1267 inline void VecStoreBE(const T data, int off, byte dest[16])
1268 {
1269  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1270  // word pointers. The ISA lacks stores for short* and char*.
1271  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1272 
1273  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275  CRYPTOPP_UNUSED(addr);
1276 
1277 #if defined(_ARCH_PWR9)
1278  vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279 #elif defined(CRYPTOPP_BIG_ENDIAN)
1280  VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281 #else
1283 #endif
1284 }
1285 
1286 /// \brief Stores a vector to a word array
1287 /// \tparam T vector type
1288 /// \param data the vector
1289 /// \param dest the word array
1290 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1291 /// will reverse all bytes in the array on a little endian system.
1292 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1293 /// The instruction does not require aligned effective memory addresses.
1294 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1295 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1296 /// are required to fix up unaligned memory addresses.
1297 /// \par Wraps
1298 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1299 /// \sa VecStore_ALTIVEC, VecStoreAligned
1300 /// \since Crypto++ 8.0
1301 template <class T>
1302 inline void VecStoreBE(const T data, word32 dest[4])
1303 {
1304  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1305  // word pointers. The ISA lacks stores for short* and char*.
1306  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1307 
1308  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310  CRYPTOPP_UNUSED(addr);
1311 
1312 #if defined(_ARCH_PWR9)
1313  vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314 #elif defined(CRYPTOPP_BIG_ENDIAN)
1315  VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316 #else
1318 #endif
1319 }
1320 
1321 /// \brief Stores a vector to a word array
1322 /// \tparam T vector type
1323 /// \param data the vector
1324 /// \param off offset into the dest word array
1325 /// \param dest the word array
1326 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1327 /// will reverse all words in the array on a little endian system.
1328 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1329 /// The instruction does not require aligned effective memory addresses.
1330 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1331 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1332 /// are required to fix up unaligned memory addresses.
1333 /// \par Wraps
1334 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1335 /// \sa VecStore_ALTIVEC, VecStoreAligned
1336 /// \since Crypto++ 8.0
1337 template <class T>
1338 inline void VecStoreBE(const T data, int off, word32 dest[4])
1339 {
1340  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1341  // word pointers. The ISA lacks stores for short* and char*.
1342  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1343 
1344  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346  CRYPTOPP_UNUSED(addr);
1347 
1348 #if defined(_ARCH_PWR9)
1349  vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350 #elif defined(CRYPTOPP_BIG_ENDIAN)
1351  VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352 #else
1354 #endif
1355 }
1356 
1357 //@}
1358 
1359 /// \name LOGICAL OPERATIONS
1360 //@{
1361 
1362 /// \brief AND two vectors
1363 /// \tparam T1 vector type
1364 /// \tparam T2 vector type
1365 /// \param vec1 the first vector
1366 /// \param vec2 the second vector
1367 /// \return vector
1368 /// \details VecAnd() performs <tt>vec1 & vec2</tt>.
1369 /// vec2 is cast to the same type as vec1. The return vector
1370 /// is the same type as vec1.
1371 /// \par Wraps
1372 /// vec_and
1373 /// \sa VecAnd64
1374 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Reinterpret the second operand as T1 so both sides share one type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
1380 
1381 /// \brief OR two vectors
1382 /// \tparam T1 vector type
1383 /// \tparam T2 vector type
1384 /// \param vec1 the first vector
1385 /// \param vec2 the second vector
1386 /// \return vector
1387 /// \details VecOr() performs <tt>vec1 | vec2</tt>.
1388 /// vec2 is cast to the same type as vec1. The return vector
1389 /// is the same type as vec1.
1390 /// \par Wraps
1391 /// vec_or
1392 /// \sa VecOr64
1393 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Reinterpret the second operand as T1 so both sides share one type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
1399 
1400 /// \brief XOR two vectors
1401 /// \tparam T1 vector type
1402 /// \tparam T2 vector type
1403 /// \param vec1 the first vector
1404 /// \param vec2 the second vector
1405 /// \return vector
1406 /// \details VecXor() performs <tt>vec1 ^ vec2</tt>.
1407 /// vec2 is cast to the same type as vec1. The return vector
1408 /// is the same type as vec1.
1409 /// \par Wraps
1410 /// vec_xor
1411 /// \sa VecXor64
1412 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Reinterpret the second operand as T1 so both sides share one type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
1418 
1419 //@}
1420 
1421 /// \name ARITHMETIC OPERATIONS
1422 //@{
1423 
1424 /// \brief Add two vectors
1425 /// \tparam T1 vector type
1426 /// \tparam T2 vector type
1427 /// \param vec1 the first vector
1428 /// \param vec2 the second vector
1429 /// \return vector
1430 /// \details VecAdd() performs <tt>vec1 + vec2</tt>.
1431 /// vec2 is cast to the same type as vec1. The return vector
1432 /// is the same type as vec1.
1433 /// \par Wraps
1434 /// vec_add
1435 /// \sa VecAdd64
1436 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Reinterpret the second operand as T1; the element width used by
    // vec_add therefore follows vec1's type.
    const T1 addend = (T1)vec2;
    return (T1)vec_add(vec1, addend);
}
1442 
1443 /// \brief Subtract two vectors
1444 /// \tparam T1 vector type
1445 /// \tparam T2 vector type
1446 /// \param vec1 the first vector
1447 /// \param vec2 the second vector
1448 /// \details VecSub() performs <tt>vec1 - vec2</tt>.
1449 /// vec2 is cast to the same type as vec1. The return vector
1450 /// is the same type as vec1.
1451 /// \par Wraps
1452 /// vec_sub
1453 /// \sa VecSub64
1454 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Reinterpret the second operand as T1; the element width used by
    // vec_sub therefore follows vec1's type.
    const T1 subtrahend = (T1)vec2;
    return (T1)vec_sub(vec1, subtrahend);
}
1460 
1461 //@}
1462 
1463 /// \name PERMUTE OPERATIONS
1464 //@{
1465 
1466 /// \brief Permutes a vector
1467 /// \tparam T1 vector type
1468 /// \tparam T2 vector type
1469 /// \param vec the vector
1470 /// \param mask vector mask
1471 /// \return vector
1472 /// \details VecPermute() creates a new vector from vec according to mask.
1473 /// mask is an uint8x16_p vector. The return vector is the same type as vec.
1474 /// \par Wraps
1475 /// vec_perm
1476 /// \since Crypto++ 6.0
1477 template <class T1, class T2>
1478 inline T1 VecPermute(const T1 vec, const T2 mask)
1479 {
1480  return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481 }
1482 
1483 /// \brief Permutes two vectors
1484 /// \tparam T1 vector type
1485 /// \tparam T2 vector type
1486 /// \param vec1 the first vector
1487 /// \param vec2 the second vector
1488 /// \param mask vector mask
1489 /// \return vector
1490 /// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
1491 /// mask is a uint8x16_p vector. The return vector is the same type as vec1.
1492 /// \par Wraps
1493 /// vec_perm
1494 /// \since Crypto++ 6.0
1495 template <class T1, class T2>
1496 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497 {
1498  return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499 }
1500 
1501 //@}
1502 
1503 /// \name SHIFT AND ROTATE OPERATIONS
1504 //@{
1505 
1506 /// \brief Shift a vector left
1507 /// \tparam C shift byte count
1508 /// \tparam T vector type
1509 /// \param vec the vector
1510 /// \return vector
1511 /// \details VecShiftLeftOctet() returns a new vector after shifting the
1512 /// concatenation of the zero vector and the source vector by the specified
1513 /// number of bytes. The return vector is the same type as vec.
1514 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1515 /// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1516 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1517 /// if on a big endian machine as shown below.
1518 /// <pre>
1519 /// uint8x16_p x = VecLoad(ptr);
1520 /// uint8x16_p y = VecShiftLeftOctet<12>(x);
1521 /// </pre>
1522 /// \par Wraps
1523 /// vec_sld
1524 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1525 /// endian sensitive?</A> on Stack Overflow
1526 /// \since Crypto++ 6.0
1527 template <unsigned int C, class T>
1528 inline T VecShiftLeftOctet(const T vec)
1529 {
1530  const T zero = {0};
1531  if (C >= 16)
1532  {
1533  // Out of range
1534  return zero;
1535  }
1536  else if (C == 0)
1537  {
1538  // Noop
1539  return vec;
1540  }
1541  else
1542  {
1543 #if defined(CRYPTOPP_BIG_ENDIAN)
1544  enum { R=C&0xf };
1545  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546 #else
1547  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1548  return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549 #endif
1550  }
1551 }
1552 
1553 /// \brief Shift a vector right
1554 /// \tparam C shift byte count
1555 /// \tparam T vector type
1556 /// \param vec the vector
1557 /// \return vector
1558 /// \details VecShiftRightOctet() returns a new vector after shifting the
1559 /// concatenation of the zero vector and the source vector by the specified
1560 /// number of bytes. The return vector is the same type as vec.
1561 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1562 /// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1563 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1564 /// if on a big endian machine as shown below.
1565 /// <pre>
1566 /// uint8x16_p x = VecLoad(ptr);
1567 /// uint8x16_p y = VecShiftRightOctet<12>(y);
1568 /// </pre>
1569 /// \par Wraps
1570 /// vec_sld
1571 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1572 /// endian sensitive?</A> on Stack Overflow
1573 /// \since Crypto++ 6.0
1574 template <unsigned int C, class T>
1575 inline T VecShiftRightOctet(const T vec)
1576 {
1577  const T zero = {0};
1578  if (C >= 16)
1579  {
1580  // Out of range
1581  return zero;
1582  }
1583  else if (C == 0)
1584  {
1585  // Noop
1586  return vec;
1587  }
1588  else
1589  {
1590 #if defined(CRYPTOPP_BIG_ENDIAN)
1591  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1592  return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593 #else
1594  enum { R=C&0xf };
1595  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596 #endif
1597  }
1598 }
1599 
1600 /// \brief Rotate a vector left
1601 /// \tparam C shift byte count
1602 /// \tparam T vector type
1603 /// \param vec the vector
1604 /// \return vector
1605 /// \details VecRotateLeftOctet() returns a new vector after rotating the
1606 /// concatenation of the source vector with itself by the specified
1607 /// number of bytes. The return vector is the same type as vec.
1608 /// \par Wraps
1609 /// vec_sld
1610 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1611 /// endian sensitive?</A> on Stack Overflow
1612 /// \since Crypto++ 6.0
1613 template <unsigned int C, class T>
1614 inline T VecRotateLeftOctet(const T vec)
1615 {
1616 #if defined(CRYPTOPP_BIG_ENDIAN)
1617  enum { R = C&0xf };
1618  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619 #else
1620  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1621  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622 #endif
1623 }
1624 
1625 /// \brief Rotate a vector right
1626 /// \tparam C shift byte count
1627 /// \tparam T vector type
1628 /// \param vec the vector
1629 /// \return vector
1630 /// \details VecRotateRightOctet() returns a new vector after rotating the
1631 /// concatenation of the source vector with itself by the specified
1632 /// number of bytes. The return vector is the same type as vec.
1633 /// \par Wraps
1634 /// vec_sld
1635 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1636 /// endian sensitive?</A> on Stack Overflow
1637 /// \since Crypto++ 6.0
1638 template <unsigned int C, class T>
1639 inline T VecRotateRightOctet(const T vec)
1640 {
1641 #if defined(CRYPTOPP_BIG_ENDIAN)
1642  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1643  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644 #else
1645  enum { R = C&0xf };
1646  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647 #endif
1648 }
1649 
1650 /// \brief Rotate a vector left
1651 /// \tparam C rotate bit count
1652 /// \param vec the vector
1653 /// \return vector
1654 /// \details VecRotateLeft() rotates each element in a vector by
1655 /// bit count. The return vector is the same type as vec.
1656 /// \par Wraps
1657 /// vec_rl
1658 /// \since Crypto++ 7.0
1659 template<unsigned int C>
1661 {
1662  const uint32x4_p m = {C, C, C, C};
1663  return vec_rl(vec, m);
1664 }
1665 
1666 /// \brief Rotate a vector right
1667 /// \tparam C rotate bit count
1668 /// \param vec the vector
1669 /// \return vector
1670 /// \details VecRotateRight() rotates each element in a vector
1671 /// by bit count. The return vector is the same type as vec.
1672 /// \par Wraps
1673 /// vec_rl
1674 /// \since Crypto++ 7.0
1675 template<unsigned int C>
1677 {
1678  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679  return vec_rl(vec, m);
1680 }
1681 
1682 /// \brief Shift a vector left
1683 /// \tparam C shift bit count
1684 /// \param vec the vector
1685 /// \return vector
1686 /// \details VecShiftLeft() rotates each element in a vector
1687 /// by bit count. The return vector is the same type as vec.
1688 /// \par Wraps
1689 /// vec_sl
1690 /// \since Crypto++ 8.1
1691 template<unsigned int C>
1693 {
1694  const uint32x4_p m = {C, C, C, C};
1695  return vec_sl(vec, m);
1696 }
1697 
1698 /// \brief Shift a vector right
1699 /// \tparam C shift bit count
1700 /// \param vec the vector
1701 /// \return vector
1702 /// \details VecShiftRight() rotates each element in a vector
1703 /// by bit count. The return vector is the same type as vec.
1704 /// \par Wraps
1705 /// vec_rl
1706 /// \since Crypto++ 8.1
1707 template<unsigned int C>
1709 {
1710  const uint32x4_p m = {C, C, C, C};
1711  return vec_sr(vec, m);
1712 }
1713 
1714 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1715 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716 
1717 /// \brief Rotate a vector left
1718 /// \tparam C rotate bit count
1719 /// \param vec the vector
1720 /// \return vector
1721 /// \details VecRotateLeft() rotates each element in a vector
1722 /// by bit count. The return vector is the same type as vec.
1723 /// \details VecRotateLeft() with 64-bit elements is available on
1724 /// POWER8 and above.
1725 /// \par Wraps
1726 /// vec_rl
1727 /// \since Crypto++ 8.0
1728 template<unsigned int C>
1730 {
1731  const uint64x2_p m = {C, C};
1732  return vec_rl(vec, m);
1733 }
1734 
1735 /// \brief Shift a vector left
1736 /// \tparam C shift bit count
1737 /// \param vec the vector
1738 /// \return vector
1739 /// \details VecShiftLeft() rotates each element in a vector
1740 /// by bit count. The return vector is the same type as vec.
1741 /// \details VecShiftLeft() with 64-bit elements is available on
1742 /// POWER8 and above.
1743 /// \par Wraps
1744 /// vec_sl
1745 /// \since Crypto++ 8.1
1746 template<unsigned int C>
1748 {
1749  const uint64x2_p m = {C, C};
1750  return vec_sl(vec, m);
1751 }
1752 
1753 /// \brief Rotate a vector right
1754 /// \tparam C rotate bit count
1755 /// \param vec the vector
1756 /// \return vector
1757 /// \details VecRotateRight() rotates each element in a vector
1758 /// by bit count. The return vector is the same type as vec.
1759 /// \details VecRotateRight() with 64-bit elements is available on
1760 /// POWER8 and above.
1761 /// \par Wraps
1762 /// vec_rl
1763 /// \since Crypto++ 8.0
1764 template<unsigned int C>
1766 {
1767  const uint64x2_p m = {64-C, 64-C};
1768  return vec_rl(vec, m);
1769 }
1770 
1771 /// \brief Shift a vector right
1772 /// \tparam C shift bit count
1773 /// \param vec the vector
1774 /// \return vector
1775 /// \details VecShiftRight() rotates each element in a vector
1776 /// by bit count. The return vector is the same type as vec.
1777 /// \details VecShiftRight() with 64-bit elements is available on
1778 /// POWER8 and above.
1779 /// \par Wraps
1780 /// vec_sr
1781 /// \since Crypto++ 8.1
1782 template<unsigned int C>
1784 {
1785  const uint64x2_p m = {C, C};
1786  return vec_sr(vec, m);
1787 }
1788 
1789 #endif // ARCH_PWR8
1790 
1791 //@}
1792 
1793 /// \name OTHER OPERATIONS
1794 //@{
1795 
/// \brief Merge the low elements of two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeLow() forwards both vectors to <tt>vec_mergel</tt>
///  and returns the result as type <tt>T</tt>.
/// \par Wraps
///  vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    const T merged = vec_mergel(vec1, vec2);
    return merged;
}
1809 
/// \brief Merge the high elements of two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeHigh() forwards both vectors to <tt>vec_mergeh</tt>
///  and returns the result as type <tt>T</tt>.
/// \par Wraps
///  vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    const T merged = vec_mergeh(vec1, vec2);
    return merged;
}
1823 
1824 /// \brief Broadcast 32-bit word to a vector
1825 /// \param val the 32-bit value
1826 /// \return vector
1827 /// \par Wraps
1828 /// vec_splats
1829 /// \since Crypto++ 8.3
1831 {
1832  // Fix spurious GCC warning???
1833  CRYPTOPP_UNUSED(val);
1834 
1835  // Apple Altivec and XL C++ do not offer vec_splats.
1836  // GCC offers vec_splats back to -mcpu=power4.
1837 #if defined(_ARCH_PWR4) && defined(__GNUC__)
1838  return vec_splats(val);
1839 #else
1840  //const word32 x[4] = {val,val,val,val};
1841  //return VecLoad(x);
1842  const word32 x[4] = {val};
1843  return vec_splat(VecLoad(x),0);
1844 #endif
1845 }
1846 
1847 /// \brief Broadcast 32-bit element to a vector
1848 /// \tparam the element number
1849 /// \param val the 32-bit value
1850 /// \return vector
1851 /// \par Wraps
1852 /// vec_splat
1853 /// \since Crypto++ 8.3
1854 template <unsigned int N>
1856 {
1857  return vec_splat(val, N);
1858 }
1859 
1860 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1861 /// \brief Broadcast 64-bit double word to a vector
1862 /// \param val the 64-bit value
1863 /// \return vector
1864 /// \par Wraps
1865 /// vec_splats
1866 /// \since Crypto++ 8.3
1868 {
1869  // The PPC64 ABI says so.
1870  return vec_splats((unsigned long long)val);
1871 }
1872 
1873 /// \brief Broadcast 64-bit element to a vector
1874 /// \tparam the element number
1875 /// \param val the 64-bit value
1876 /// \return vector
1877 /// \par Wraps
1878 /// vec_splat
1879 /// \since Crypto++ 8.3
1880 template <unsigned int N>
1882 {
1883 #if defined(__VSX__) || defined(_ARCH_PWR8)
1884  return vec_splat(val, N);
1885 #else
1886  enum {E=N&1};
1887  if (E == 0)
1888  {
1889  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890  return vec_perm(val, val, m);
1891  }
1892  else // (E == 1)
1893  {
1894  const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895  return vec_perm(val, val, m);
1896  }
1897 #endif
1898 }
1899 #endif
1900 
1901 /// \brief Extract a dword from a vector
1902 /// \tparam T vector type
1903 /// \param val the vector
1904 /// \return vector created from low dword
1905 /// \details VecGetLow() extracts the low dword from a vector. The low dword
1906 /// is composed of the least significant bits and occupies bytes 8 through 15
1907 /// when viewed as a big endian array. The return vector is the same type as
1908 /// the original vector and padded with 0's in the most significant bit positions.
1909 /// \par Wraps
1910 /// vec_sld
1911 /// \since Crypto++ 7.0
1912 template <class T>
1913 inline T VecGetLow(const T val)
1914 {
1915 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916  const T zero = {0};
1917  return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918 #else
1919  return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920 #endif
1921 }
1922 
1923 /// \brief Extract a dword from a vector
1924 /// \tparam T vector type
1925 /// \param val the vector
1926 /// \return vector created from high dword
1927 /// \details VecGetHigh() extracts the high dword from a vector. The high dword
1928 /// is composed of the most significant bits and occupies bytes 0 through 7
1929 /// when viewed as a big endian array. The return vector is the same type as
1930 /// the original vector and padded with 0's in the most significant bit positions.
1931 /// \par Wraps
1932 /// vec_sld
1933 /// \since Crypto++ 7.0
1934 template <class T>
1935 inline T VecGetHigh(const T val)
1936 {
1937 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938  const T zero = {0};
1939  return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940 #else
1941  return VecShiftRightOctet<8>(val);
1942 #endif
1943 }
1944 
1945 /// \brief Exchange high and low double words
1946 /// \tparam T vector type
1947 /// \param vec the vector
1948 /// \return vector
1949 /// \par Wraps
1950 /// vec_sld
1951 /// \since Crypto++ 7.0
1952 template <class T>
1953 inline T VecSwapWords(const T vec)
1954 {
1955  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956 }
1957 
1958 //@}
1959 
1960 /// \name COMPARISON
1961 //@{
1962 
1963 /// \brief Compare two vectors
1964 /// \tparam T1 vector type
1965 /// \tparam T2 vector type
1966 /// \param vec1 the first vector
1967 /// \param vec2 the second vector
1968 /// \return true if vec1 equals vec2, false otherwise
1969 /// \details VecEqual() performs a bitwise compare. The vector element types do
1970 /// not matter.
1971 /// \par Wraps
1972 /// vec_all_eq
1973 /// \since Crypto++ 8.0
1974 template <class T1, class T2>
1975 inline bool VecEqual(const T1 vec1, const T2 vec2)
1976 {
1977  return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978 }
1979 
1980 /// \brief Compare two vectors
1981 /// \tparam T1 vector type
1982 /// \tparam T2 vector type
1983 /// \param vec1 the first vector
1984 /// \param vec2 the second vector
1985 /// \return true if vec1 does not equal vec2, false otherwise
1986 /// \details VecNotEqual() performs a bitwise compare. The vector element types do
1987 /// not matter.
1988 /// \par Wraps
1989 /// vec_all_eq
1990 /// \since Crypto++ 8.0
1991 template <class T1, class T2>
1992 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993 {
1994  return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995 }
1996 
1997 //@}
1998 
1999 ////////////////// 32-bit Altivec /////////////////
2000 
2001 /// \name 32-BIT ALTIVEC
2002 //@{
2003 
2004 /// \brief Add two vectors as if uint64x2_p
2005 /// \param vec1 the first vector
2006 /// \param vec2 the second vector
2007 /// \return vector
2008 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2009 /// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2010 /// the carries from the elements.
2011 /// \par Wraps
2012 /// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
2013 /// \since Crypto++ 8.3
2014 inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2015 {
2016  // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2017 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2018  return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2019 #else
2020  // The carry mask selects carrys for elements 1 and 3 and sets
2021  // remaining elements to 0. The results is then shifted so the
2022  // carried values are added to elements 0 and 2.
2023 #if defined(CRYPTOPP_BIG_ENDIAN)
2024  const uint32x4_p zero = {0, 0, 0, 0};
2025  const uint32x4_p mask = {0, 1, 0, 1};
2026 #else
2027  const uint32x4_p zero = {0, 0, 0, 0};
2028  const uint32x4_p mask = {1, 0, 1, 0};
2029 #endif
2030 
2031  uint32x4_p cy = vec_addc(vec1, vec2);
2032  uint32x4_p res = vec_add(vec1, vec2);
2033  cy = vec_and(mask, cy);
2034  cy = vec_sld (cy, zero, 4);
2035  return vec_add(res, cy);
2036 #endif
2037 }
2038 
2039 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2040 /// \brief Add two vectors as if uint64x2_p
2041 /// \param vec1 the first vector
2042 /// \param vec2 the second vector
2043 /// \return vector
2044 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2045 /// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2046 /// the carries from the elements.
2047 /// \par Wraps
2048 /// vec_add for POWER8
2049 /// \since Crypto++ 8.3
2050 inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2051 {
2052  // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2053  const uint64x2_p res = vec_add(vec1, vec2);
2054 
2055 #if defined(CRYPTOPP_DEBUG)
2056  // Test 32-bit add in debug builds while we are here.
2057  const uint32x4_p x = (uint32x4_p)vec1;
2058  const uint32x4_p y = (uint32x4_p)vec2;
2059  const uint32x4_p r = VecAdd64(x, y);
2060 
2061  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2062 #endif
2063 
2064  return res;
2065 }
2066 #endif
2067 
2068 /// \brief Subtract two vectors as if uint64x2_p
2069 /// \param vec1 the first vector
2070 /// \param vec2 the second vector
2071 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2072 /// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2073 /// manages the borrows from the elements.
2074 /// \par Wraps
2075 /// vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
2076 /// \since Crypto++ 8.3
2077 inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2078 {
2079 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2080  // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2081  return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2082 #else
2083  // The borrow mask selects borrows for elements 1 and 3 and sets
2084  // remaining elements to 0. The results is then shifted so the
2085  // borrowed values are subtracted from elements 0 and 2.
2086 #if defined(CRYPTOPP_BIG_ENDIAN)
2087  const uint32x4_p zero = {0, 0, 0, 0};
2088  const uint32x4_p mask = {0, 1, 0, 1};
2089 #else
2090  const uint32x4_p zero = {0, 0, 0, 0};
2091  const uint32x4_p mask = {1, 0, 1, 0};
2092 #endif
2093 
2094  // subc sets the complement of borrow, so we have to
2095  // un-complement it using andc.
2096  uint32x4_p bw = vec_subc(vec1, vec2);
2097  uint32x4_p res = vec_sub(vec1, vec2);
2098  bw = vec_andc(mask, bw);
2099  bw = vec_sld (bw, zero, 4);
2100  return vec_sub(res, bw);
2101 #endif
2102 }
2103 
2104 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2105 /// \brief Subtract two vectors as if uint64x2_p
2106 /// \param vec1 the first vector
2107 /// \param vec2 the second vector
2108 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2109 /// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2110 /// manages the borrows from the elements.
2111 /// \par Wraps
2112 /// vec_sub for POWER8
2113 /// \since Crypto++ 8.3
2114 inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2115 {
2116  // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2117  const uint64x2_p res = vec_sub(vec1, vec2);
2118 
2119 #if defined(CRYPTOPP_DEBUG)
2120  // Test 32-bit sub in debug builds while we are here.
2121  const uint32x4_p x = (uint32x4_p)vec1;
2122  const uint32x4_p y = (uint32x4_p)vec2;
2123  const uint32x4_p r = VecSub64(x, y);
2124 
2125  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2126 #endif
2127 
2128  return res;
2129 }
2130 #endif
2131 
2132 /// \brief Rotate a vector left as if uint64x2_p
2133 /// \tparam C rotate bit count
2134 /// \param vec the vector
2135 /// \return vector
2136 /// \details VecRotateLeft() rotates each element in a vector by bit count.
2137 /// vec is rotated as if uint64x2_p.
2138 /// \par Wraps
2139 /// vec_rl
2140 /// \since Crypto++ 8.3
2141 template<unsigned int C>
2143 {
2144 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2146  return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147 #else
2148  // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2149  enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150 
2151  // Get the low bits, shift them to high bits
2152  uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153  // Get the high bits, shift them to low bits
2154  uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155 
2156  if (S64 == 0)
2157  {
2158  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159  return VecPermute(vec, m);
2160  }
2161  else if (S64 == 32)
2162  {
2163  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164  return VecPermute(vec, m);
2165  }
2166  else if (BR) // Big rotate amount?
2167  {
2168  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169  t1 = VecPermute(t1, m);
2170  }
2171  else
2172  {
2173  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174  t2 = VecPermute(t2, m);
2175  }
2176 
2177  return vec_or(t1, t2);
2178 #endif
2179 }
2180 
2181 /// \brief Rotate a vector left as if uint64x2_p
2182 /// \param vec the vector
2183 /// \return vector
2184 /// \details VecRotateLeft<8>() rotates each element in a vector
2185 /// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2186 /// is used by algorithms like Speck128.
2187 /// \par Wraps
2188 /// vec_rl
2189 /// \since Crypto++ 8.3
2190 template<>
2192 {
2193 #if (CRYPTOPP_BIG_ENDIAN)
2194  const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195  return VecPermute(vec, m);
2196 #else
2197  const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198  return VecPermute(vec, m);
2199 #endif
2200 }
2201 
2202 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2203 /// \brief Rotate a vector left as if uint64x2_p
2204 /// \tparam C rotate bit count
2205 /// \param vec the vector
2206 /// \return vector
2207 /// \details VecRotateLeft64() rotates each element in a vector by
2208 /// bit count. vec is rotated as if uint64x2_p.
2209 /// \par Wraps
2210 /// vec_rl
2211 /// \since Crypto++ 8.3
2212 template<unsigned int C>
2214 {
2215  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2216  const uint64x2_p res = VecRotateLeft<C>(vec);
2217 
2218 #if defined(CRYPTOPP_DEBUG)
2219  // Test 32-bit rotate in debug builds while we are here.
2220  const uint32x4_p x = (uint32x4_p)vec;
2221  const uint32x4_p r = VecRotateLeft64<C>(x);
2222 
2223  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224 #endif
2225 
2226  return res;
2227 }
2228 #endif
2229 
2230 /// \brief Rotate a vector right as if uint64x2_p
2231 /// \tparam C rotate bit count
2232 /// \param vec the vector
2233 /// \return vector
2234 /// \details VecRotateRight64() rotates each element in a vector by
2235 /// bit count. vec is rotated as if uint64x2_p.
2236 /// \par Wraps
2237 /// vec_rl
2238 /// \since Crypto++ 8.3
2239 template<unsigned int C>
2241 {
2242 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2244  return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245 #else
2246  // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2247  enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248 
2249  // Get the low bits, shift them to high bits
2250  uint32x4_p t1 = VecShiftRight<S32>(vec);
2251  // Get the high bits, shift them to low bits
2252  uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253 
2254  if (S64 == 0)
2255  {
2256  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257  return VecPermute(vec, m);
2258  }
2259  else if (S64 == 32)
2260  {
2261  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262  return VecPermute(vec, m);
2263  }
2264  else if (BR) // Big rotate amount?
2265  {
2266  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267  t1 = VecPermute(t1, m);
2268  }
2269  else
2270  {
2271  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272  t2 = VecPermute(t2, m);
2273  }
2274 
2275  return vec_or(t1, t2);
2276 #endif
2277 }
2278 
2279 /// \brief Rotate a vector right as if uint64x2_p
2280 /// \param vec the vector
2281 /// \return vector
2282 /// \details VecRotateRight64<8>() rotates each element in a vector
2283 /// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2284 /// is used by algorithms like Speck128.
2285 /// \details vec is rotated as if uint64x2_p.
2286 /// \par Wraps
2287 /// vec_rl
2288 /// \since Crypto++ 8.3
2289 template<>
2291 {
2292 #if (CRYPTOPP_BIG_ENDIAN)
2293  const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294  return VecPermute(vec, m);
2295 #else
2296  const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297  return VecPermute(vec, m);
2298 #endif
2299 }
2300 
2301 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2302 /// \brief Rotate a vector right as if uint64x2_p
2303 /// \tparam C rotate bit count
2304 /// \param vec the vector
2305 /// \return vector
2306 /// \details VecRotateRight64() rotates each element in a vector by
2307 /// bit count. vec is rotated as if uint64x2_p.
2308 /// \par Wraps
2309 /// vec_rl
2310 /// \since Crypto++ 8.3
2311 template<unsigned int C>
2313 {
2314  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2315  const uint64x2_p res = VecRotateRight<C>(vec);
2316 
2317 #if defined(CRYPTOPP_DEBUG)
2318  // Test 32-bit rotate in debug builds while we are here.
2319  const uint32x4_p x = (uint32x4_p)vec;
2320  const uint32x4_p r = VecRotateRight64<C>(x);
2321 
2322  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2323 #endif
2324 
2325  return res;
2326 }
2327 #endif
2328 
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 & vec2</tt>. vec2 is cast to the
///  same type as vec1. The return vector is the same type as vec1.
/// \details VecAnd64() is a convenience function. A bitwise AND does not
///  depend on the element width.
/// \par Wraps
///  vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
2347 
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>. vec2 is cast to the
///  same type as vec1. The return vector is the same type as vec1.
/// \details VecOr64() is a convenience function. A bitwise OR does not
///  depend on the element width.
/// \par Wraps
///  vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
2366 
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>. vec2 is cast to the
///  same type as vec1. The return vector is the same type as vec1.
/// \details VecXor64() is a convenience function. A bitwise XOR does not
///  depend on the element width.
/// \par Wraps
///  vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
2385 
2386 /// \brief Broadcast 64-bit double word to a vector
2387 /// \param val the 64-bit value
2388 /// \return vector
2389 /// \par Wraps
2390 /// vec_splats
2391 /// \since Crypto++ 8.3
2393 {
2394 #if defined(_ARCH_PWR8)
2395  // The PPC64 ABI says so.
2396  return (uint32x4_p)vec_splats((unsigned long long)val);
2397 #else
2398  const word64 x[2] = {val,val};
2399  return (uint32x4_p)VecLoad((const word32*)x);
2400 #endif
2401 }
2402 
2403 /// \brief Broadcast 64-bit element to a vector as if uint64x2_p
2404 /// \tparam the element number
2405 /// \param val the 64-bit value
2406 /// \return vector
2407 /// \par Wraps
2408 /// vec_splat
2409 /// \since Crypto++ 8.3
2410 template <unsigned int N>
2412 {
2413 #if defined(__VSX__) || defined(_ARCH_PWR8)
2414  return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415 #else
2416  enum {E=N&1};
2417  if (E == 0)
2418  {
2419  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420  return (uint32x4_p)vec_perm(val, val, m);
2421  }
2422  else // (E == 1)
2423  {
2424  const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425  return (uint32x4_p)vec_perm(val, val, m);
2426  }
2427 #endif
2428 }
2429 
2430 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2431 /// \brief Broadcast 64-bit element to a vector
2432 /// \tparam the element number
2433 /// \param val the 64-bit value
2434 /// \return vector
2435 /// \since Crypto++ 8.3
2436 template <unsigned int N>
2438 {
2439  return vec_splat(val, N);
2440 }
2441 #endif
2442 
2443 //@}
2444 
2445 //////////////////////// Power8 Crypto ////////////////////////
2446 
2447 // __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2448 // when it is not available, like with Power7. Sigh...
2449 #if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450 
2451 /// \name POLYNOMIAL MULTIPLICATION
2452 //@{
2453 
2454 /// \brief Polynomial multiplication
2455 /// \param a the first term
2456 /// \param b the second term
2457 /// \return vector product
2458 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2459 /// polynomial multiplication multiplies the high and low terms, and then
2460 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2461 /// al*bl</tt>. It is different behavior than Intel polynomial
2462 /// multiplication. To obtain a single product without the XOR, then set
2463 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2464 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2465 /// \par Wraps
2466 /// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
2467 /// \since Crypto++ 8.1
2469 {
2470 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471  return __vpmsumw (a, b);
2472 #elif defined(__clang__)
2473  return __builtin_altivec_crypto_vpmsumw (a, b);
2474 #else
2475  return __builtin_crypto_vpmsumw (a, b);
2476 #endif
2477 }
2478 
2479 /// \brief Polynomial multiplication
2480 /// \param a the first term
2481 /// \param b the second term
2482 /// \return vector product
2483 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2484 /// polynomial multiplication multiplies the high and low terms, and then
2485 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2486 /// al*bl</tt>. It is different behavior than Intel polynomial
2487 /// multiplication. To obtain a single product without the XOR, then set
2488 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2489 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2490 /// \par Wraps
2491 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2492 /// \since Crypto++ 8.1
2494 {
2495 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496  return __vpmsumd (a, b);
2497 #elif defined(__clang__)
2498  return __builtin_altivec_crypto_vpmsumd (a, b);
2499 #else
2500  return __builtin_crypto_vpmsumd (a, b);
2501 #endif
2502 }
2503 
2504 /// \brief Polynomial multiplication
2505 /// \param a the first term
2506 /// \param b the second term
2507 /// \return vector product
2508 /// \details VecIntelMultiply00() performs polynomial multiplication and presents
2509 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
2510 /// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
2511 /// are multiplied.
2512 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2513 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2514 /// \par Wraps
2515 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2516 /// \since Crypto++ 8.0
2518 {
2519 #if defined(CRYPTOPP_BIG_ENDIAN)
2521 #else
2522  return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523 #endif
2524 }
2525 
2526 /// \brief Polynomial multiplication
2527 /// \param a the first term
2528 /// \param b the second term
2529 /// \return vector product
2530 /// \details VecIntelMultiply01 performs() polynomial multiplication and presents
2531 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
2532 /// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
2533 /// 64-bits of <tt>b</tt> are multiplied.
2534 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2535 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2536 /// \par Wraps
2537 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2538 /// \since Crypto++ 8.0
2540 {
2541 #if defined(CRYPTOPP_BIG_ENDIAN)
2542  return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2543 #else
2544  return VecPolyMultiply(a, VecGetHigh(b));
2545 #endif
2546 }
2547 
2548 /// \brief Polynomial multiplication
2549 /// \param a the first term
2550 /// \param b the second term
2551 /// \return vector product
2552 /// \details VecIntelMultiply10() performs polynomial multiplication and presents
2553 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
2554 /// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
2555 /// 64-bits of <tt>b</tt> are multiplied.
2556 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2557 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2558 /// \par Wraps
2559 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2560 /// \since Crypto++ 8.0
2562 {
2563 #if defined(CRYPTOPP_BIG_ENDIAN)
2564  return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2565 #else
2566  return VecPolyMultiply(VecGetHigh(a), b);
2567 #endif
2568 }
2569 
2570 /// \brief Polynomial multiplication
2571 /// \param a the first term
2572 /// \param b the second term
2573 /// \return vector product
2574 /// \details VecIntelMultiply11() performs polynomial multiplication and presents
2575 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
2576 /// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
2577 /// are multiplied.
2578 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2579 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2580 /// \par Wraps
2581 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2582 /// \since Crypto++ 8.0
2584 {
2585 #if defined(CRYPTOPP_BIG_ENDIAN)
2586  return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587 #else
2588  return VecPolyMultiply(VecGetLow(a), b);
2589 #endif
2590 }
2591 
2592 //@}
2593 
2594 /// \name AES ENCRYPTION
2595 //@{
2596 
2597 /// \brief One round of AES encryption
2598 /// \tparam T1 vector type
2599 /// \tparam T2 vector type
2600 /// \param state the state vector
2601 /// \param key the subkey vector
2602 /// \details VecEncrypt() performs one round of AES encryption of state
2603 /// using subkey key. The return vector is the same type as state.
2604 /// \details VecEncrypt() is available on POWER8 and above.
2605 /// \par Wraps
2606 /// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
2607 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2608 template <class T1, class T2>
2609 inline T1 VecEncrypt(const T1 state, const T2 key)
2610 {
2611 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612  return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613 #elif defined(__clang__)
2614  return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615 #elif defined(__GNUC__)
2616  return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617 #else
2618  CRYPTOPP_ASSERT(0);
2619 #endif
2620 }
2621 
2622 /// \brief Final round of AES encryption
2623 /// \tparam T1 vector type
2624 /// \tparam T2 vector type
2625 /// \param state the state vector
2626 /// \param key the subkey vector
2627 /// \details VecEncryptLast() performs the final round of AES encryption
2628 /// of state using subkey key. The return vector is the same type as state.
2629 /// \details VecEncryptLast() is available on POWER8 and above.
2630 /// \par Wraps
2631 /// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
2632 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2633 template <class T1, class T2>
2634 inline T1 VecEncryptLast(const T1 state, const T2 key)
2635 {
2636 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637  return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638 #elif defined(__clang__)
2639  return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640 #elif defined(__GNUC__)
2641  return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642 #else
2643  CRYPTOPP_ASSERT(0);
2644 #endif
2645 }
2646 
2647 /// \brief One round of AES decryption
2648 /// \tparam T1 vector type
2649 /// \tparam T2 vector type
2650 /// \param state the state vector
2651 /// \param key the subkey vector
2652 /// \details VecDecrypt() performs one round of AES decryption of state
2653 /// using subkey key. The return vector is the same type as state.
2654 /// \details VecDecrypt() is available on POWER8 and above.
2655 /// \par Wraps
2656 /// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
2657 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2658 template <class T1, class T2>
2659 inline T1 VecDecrypt(const T1 state, const T2 key)
2660 {
2661 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662  return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663 #elif defined(__clang__)
2664  return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665 #elif defined(__GNUC__)
2666  return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667 #else
2668  CRYPTOPP_ASSERT(0);
2669 #endif
2670 }
2671 
2672 /// \brief Final round of AES decryption
2673 /// \tparam T1 vector type
2674 /// \tparam T2 vector type
2675 /// \param state the state vector
2676 /// \param key the subkey vector
2677 /// \details VecDecryptLast() performs the final round of AES decryption
2678 /// of state using subkey key. The return vector is the same type as state.
2679 /// \details VecDecryptLast() is available on POWER8 and above.
2680 /// \par Wraps
2681 /// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
2682 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2683 template <class T1, class T2>
2684 inline T1 VecDecryptLast(const T1 state, const T2 key)
2685 {
2686 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687  return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688 #elif defined(__clang__)
2689  return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690 #elif defined(__GNUC__)
2691  return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692 #else
2693  CRYPTOPP_ASSERT(0);
2694 #endif
2695 }
2696 
2697 //@}
2698 
2699 /// \name SHA DIGESTS
2700 //@{
2701 
2702 /// \brief SHA256 Sigma functions
2703 /// \tparam func function
2704 /// \tparam fmask function mask
2705 /// \tparam T vector type
2706 /// \param data the block to transform
2707 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
2708 /// func and fmask. The return vector is the same type as data.
2709 /// \details VecSHA256() is available on POWER8 and above.
2710 /// \par Wraps
2711 /// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
2712 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2713 template <int func, int fmask, class T>
2714 inline T VecSHA256(const T data)
2715 {
2716 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717  return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718 #elif defined(__clang__)
2719  return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720 #elif defined(__GNUC__)
2721  return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722 #else
2723  CRYPTOPP_ASSERT(0);
2724 #endif
2725 }
2726 
2727 /// \brief SHA512 Sigma functions
2728 /// \tparam func function
2729 /// \tparam fmask function mask
2730 /// \tparam T vector type
2731 /// \param data the block to transform
2732 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
2733 /// func and fmask. The return vector is the same type as data.
2734 /// \details VecSHA512() is available on POWER8 and above.
2735 /// \par Wraps
2736 /// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
2737 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2738 template <int func, int fmask, class T>
2739 inline T VecSHA512(const T data)
2740 {
2741 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742  return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743 #elif defined(__clang__)
2744  return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745 #elif defined(__GNUC__)
2746  return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747 #else
2748  CRYPTOPP_ASSERT(0);
2749 #endif
2750 }
2751 
2752 //@}
2753 
2754 #endif // __CRYPTO__
2755 
2756 #endif // _ALTIVEC_
2757 
2758 NAMESPACE_END
2759 
2760 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761 # pragma GCC diagnostic pop
2762 #endif
2763 
2764 #endif // CRYPTOPP_PPC_CRYPTO_H
Library configuration file.
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:72
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:101
Utility functions for the Crypto++ library.
Crypto++ library namespace.
uint32x4_p VecZero()
The 0 vector.
Definition: ppc_simd.h:218
uint32x4_p VecRotateRight(const uint32x4_p vec)
Rotate a vector right.
Definition: ppc_simd.h:1676
T1 VecOr(const T1 vec1, const T2 vec2)
OR two vectors.
Definition: ppc_simd.h:1395
T VecSHA512(const T data)
SHA512 Sigma functions.
Definition: ppc_simd.h:2739
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:742
void VecStore_ALTIVEC(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:817
T1 VecOr64(const T1 vec1, const T2 vec2)
OR two vectors as if uint64x2_p.
Definition: ppc_simd.h:2362
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
Definition: ppc_simd.h:560
T VecRotateRightOctet(const T vec)
Rotate a vector right.
Definition: ppc_simd.h:1639
T VecShiftRightOctet(const T vec)
Shift a vector right.
Definition: ppc_simd.h:1575
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:202
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:1231
T VecShiftLeftOctet(const T vec)
Shift a vector left.
Definition: ppc_simd.h:1528
T VecSHA256(const T data)
SHA256 Sigma functions.
Definition: ppc_simd.h:2714
uint32x4_p VecSub64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Subtract two vectors as if uint64x2_p.
Definition: ppc_simd.h:2077
uint32x4_p VecLoad_ALTIVEC(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:308
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1478
uint64x2_p VecIntelMultiply00(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2517
T VecReverseLE(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:263
T VecMergeHigh(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1819
uint32x4_p VecSplatElement(const uint32x4_p val)
Broadcast 32-bit element to a vector.
Definition: ppc_simd.h:1855
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:192
bool VecNotEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1992
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1414
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:212
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:1456
void VecStoreAligned(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:1100
#define NCONST_V32_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:169
bool VecEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1975
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
Definition: ppc_simd.h:2634
uint32x4_p VecSplatElement64(const uint32x4_p val)
Broadcast 64-bit element to a vector as if uint64x2_p.
Definition: ppc_simd.h:2411
T VecMergeLow(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1805
#define CONST_V8_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:145
T1 VecXor64(const T1 vec1, const T2 vec2)
XOR two vectors as if uint64x2_p.
Definition: ppc_simd.h:2381
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
Definition: ppc_simd.h:2609
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
Definition: ppc_simd.h:2684
uint32x4_p VecPolyMultiply(const uint32x4_p &a, const uint32x4_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2468
uint32x4_p VecRotateRight64(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
Definition: ppc_simd.h:2240
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:1438
uint32x4_p VecRotateLeft(const uint32x4_p vec)
Rotate a vector left.
Definition: ppc_simd.h:1660
T VecRotateLeftOctet(const T vec)
Rotate a vector left.
Definition: ppc_simd.h:1614
uint32x4_p VecSplatWord64(word64 val)
Broadcast 64-bit double word to a vector.
Definition: ppc_simd.h:2392
uint64x2_p VecIntelMultiply11(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2583
T1 VecAnd(const T1 vec1, const T2 vec2)
AND two vectors.
Definition: ppc_simd.h:1376
uint32x4_p VecShiftRight(const uint32x4_p vec)
Shift a vector right.
Definition: ppc_simd.h:1708
T VecGetHigh(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1935
uint32x4_p VecRotateRight64< 8 >(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
Definition: ppc_simd.h:2290
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
Definition: ppc_simd.h:2659
#define NCONST_V8_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:163
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:895
T VecReverse(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:242
uint32x4_p VecShiftLeft(const uint32x4_p vec)
Shift a vector left.
Definition: ppc_simd.h:1692
#define CONST_V32_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:151
uint32x4_p VecSplatWord(word32 val)
Broadcast 32-bit word to a vector.
Definition: ppc_simd.h:1830
uint64x2_p VecIntelMultiply01(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2539
uint32x4_p VecOne()
The 1 vector.
Definition: ppc_simd.h:227
T VecReverseBE(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:283
T VecGetLow(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1913
uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Add two vectors as if uint64x2_p.
Definition: ppc_simd.h:2014
T VecSwapWords(const T vec)
Exchange high and low double words.
Definition: ppc_simd.h:1953
__vector unsigned short uint16x8_p
Vector of 16-bit elements.
Definition: ppc_simd.h:197
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:369
uint32x4_p VecRotateLeft64(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
Definition: ppc_simd.h:2142
uint64x2_p VecIntelMultiply10(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2561
uint32x4_p VecRotateLeft64< 8 >(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
Definition: ppc_simd.h:2191
T1 VecAnd64(const T1 vec1, const T2 vec2)
AND two vectors as if uint64x2_p.
Definition: ppc_simd.h:2343
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68