Crypto++  8.8
Free C++ class library of cryptographic schemes
xts.cpp
1 // xts.cpp - written and placed in the public domain by Jeffrey Walton
2 
3 // Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
4 // base architecture. We can use the SIMD code below without an
5 // architecture option. No runtime tests are required. Unfortunately,
6 // we can't use it on Altivec because an architecture switch is required.
7 // The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
8 // 16-byte block sizes.
9 
10 #include "pch.h"
11 
12 #include "xts.h"
13 #include "misc.h"
14 #include "modes.h"
15 #include "cpu.h"
16 
17 #if defined(CRYPTOPP_DEBUG)
18 # include "aes.h"
19 # include "threefish.h"
20 #endif
21 
22 // 0.3 to 0.4 cpb profit
23 #if defined(__SSE2__) || defined(_M_X64)
24 # include <emmintrin.h>
25 #endif
26 
27 #if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
28 # if (CRYPTOPP_ARM_NEON_HEADER) || (CRYPTOPP_ARM_ASIMD_AVAILABLE)
29 # include <arm_neon.h>
30 # endif
31 #endif
32 
33 #if defined(__ALTIVEC__)
34 # include "ppc_simd.h"
35 #endif
36 
37 ANONYMOUS_NAMESPACE_BEGIN
38 
39 using namespace CryptoPP;
40 
41 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
42 
43 using CryptoPP::AES;
44 using CryptoPP::XTS_Mode;
45 using CryptoPP::Threefish512;
46 
47 void Modes_TestInstantiations()
48 {
49  XTS_Mode<AES>::Encryption m0;
50  XTS_Mode<AES>::Decryption m1;
51  XTS_Mode<AES>::Encryption m2;
52  XTS_Mode<AES>::Decryption m3;
53 
54 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
55  XTS_Mode<Threefish512>::Encryption m4;
56  XTS_Mode<Threefish512>::Decryption m5;
57 #endif
58 }
59 #endif // CRYPTOPP_DEBUG
60 
61 inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
62 {
63  CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
64 
65 #if defined(CRYPTOPP_DISABLE_ASM)
66  xorbuf(output, input, mask, count);
67 
68 #elif defined(__SSE2__) || defined(_M_X64)
69  for (size_t i=0; i<count; i+=16)
70  _mm_storeu_si128(M128_CAST(output+i),
71  _mm_xor_si128(
72  _mm_loadu_si128(CONST_M128_CAST(input+i)),
73  _mm_loadu_si128(CONST_M128_CAST(mask+i))));
74 
75 #elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
76  for (size_t i=0; i<count; i+=16)
77  vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
78 
79 #elif defined(__ALTIVEC__)
80  for (size_t i=0; i<count; i+=16)
81  VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);
82 
83 #else
84  xorbuf(output, input, mask, count);
85 #endif
86 }
87 
88 inline void XorBuffer(byte *buf, const byte *mask, size_t count)
89 {
90  XorBuffer(buf, buf, mask, count);
91 }
92 
93 // Borrowed from CMAC, but little-endian representation
// Writes to out the len-byte value read from in, shifted left by one bit.
// The value is handled as one little-endian integer: words are read in
// LITTLE_ENDIAN_ORDER at increasing addresses and the bit shifted out of
// each word is carried into the next one. If a bit falls out of the last
// word, a fixed constant selected by len is XORed into the lowest bytes.
//   out - destination, len bytes; may be the same buffer as in (the
//         one-pointer overload calls this function that way)
//   in  - source, len bytes
//   len - size in bytes; asserted to be exactly 16 unless
//         CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS is enabled, in which case the
//         switch below handles 16, 32, 64 and 128
// The three preprocessor branches that follow perform the same shift and
// differ only in the word width (128, 64 or 32 bits).
94 inline void GF_Double(byte *out, const byte* in, unsigned int len)
95 {
96 #if defined(CRYPTOPP_WORD128_AVAILABLE)
97  word128 carry = 0, x;
98  for (size_t i=0, idx=0; i<len/16; ++i, idx+=16)
99  {
100  x = GetWord<word128>(false, LITTLE_ENDIAN_ORDER, in+idx);
// y keeps the top bit before it is shifted out; the previous word's carry
// becomes the new low bit.
101  word128 y = (x >> 127); x = (x << 1) + carry;
102  PutWord<word128>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
103  carry = y;
104  }
105 #elif defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
106  word64 carry = 0, x;
107  for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
108  {
109  x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
// Same carry chain as above, 64 bits at a time.
110  word64 y = (x >> 63); x = (x << 1) + carry;
111  PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
112  carry = y;
113  }
114 #else
115  word32 carry = 0, x;
116  for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
117  {
118  x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
// Same carry chain as above, 32 bits at a time.
119  word32 y = (x >> 31); x = (x << 1) + carry;
120  PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
121  carry = y;
122  }
123 #endif
124 
// From here on, carry is the bit that was shifted out of the last
// (highest-addressed) word.
125 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
126 
// NOTE(review): the listing jumps from line 126 to 128, so one statement
// that belongs here is not visible in this view - confirm against the file.
128  CRYPTOPP_ASSERT(len >= 16);
129  CRYPTOPP_ASSERT(len <= 128);
130 
131  byte* k = out;
132  if (carry)
133  {
// Each case XORs its constant into the lowest-addressed bytes of out.
// LEIDX is the index of the last byte, so LEIDX-(len-1) is byte 0.
134  switch (len)
135  {
136  case 16:
137  {
// 0x87 sets bits 7, 2, 1 and 0 of byte 0 (presumably the reduction for
// x^128 + x^7 + x^2 + x + 1 - the code does not name the polynomial).
138  const size_t LEIDX = 16-1;
139  k[LEIDX-15] ^= 0x87;
140  break;
141  }
142  case 32:
143  {
144  // https://crypto.stackexchange.com/q/9815/10496
145  // Polynomial x^256 + x^10 + x^5 + x^2 + 1
// x^10 is bit 2 of byte 1 (value 4); x^5 + x^2 + 1 is 0x25 in byte 0.
146  const size_t LEIDX = 32-1;
147  k[LEIDX-30] ^= 4;
148  k[LEIDX-31] ^= 0x25;
149  break;
150  }
151  case 64:
152  {
153  // https://crypto.stackexchange.com/q/9815/10496
154  // Polynomial x^512 + x^8 + x^5 + x^2 + 1
// x^8 is bit 0 of byte 1 (value 1); x^5 + x^2 + 1 is 0x25 in byte 0.
155  const size_t LEIDX = 64-1;
156  k[LEIDX-62] ^= 1;
157  k[LEIDX-63] ^= 0x25;
158  break;
159  }
160  case 128:
161  {
162  // https://crypto.stackexchange.com/q/9815/10496
163  // Polynomial x^1024 + x^19 + x^6 + x + 1
// x^19 is bit 3 of byte 2 (value 8); x^6 + x + 1 is 0x43 in byte 0.
// The XOR with 0x00 on byte 1 leaves that byte unchanged.
164  const size_t LEIDX = 128-1;
165  k[LEIDX-125] ^= 8;
166  k[LEIDX-126] ^= 0x00;
167  k[LEIDX-127] ^= 0x43;
168  break;
169  }
170  default:
// Any other length only trips the debug assertion; out keeps the plain
// shifted value with no constant folded in.
171  CRYPTOPP_ASSERT(0);
172  }
173  }
174 #else
175  CRYPTOPP_ASSERT(len == 16);
176 
177  byte* k = out;
178  if (carry)
179  {
// Same constant as the 16-byte case of the wide-block branch.
180  k[0] ^= 0x87;
// This return is the last statement on this path, so it does not change
// control flow.
181  return;
182  }
183 #endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
184 }
185 
186 inline void GF_Double(byte *inout, unsigned int len)
187 {
188  GF_Double(inout, inout, len);
189 }
190 
191 ANONYMOUS_NAMESPACE_END
192 
193 NAMESPACE_BEGIN(CryptoPP)
194 
195 void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
196 {
197 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
198  CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
199  if (length < 16 || length > 128 || !IsPowerOf2(length))
200  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
201 #else
202  CRYPTOPP_ASSERT(length == 16);
203  if (length != 16)
204  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
205 #endif
206 }
207 
209 {
210  CRYPTOPP_ASSERT(length % 2 == 0);
211  if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
212  throw InvalidKeyLength(AlgorithmName(), length);
213 }
214 
215 void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
216 {
217  ThrowIfInvalidKeyLength(length);
218  ThrowIfInvalidBlockSize(BlockSize());
219 
220  const size_t klen = length/2;
221  AccessBlockCipher().SetKey(key+0, klen, params);
222  AccessTweakCipher().SetKey(key+klen, klen, params);
223 
224  ResizeBuffers();
225 
226  size_t ivLength;
227  const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
228  Resynchronize(iv, (int)ivLength);
229 }
230 
// Re-seeds the tweak register: copies ivLength bytes from m_register into
// m_xregister, then passes m_xregister through the tweak cipher.
//   iv       - initialization vector
//   ivLength - number of bytes copied into m_xregister
// NOTE(review): iv is not referenced by the statements visible here, and the
// listing jumps from line 232 to 234, so one statement is missing from this
// view. Presumably it loads iv into m_register - confirm against the file.
231 void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
232 {
// NOTE(review): ivLength reaches memcpy unchanged. A negative value (the
// documented default for this parameter is -1) would convert to a very
// large size_t. SetKey passes an explicit length - TODO confirm that every
// other caller does too, or that the length is resolved before this copy.
234  std::memcpy(m_xregister, m_register, ivLength);
// The copied value is transformed by the tweak cipher; the result is what
// ProcessData later uses as the first tweak.
235  GetTweakCipher().ProcessBlock(m_xregister);
236 }
237 
239 {
240  SecByteBlock iv(GetTweakCipher().BlockSize());
241  PutWord<word64>(false, order, iv, sector);
242  std::memset(iv+8, 0x00, iv.size()-8);
243 
245  std::memcpy(m_xregister, iv, iv.size());
246  GetTweakCipher().ProcessBlock(m_xregister);
247 }
248 
249 void XTS_ModeBase::ResizeBuffers()
250 {
251  BlockOrientedCipherModeBase::ResizeBuffers();
252  m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
253  m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
254 }
255 
256 // ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
257 // selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
258 // can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
259 // of registers. The unneeded code paths should be removed by optimizer.
260 // The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
261 void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
262 {
263  // data unit is multiple of 16 bytes
264  CRYPTOPP_ASSERT(length % BlockSize() == 0);
265 
266  enum { lastParallelBlock = ParallelBlocks-1 };
267  const unsigned int blockSize = GetBlockCipher().BlockSize();
268  const size_t parallelSize = blockSize*ParallelBlocks;
269 
270  // encrypt the data unit, optimal size at a time
271  while (length >= parallelSize)
272  {
273  // m_xregister[0] always points to the next tweak.
274  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
275  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
276  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
277 
278  if (ParallelBlocks > 4)
279  {
280  GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
281  GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
282  GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
283  GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
284  }
285  if (ParallelBlocks > 8)
286  {
287  GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
288  GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
289  GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
290  GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
291  }
292 
293  // merge the tweak into the input block
294  XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
295 
296  // encrypt one block, merge the tweak into the output block
297  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
298  outString, parallelSize, BlockTransformation::BT_AllowParallel);
299 
300  // m_xregister[0] always points to the next tweak.
301  GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
302 
303  inString += parallelSize;
304  outString += parallelSize;
305  length -= parallelSize;
306  }
307 
308  // encrypt the data unit, 4 blocks at a time
309  while (ParallelBlocks == 12 && length >= blockSize*4)
310  {
311  // m_xregister[0] always points to the next tweak.
312  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
313  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
314  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
315 
316  // merge the tweak into the input block
317  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
318 
319  // encrypt one block, merge the tweak into the output block
320  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
321  outString, blockSize*4, BlockTransformation::BT_AllowParallel);
322 
323  // m_xregister[0] always points to the next tweak.
324  GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
325 
326  inString += blockSize*4;
327  outString += blockSize*4;
328  length -= blockSize*4;
329  }
330 
331  // encrypt the data unit, 2 blocks at a time
332  while (ParallelBlocks == 8 && length >= blockSize*2)
333  {
334  // m_xregister[0] always points to the next tweak.
335  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
336 
337  // merge the tweak into the input block
338  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
339 
340  // encrypt one block, merge the tweak into the output block
341  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
342  outString, blockSize*2, BlockTransformation::BT_AllowParallel);
343 
344  // m_xregister[0] always points to the next tweak.
345  GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
346 
347  inString += blockSize*2;
348  outString += blockSize*2;
349  length -= blockSize*2;
350  }
351 
352  // encrypt the data unit, blocksize at a time
353  while (length)
354  {
355  // merge the tweak into the input block
356  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
357 
358  // encrypt one block
359  GetBlockCipher().ProcessBlock(m_xworkspace);
360 
361  // merge the tweak into the output block
362  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
363 
364  // Multiply T by alpha
365  GF_Double(m_xregister, blockSize);
366 
367  inString += blockSize;
368  outString += blockSize;
369  length -= blockSize;
370  }
371 }
372 
373 size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
374 {
375  // need at least a full AES block
376  CRYPTOPP_ASSERT(inLength >= BlockSize());
377 
378  if (inLength < BlockSize())
379  throw InvalidArgument("XTS: message is too short for ciphertext stealing");
380 
381  if (IsForwardTransformation())
382  return ProcessLastPlainBlock(outString, outLength, inString, inLength);
383  else
384  return ProcessLastCipherBlock(outString, outLength, inString, inLength);
385 }
386 
387 size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
388 {
389  // ensure output buffer is large enough
390  CRYPTOPP_ASSERT(outLength >= inLength);
391 
392  const unsigned int blockSize = GetBlockCipher().BlockSize();
393  const size_t blocks = inLength / blockSize;
394  const size_t tail = inLength % blockSize;
395  outLength = inLength;
396 
397  if (tail == 0)
398  {
399  // Allow ProcessData to handle all the full blocks
400  ProcessData(outString, inString, inLength);
401  return inLength;
402  }
403  else if (blocks > 1)
404  {
405  // Allow ProcessData to handle full blocks except one
406  const size_t head = (blocks-1)*blockSize;
407  ProcessData(outString, inString, inLength-head);
408 
409  outString += head;
410  inString += head; inLength -= head;
411  }
412 
413  ///// handle the full block /////
414 
415  // merge the tweak into the input block
416  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
417 
418  // encrypt one block
419  GetBlockCipher().ProcessBlock(m_xworkspace);
420 
421  // merge the tweak into the output block
422  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
423 
424  // Multiply T by alpha
425  GF_Double(m_xregister, blockSize);
426 
427  ///// handle final partial block /////
428 
429  inString += blockSize;
430  outString += blockSize;
431  const size_t len = inLength-blockSize;
432 
433  // copy in the final plaintext bytes
434  std::memcpy(m_xworkspace, inString, len);
435  // and copy out the final ciphertext bytes
436  std::memcpy(outString, outString-blockSize, len);
437  // "steal" ciphertext to complete the block
438  std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
439 
440  // merge the tweak into the input block
441  XorBuffer(m_xworkspace, m_xregister, blockSize);
442 
443  // encrypt one block
444  GetBlockCipher().ProcessBlock(m_xworkspace);
445 
446  // merge the tweak into the previous output block
447  XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
448 
449  return outLength;
450 }
451 
452 size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
453 {
454  // ensure output buffer is large enough
455  CRYPTOPP_ASSERT(outLength >= inLength);
456 
457  const unsigned int blockSize = GetBlockCipher().BlockSize();
458  const size_t blocks = inLength / blockSize;
459  const size_t tail = inLength % blockSize;
460  outLength = inLength;
461 
462  if (tail == 0)
463  {
464  // Allow ProcessData to handle all the full blocks
465  ProcessData(outString, inString, inLength);
466  return inLength;
467  }
468  else if (blocks > 1)
469  {
470  // Allow ProcessData to handle full blocks except one
471  const size_t head = (blocks-1)*blockSize;
472  ProcessData(outString, inString, inLength-head);
473 
474  outString += head;
475  inString += head; inLength -= head;
476  }
477 
478  #define poly1 (m_xregister+0*blockSize)
479  #define poly2 (m_xregister+1*blockSize)
480  GF_Double(poly2, poly1, blockSize);
481 
482  ///// handle final partial block /////
483 
484  inString += blockSize;
485  outString += blockSize;
486  const size_t len = inLength-blockSize;
487 
488  // merge the tweak into the input block
489  XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
490 
491  // encrypt one block
492  GetBlockCipher().ProcessBlock(m_xworkspace);
493 
494  // merge the tweak into the output block
495  XorBuffer(m_xworkspace, poly2, blockSize);
496 
497  // copy in the final plaintext bytes
498  std::memcpy(outString-blockSize, inString, len);
499  // and copy out the final ciphertext bytes
500  std::memcpy(outString, m_xworkspace, len);
501  // "steal" ciphertext to complete the block
502  std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
503 
504  ///// handle the full previous block /////
505 
506  inString -= blockSize;
507  outString -= blockSize;
508 
509  // merge the tweak into the input block
510  XorBuffer(m_xworkspace, outString, poly1, blockSize);
511 
512  // encrypt one block
513  GetBlockCipher().ProcessBlock(m_xworkspace);
514 
515  // merge the tweak into the output block
516  XorBuffer(outString, m_xworkspace, poly1, blockSize);
517 
518  return outLength;
519 }
520 
521 NAMESPACE_END
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:609
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:614
Class file for the AES cipher (Rijndael)
void Resynchronize(const byte *iv, int length=-1)
Resynchronize with an IV.
Definition: modes.h:260
@ BT_AllowParallel
Allow parallel transformations.
Definition: cryptlib.h:930
An invalid argument was detected.
Definition: cryptlib.h:208
Exception thrown when an invalid key length is encountered.
Definition: simple.h:56
Interface for retrieving values given their names.
Definition: cryptlib.h:327
size_type size() const
Provides the count of elements in the SecBlock.
Definition: secblock.h:867
SecBlock<byte> typedef.
Definition: secblock.h:1226
XTS block cipher mode of operation default implementation.
Definition: xts.h:50
void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
Definition: xts.cpp:215
void ProcessData(byte *outString, const byte *inString, size_t length)
Encrypt or decrypt an array of bytes.
Definition: xts.cpp:261
void Resynchronize(const byte *iv, int ivLength=-1)
Resynchronize with an IV.
Definition: xts.cpp:231
size_t ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
Encrypt or decrypt the last block of data.
Definition: xts.cpp:373
void ThrowIfInvalidKeyLength(size_t length)
Validates the key length.
Definition: xts.cpp:208
__uint128_t word128
128-bit unsigned datatype
Definition: config_int.h:119
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:72
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:101
Functions for CPU features and intrinsics.
ByteOrder
Provides the byte ordering.
Definition: cryptlib.h:148
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:150
Utility functions for the Crypto++ library.
bool IsPowerOf2(const T &value)
Tests whether a value is a power of 2.
Definition: misc.h:1215
CRYPTOPP_DLL void xorbuf(byte *buf, const byte *mask, size_t count)
Performs an XOR of a buffer with a mask.
Classes for block cipher modes of operation.
Crypto++ library namespace.
const char * BlockSize()
int, in bytes
Definition: argnames.h:27
Precompiled header file.
Support functions for PowerPC and vector operations.
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1414
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:895
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:369
Classes for the Threefish block cipher.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68
Classes for XTS block cipher mode of operation.