#include #include #include #include #include #include "cprng64.h" static void *(*volatile explicit_memset_impl)(void *, int, size_t) = &memset; static void * explicit_memset(void *b, int c, size_t n) { return (*explicit_memset_impl)(b, c, n); } /* ChaCha core */ #define crypto_core_OUTPUTWORDS 16 #define crypto_core_INPUTWORDS 4 #define crypto_core_KEYWORDS 8 #define crypto_core_CONSTWORDS 4 #define crypto_core_ROUNDS 20 typedef uint32_t crypto_word_t; static uint32_t rotate(uint32_t u, unsigned c) { return (u << c) | (u >> (32 - c)); } #define QUARTERROUND(a, b, c, d) do { \ (a) += (b); (d) ^= (a); (d) = rotate((d), 16); \ (c) += (d); (b) ^= (c); (b) = rotate((b), 12); \ (a) += (b); (d) ^= (a); (d) = rotate((d), 8); \ (c) += (d); (b) ^= (c); (b) = rotate((b), 7); \ } while (0) static void crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k, const uint32_t *c) { uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; uint32_t j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15; int i; j0 = x0 = c[0]; j1 = x1 = c[1]; j2 = x2 = c[2]; j3 = x3 = c[3]; j4 = x4 = k[0]; j5 = x5 = k[1]; j6 = x6 = k[2]; j7 = x7 = k[3]; j8 = x8 = k[4]; j9 = x9 = k[5]; j10 = x10 = k[6]; j11 = x11 = k[7]; j12 = x12 = in[0]; j13 = x13 = in[1]; j14 = x14 = in[2]; j15 = x15 = in[3]; for (i = crypto_core_ROUNDS; i > 0; i -= 2) { QUARTERROUND( x0, x4, x8,x12); QUARTERROUND( x1, x5, x9,x13); QUARTERROUND( x2, x6,x10,x14); QUARTERROUND( x3, x7,x11,x15); QUARTERROUND( x0, x5,x10,x15); QUARTERROUND( x1, x6,x11,x12); QUARTERROUND( x2, x7, x8,x13); QUARTERROUND( x3, x4, x9,x14); } out[0] = x0 + j0; out[1] = x1 + j1; out[2] = x2 + j2; out[3] = x3 + j3; out[4] = x4 + j4; out[5] = x5 + j5; out[6] = x6 + j6; out[7] = x7 + j7; out[8] = x8 + j8; out[9] = x9 + j9; out[10] = x10 + j10; out[11] = x11 + j11; out[12] = x12 + j12; out[13] = x13 + j13; out[14] = x14 + j14; out[15] = x15 + j15; } __CTASSERT(sizeof ((const struct cprng *)0)->state == sizeof(crypto_word_t) * (crypto_core_KEYWORDS + crypto_core_INPUTWORDS + crypto_core_OUTPUTWORDS)); #define cprng_key(c) (&(c)->state[0]) #define cprng_nonce(c) (&(c)->state[crypto_core_KEYWORDS]) #define cprng_buffer(c) (&(c)->state[crypto_core_KEYWORDS + \ crypto_core_INPUTWORDS]) /* `expand 32-byte k' */ static const uint32_t sigma[4] = { 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U, }; static void nonce_inc(uint32_t n[crypto_core_INPUTWORDS]) { uint64_t t = 1; unsigned i; for (i = 0; i < crypto_core_INPUTWORDS; i++) { t += n[i]; n[i] = t; t >>= 32; } /* * If the nonce overflows, you counted sequentially to 2^128. * If you count once per femptosecond, it will take you about * ten quadrillion years to manage this. Don't worry about it. */ } void cprng_seed(struct cprng *cprng, const void *seed) { __CTASSERT(CPRNG_SEED_BYTES == crypto_core_KEYWORDS * sizeof(crypto_word_t)); (void)memset(cprng->state, 0, sizeof cprng->state); (void)memcpy(cprng_key(cprng), seed, CPRNG_SEED_BYTES); cprng->buffered = 0; } #define CPRNG_SHORT 64 static void cprng_short(struct cprng *cprng, void *buf, size_t len) { const size_t nwords = howmany(len, sizeof(crypto_word_t)); _DIAGASSERT(n <= CPRNG_SHORT); __CTASSERT(CPRNG_SHORT <= crypto_core_OUTPUTWORDS * sizeof(crypto_word_t)); if (__predict_false(cprng->buffered < nwords)) { crypto_core(cprng_buffer(cprng), cprng_nonce(cprng), cprng_key(cprng), sigma); nonce_inc(cprng_nonce(cprng)); cprng->buffered = crypto_core_OUTPUTWORDS; } (void)memcpy(buf, &cprng_buffer(cprng)[crypto_core_OUTPUTWORDS - cprng->buffered], nwords); cprng->buffered -= nwords; } uint32_t cprng32(struct cprng *cprng) { uint32_t r; cprng_short(cprng, &r, sizeof r); return r; } uint64_t cprng64(struct cprng *cprng) { uint64_t r; cprng_short(cprng, &r, sizeof r); return r; } void cprng1(const void *seed, void *buf, size_t len) { uint8_t *p8; crypto_word_t *p; size_t ni, nb, nf; crypto_word_t key[crypto_core_KEYWORDS]; crypto_word_t nonce[crypto_core_INPUTWORDS] = {0}; crypto_word_t block[crypto_core_OUTPUTWORDS]; /* * Guarantee we can generate up to len bytes. We have * * 2^(CHAR_BIT*sizeof(crypto_word_t)*INPUTWORDS) * * possible inputs yielding output of * * 4*OUTPUTWORDS*2^(CHAR_BIT*sizeof(crypto_word_t)*INPUTWORDS) * * bytes. It suffices to require that sizeof len > * (1/CHAR_BIT) log_2 len be less than (1/CHAR_BIT) log_2 of * the total output stream length. We have * * log_2 (4 o 2^(w i)) = log_2 (4 o) + log_2 2^(w i) * = 2 + log_2 o + w i. */ __CTASSERT(CHAR_BIT*sizeof len <= (2 + ilog2(crypto_core_OUTPUTWORDS) + CHAR_BIT*sizeof(crypto_word_t)*crypto_core_INPUTWORDS)); __CTASSERT(CPRNG_SEED_BYTES == sizeof key); (void)memcpy(key, seed, sizeof key); p8 = buf; p = (crypto_word_t *)roundup2((uintptr_t)p8, sizeof(crypto_word_t)); ni = (uint8_t *)p - p8; nb = (len - ni) / sizeof block; nf = (len - ni) % sizeof block; _DIAGASSERT(((uintptr_t)p & (sizeof(crypto_word_t) - 1)) == 0); _DIAGASSERT(n == (ni + (nb * sizeof block) + nf)); _DIAGASSERT(ni < sizeof(uint32_t)); _DIAGASSERT(nf < sizeof(crypto_word_t)*crypto_core_OUTPUTWORDS); if (__predict_false(ni)) { crypto_core(block, nonce, key, sigma); nonce_inc(nonce); (void)memcpy(p8, block, ni); } while (nb--) { crypto_core(p, nonce, key, sigma); nonce_inc(nonce); p += crypto_core_OUTPUTWORDS; } if (__predict_false(nf)) { crypto_core(block, nonce, key, sigma); nonce_inc(nonce); (void)memcpy(p, block, nf); } if (__predict_false(ni | nf)) (void)explicit_memset(block, 0, sizeof block); (void)explicit_memset(key, 0, sizeof key); } void cprng_buf(struct cprng *cprng, void *buf, size_t len) { if (len <= CPRNG_SHORT) { cprng_short(cprng, buf, len); } else { uint8_t seed[CPRNG_SEED_BYTES]; cprng_short(cprng, seed, sizeof seed); cprng1(seed, buf, len); } }