1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-09 17:38:00 +00:00
putty-source/sshsh256.c
Simon Tatham 853bd8b284 Turn SSH-2 MACs into a classoid.
This piece of tidying-up has come out particularly well in terms of
saving tedious repetition and boilerplate. I've managed to remove
three pointless methods from every MAC implementation by means of
writing them once centrally in terms of the implementation-specific
methods; another method (hmacmd5_sink) vanished because I was able to
make the interface type 'ssh2_mac' be directly usable as a BinarySink
by way of a new delegation system; and because all the method
implementations can now find their own vtable, I was even able to
merge a lot of keying and output functions that had previously
differed only in length parameters by having them look up the lengths
in whatever vtable they were passed.
2018-09-19 23:08:07 +01:00

644 lines
21 KiB
C

/*
* SHA-256 algorithm as described at
*
* http://csrc.nist.gov/cryptval/shs.html
*/
#include "ssh.h"
#include <assert.h>
#include <limits.h>
/* ----------------------------------------------------------------------
* Core SHA256 algorithm: processes 16-word blocks into a message digest.
*/
/*
 * Bit-level helper functions of SHA-256, operating on 32-bit words.
 *
 * ror(x,y)  rotates x right by y bits (0 < y < 32);
 * shr(x,y)  shifts x right by y bits;
 * Ch/Maj    are the bitwise "choose" and "majority" functions;
 * bigsigma0/1 and smallsigma0/1 are the rotate/shift combinations
 * used by the compression rounds and the message schedule
 * respectively.
 *
 * Every macro argument is fully parenthesised, and ror() casts x to
 * uint32 on both halves, so that an expression such as ror(v, a+b)
 * expands to the intended shift counts.
 */
#define ror(x,y) ( (((uint32)(x)) << (32-(y))) | (((uint32)(x)) >> (y)) )
#define shr(x,y) ( (((uint32)(x)) >> (y)) )
#define Ch(x,y,z) ( ((x) & (y)) ^ (~(x) & (z)) )
#define Maj(x,y,z) ( ((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)) )
#define bigsigma0(x) ( ror((x),2) ^ ror((x),13) ^ ror((x),22) )
#define bigsigma1(x) ( ror((x),6) ^ ror((x),11) ^ ror((x),25) )
#define smallsigma0(x) ( ror((x),7) ^ ror((x),18) ^ shr((x),3) )
#define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) )
/*
 * The two interchangeable block-feeding back ends. Both take the
 * hash state, a pointer to new input bytes and the number of bytes.
 * SHA256_Init stores a pointer to one of them in s->sha256, choosing
 * SHA256_ni when supports_sha_ni() returns true and SHA256_sw
 * otherwise. Both are defined further down this file.
 */
static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len);
static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len);
/*
 * Reset the eight chaining words s->h[0..7] to the fixed SHA-256
 * starting values. Nothing else in the state is touched.
 */
void SHA256_Core_Init(SHA256_State *s)
{
    static const uint32 initial_hash[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    int word;

    for (word = 0; word < 8; word++)
        s->h[word] = initial_hash[word];
}
/*
 * Core compression step: fold one 16-word message block into the
 * running hash in s->h[0..7].
 *
 *   s      hash state; only s->h is read and updated here.
 *   block  16 message words, already assembled in host byte order
 *          from the big-endian input bytes by the caller.
 */
void SHA256_Block(SHA256_State *s, uint32 *block)
{
    /*
     * Round constants. These are kept as uint32 rather than int:
     * many of them are above INT_MAX, so an int table would rely on
     * implementation-defined out-of-range conversion.
     */
    static const uint32 k[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    uint32 w[64];                      /* message schedule, one word per round */
    uint32 a, b, c, d, e, f, g, h;
    int t;

    /* Message schedule: the first 16 words are the block itself... */
    for (t = 0; t < 16; t++)
        w[t] = block[t];
    /* ...and the remaining 48 are derived from earlier entries. */
    for (t = 16; t < 64; t++)
        w[t] = smallsigma1(w[t-2]) + w[t-7] + smallsigma0(w[t-15]) + w[t-16];

    a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
    e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];

    /*
     * 64 rounds, unrolled eight at a time. Instead of shuffling the
     * eight working variables after every round, each successive
     * ROUND invocation rotates the argument list by one position;
     * after eight rounds the names line up again.
     */
    for (t = 0; t < 64; t += 8) {
        uint32 t1, t2;

#define ROUND(j,a,b,c,d,e,f,g,h) do { \
            t1 = h + bigsigma1(e) + Ch(e,f,g) + k[j] + w[j]; \
            t2 = bigsigma0(a) + Maj(a,b,c); \
            d = d + t1; h = t1 + t2; \
        } while (0)

        ROUND(t+0, a,b,c,d,e,f,g,h);
        ROUND(t+1, h,a,b,c,d,e,f,g);
        ROUND(t+2, g,h,a,b,c,d,e,f);
        ROUND(t+3, f,g,h,a,b,c,d,e);
        ROUND(t+4, e,f,g,h,a,b,c,d);
        ROUND(t+5, d,e,f,g,h,a,b,c);
        ROUND(t+6, c,d,e,f,g,h,a,b);
        ROUND(t+7, b,c,d,e,f,g,h,a);

#undef ROUND
    }

    /* Feed-forward: add the round output into the chaining value. */
    s->h[0] += a; s->h[1] += b; s->h[2] += c; s->h[3] += d;
    s->h[4] += e; s->h[5] += f; s->h[6] += g; s->h[7] += h;
}
/* ----------------------------------------------------------------------
* Outer SHA256 algorithm: take an arbitrary length byte string,
* convert it into 16-word blocks with the prescribed padding at
* the end, and pass those blocks to the core SHA256 algorithm.
*/
/* Size in bytes of one SHA-256 input block (16 words of 4 bytes each). */
#define BLKSIZE 64
/*
 * BinarySink write callback for a SHA256_State. Declared here so that
 * SHA256_Init can install it; defined just below.
 */
static void SHA256_BinarySink_write(BinarySink *bs,
const void *p, size_t len);
/*
 * Prepare a SHA256_State for hashing a new message: reset the
 * chaining words, empty the partial-block buffer, zero the 64-bit
 * byte counter, pick the block-processing back end and set the state
 * up as a BinarySink so that put_data() and friends feed the hash.
 */
void SHA256_Init(SHA256_State *s)
{
    SHA256_Core_Init(s);

    s->lenlo = 0;
    s->lenhi = 0;
    s->blkused = 0;

    /* Use the SHA-NI implementation when the CPU offers it. */
    s->sha256 = supports_sha_ni() ? &SHA256_ni : &SHA256_sw;

    BinarySink_INIT(s, SHA256_BinarySink_write);
}
/*
 * BinarySink write method for SHA256_State: account for 'len' more
 * bytes of message and hand them to whichever back end SHA256_Init
 * selected.
 *
 *   bs   the BinarySink embedded in a SHA256_State.
 *   p    the bytes to absorb (never modified).
 *   len  number of bytes at p. Must fit both in a uint32 (for the
 *        length counter arithmetic) and in an int (the back ends take
 *        an int length); both conditions are asserted.
 */
static void SHA256_BinarySink_write(BinarySink *bs,
                                    const void *p, size_t len)
{
    struct SHA256_State *s = BinarySink_DOWNCAST(bs, struct SHA256_State);
    const unsigned char *q = (const unsigned char *)p;
    uint32 lenw = len;

    assert(len == lenw);
    /*
     * The back ends take an int. Without this check, a length above
     * INT_MAX would turn negative and be used as a memcpy size.
     */
    assert(len <= (size_t)INT_MAX);

    /*
     * Update the length field: lenhi:lenlo is a 64-bit byte count,
     * with the carry out of lenlo detected by wraparound.
     */
    s->lenlo += lenw;
    s->lenhi += (s->lenlo < lenw);

    (*(s->sha256))(s, q, (int)len);
}
/*
 * Portable C back end: absorb 'len' bytes from q into the state.
 *
 * Input accumulates in s->block (s->blkused bytes currently held).
 * Each time BLKSIZE bytes are available they are converted to 16
 * big-endian words and passed to SHA256_Block. Any remainder stays
 * in the buffer for next time.
 */
static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len)
{
    uint32 words[16];
    int w, off;

    /*
     * Easy case: the buffer is partly full and the new data does not
     * complete a block, so just append and leave.
     */
    if (s->blkused && s->blkused + len < BLKSIZE) {
        memcpy(s->block + s->blkused, q, len);
        s->blkused += len;
        return;
    }

    /* Otherwise consume complete blocks for as long as we can. */
    while (s->blkused + len >= BLKSIZE) {
        /* Top the buffer up to exactly one block. */
        memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
        q += BLKSIZE - s->blkused;
        len -= BLKSIZE - s->blkused;

        /* Assemble the buffered bytes into big-endian 32-bit words. */
        for (w = 0, off = 0; w < 16; w++, off += 4) {
            words[w] = (((uint32)s->block[off + 0]) << 24) |
                       (((uint32)s->block[off + 1]) << 16) |
                       (((uint32)s->block[off + 2]) <<  8) |
                       (((uint32)s->block[off + 3]) <<  0);
        }

        SHA256_Block(s, words);
        s->blkused = 0;
    }

    /* Stash whatever is left (fewer than BLKSIZE bytes). */
    memcpy(s->block, q, len);
    s->blkused = len;
}
/*
 * Finish the hash: append the padding (a 0x80 byte, then zeros up to
 * 56 bytes mod 64) followed by the 64-bit message length in bits, and
 * write the resulting 32-byte digest to 'digest', big-endian word by
 * word. The state has absorbed the padding afterwards and should not
 * be fed further message data.
 */
void SHA256_Final(SHA256_State *s, unsigned char *digest)
{
    unsigned char padding[64];
    uint32 bits_hi, bits_lo;
    int padlen, word;

    /*
     * Pad to 56 bytes within the current block if there is room for
     * at least the 0x80 byte, otherwise run into the next block.
     */
    padlen = (s->blkused >= 56 ? 56 + 64 : 56) - s->blkused;

    /*
     * Convert the byte count to a bit count. This has to be captured
     * before the padding is written, because writing it goes through
     * the same sink and so bumps the counters.
     */
    bits_hi = (s->lenhi << 3) | (s->lenlo >> (32-3));
    bits_lo = (s->lenlo << 3);

    memset(padding, 0, padlen);
    padding[0] = 0x80;
    put_data(s, padding, padlen);

    put_uint32(s, bits_hi);
    put_uint32(s, bits_lo);

    for (word = 0; word < 8; word++) {
        unsigned char *out = digest + 4 * word;

        out[0] = (s->h[word] >> 24) & 0xFF;
        out[1] = (s->h[word] >> 16) & 0xFF;
        out[2] = (s->h[word] >>  8) & 0xFF;
        out[3] = (s->h[word] >>  0) & 0xFF;
    }
}
/*
 * One-shot convenience wrapper: hash the 'len' bytes at 'p' and store
 * the 32-byte SHA-256 digest in 'output'. The temporary hash state is
 * wiped before returning.
 */
void SHA256_Simple(const void *p, int len, unsigned char *output)
{
    SHA256_State state;

    SHA256_Init(&state);
    put_data(&state, p, len);
    SHA256_Final(&state, output);

    smemclr(&state, sizeof(state));
}
/*
* Thin abstraction for things where hashes are pluggable.
*/
/*
 * ssh_hash 'init' method: allocate a fresh SHA256_State, initialise it
 * and return it as an opaque handle.
 */
static void *sha256_init(void)
{
    SHA256_State *state = snew(SHA256_State);

    SHA256_Init(state);
    return state;
}
static void *sha256_copy(const void *vold)
{
const SHA256_State *old = (const SHA256_State *)vold;
SHA256_State *s;
s = snew(SHA256_State);
*s = *old;
BinarySink_COPIED(s);
return s;
}
static void sha256_free(void *handle)
{
SHA256_State *s = handle;
smemclr(s, sizeof(*s));
sfree(s);
}
static BinarySink *sha256_sink(void *handle)
{
SHA256_State *s = handle;
return BinarySink_UPCAST(s);
}
static void sha256_final(void *handle, unsigned char *output)
{
SHA256_State *s = handle;
SHA256_Final(s, output);
sha256_free(s);
}
/*
 * Method table publishing SHA-256 through the generic ssh_hash
 * interface: the five functions defined above, followed by 32 (the
 * number of digest bytes SHA256_Final produces) and a display name.
 * NOTE(review): the initialiser is positional; the field order must
 * match struct ssh_hash as declared in ssh.h - confirm there.
 */
const struct ssh_hash ssh_sha256 = {
sha256_init, sha256_copy, sha256_sink, sha256_final, sha256_free,
32, "SHA-256"
};
/* ----------------------------------------------------------------------
* The above is the SHA-256 algorithm itself. Now we implement the
* HMAC wrapper on it.
*/
/*
 * Per-instance state for HMAC-SHA-256.
 *
 * sha[0]  hash state that has absorbed the key XORed with 0x36 bytes
 *         (set up by sha256_key_internal).
 * sha[1]  hash state that has absorbed the key XORed with 0x5C bytes
 *         (likewise).
 * sha[2]  working state for the message currently being MACed; it is
 *         a copy of sha[0] made by hmacsha256_start, and is the sink
 *         that data written to 'mac' is delegated to.
 * mac     the generic ssh2_mac object handed out to callers; the
 *         methods below recover this structure from it via FROMFIELD.
 */
struct hmacsha256 {
SHA256_State sha[3];
ssh2_mac mac;
};
/*
 * ssh2_mac 'new' method: allocate an HMAC-SHA-256 context bound to the
 * vtable 'alg' and arrange for data written to the returned ssh2_mac
 * to be delegated to the working hash state sha[2].
 *
 * 'cipher' is not used by this MAC. The three SHA256_State slots are
 * not initialised here; hmacsha256_key and hmacsha256_start fill them
 * in.
 */
static ssh2_mac *hmacsha256_new(
    const struct ssh2_macalg *alg, ssh2_cipher *cipher)
{
    struct hmacsha256 *hm = snew(struct hmacsha256);

    hm->mac.vt = alg;
    BinarySink_DELEGATE_INIT(&hm->mac, &hm->sha[2]);

    return &hm->mac;
}
/*
 * ssh2_mac 'free' method: wipe the whole context (which holds keyed
 * hash states) and release it.
 */
static void hmacsha256_free(ssh2_mac *mac)
{
    struct hmacsha256 *hm = FROMFIELD(mac, struct hmacsha256, mac);

    smemclr(hm, sizeof(struct hmacsha256));
    sfree(hm);
}
/*
 * Precompute the two keyed hash states used by HMAC:
 *   ctx->sha[0] = SHA-256 state after absorbing (key XOR 0x36...),
 *   ctx->sha[1] = SHA-256 state after absorbing (key XOR 0x5C...).
 *
 * At most the first 64 bytes of 'key' are used; shorter keys are
 * implicitly zero-extended to the 64-byte block size.
 */
static void sha256_key_internal(struct hmacsha256 *ctx,
                                const unsigned char *key, int len)
{
    static const unsigned char padbyte[2] = { 0x36, 0x5C };
    unsigned char keyblock[64];
    int which, i;

    for (which = 0; which < 2; which++) {
        /* Build the padded key block for this half of the HMAC... */
        memset(keyblock, padbyte[which], sizeof(keyblock));
        for (i = 0; i < len && i < 64; i++)
            keyblock[i] ^= key[i];

        /* ...and run it through a fresh hash state. */
        SHA256_Init(&ctx->sha[which]);
        put_data(&ctx->sha[which], keyblock, sizeof(keyblock));
    }

    /* Key material must not be left lying around on the stack. */
    smemclr(keyblock, sizeof(keyblock));
}
/*
 * ssh2_mac 'setkey' method: install 'key', whose length is taken from
 * the keylen field of the vtable this MAC was created with.
 */
static void hmacsha256_key(ssh2_mac *mac, const void *key)
{
    struct hmacsha256 *hm = FROMFIELD(mac, struct hmacsha256, mac);

    sha256_key_internal(hm, key, mac->vt->keylen);
}
/*
 * ssh2_mac 'start' method: begin MACing a new message by resetting
 * the working state sha[2] to a copy of the precomputed inner keyed
 * state sha[0].
 */
static void hmacsha256_start(ssh2_mac *mac)
{
    struct hmacsha256 *hm = FROMFIELD(mac, struct hmacsha256, mac);
    SHA256_State *working = &hm->sha[2];

    *working = hm->sha[0];             /* whole-structure copy */
    BinarySink_COPIED(working);
}
/*
 * ssh2_mac 'genresult' method: write the 32-byte HMAC of everything
 * written since hmacsha256_start to 'hmac'.
 *
 * All work is done on local copies of the hash states, so the
 * context's own sha[] entries are left unmodified by this call.
 * Those local copies and the inner digest are derived from the key,
 * so they are wiped before returning, as the other routines in this
 * file do for their temporaries.
 */
static void hmacsha256_genresult(ssh2_mac *mac, unsigned char *hmac)
{
    struct hmacsha256 *ctx = FROMFIELD(mac, struct hmacsha256, mac);
    SHA256_State s;
    unsigned char intermediate[32];

    /* Inner hash: finish H((key ^ ipad) || message). */
    s = ctx->sha[2];                   /* structure copy */
    BinarySink_COPIED(&s);
    SHA256_Final(&s, intermediate);

    /* Outer hash: H((key ^ opad) || inner digest). */
    s = ctx->sha[1];                   /* structure copy */
    BinarySink_COPIED(&s);
    put_data(&s, intermediate, 32);
    SHA256_Final(&s, hmac);

    /* Burn the key-dependent temporaries. */
    smemclr(intermediate, sizeof(intermediate));
    smemclr(&s, sizeof(s));
}
/*
 * Method table publishing HMAC-SHA-256 as an SSH-2 MAC: the five
 * methods defined above, the protocol identifiers for the plain and
 * encrypt-then-MAC variants, two size fields and a display name.
 * hmacsha256_key reads the key length from this table (vt->keylen).
 * NOTE(review): the initialiser is positional, so which of the two
 * 32s is the MAC length and which the key length depends on the field
 * order of struct ssh2_macalg in ssh.h - confirm there.
 */
const struct ssh2_macalg ssh_hmac_sha256 = {
hmacsha256_new, hmacsha256_free, hmacsha256_key,
hmacsha256_start, hmacsha256_genresult,
"hmac-sha2-256", "hmac-sha2-256-etm@openssh.com",
32, 32,
"HMAC-SHA-256"
};
#ifdef TEST
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
/*
 * Self-test, built only when TEST is defined: hash two fixed test
 * strings with SHA256_Simple and compare against the expected
 * digests. Mismatching bytes are reported on stderr and the total is
 * printed on stdout.
 *
 * Returns EXIT_SUCCESS when every digest byte matches and
 * EXIT_FAILURE otherwise, so that scripts and build systems can
 * detect a failing run from the exit status.
 */
int main(void) {
    static const struct {
        const char *teststring;
        unsigned char digest[32];
    } tests[] = {
        { "abc", {
            0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
            0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
            0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
            0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad,
        } },
        { "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", {
            0x24, 0x8d, 0x6a, 0x61, 0xd2, 0x06, 0x38, 0xb8,
            0xe5, 0xc0, 0x26, 0x93, 0x0c, 0x3e, 0x60, 0x39,
            0xa3, 0x3c, 0xe4, 0x59, 0x64, 0xff, 0x21, 0x67,
            0xf6, 0xec, 0xed, 0xd4, 0x19, 0xdb, 0x06, 0xc1,
        } },
    };
    const size_t ntests = sizeof(tests) / sizeof(*tests);
    unsigned char digest[32];
    size_t i;
    int j, errors;

    errors = 0;

    for (i = 0; i < ntests; i++) {
        SHA256_Simple(tests[i].teststring,
                      strlen(tests[i].teststring), digest);
        for (j = 0; j < 32; j++) {
            if (digest[j] != tests[i].digest[j]) {
                fprintf(stderr,
                        "\"%s\" digest byte %d should be 0x%02x, is 0x%02x\n",
                        tests[i].teststring, j, tests[i].digest[j], digest[j]);
                errors++;
            }
        }
    }

    printf("%d errors\n", errors);

    /* A failing self-test must not look like a success to the caller. */
    return errors ? EXIT_FAILURE : EXIT_SUCCESS;
}
#endif
#ifdef COMPILER_SUPPORTS_SHA_NI
#if defined _MSC_VER && defined _M_AMD64
# include <intrin.h>
#endif
/*
 * Set target architecture for Clang and GCC.
 *
 * Two mechanisms are used so that the SHA and SSE4.1 intrinsics below
 * can be compiled:
 *  - for GCC (but not Clang), file-wide target pragmas;
 *  - for Clang, and for GCC version 5 or later, a per-function target
 *    attribute, wrapped up as FUNC_ISA.
 * On any other compiler FUNC_ISA expands to nothing.
 */
#if !defined(__clang__) && defined(__GNUC__)
# pragma GCC target("sha")
# pragma GCC target("sse4.1")
#endif
#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#else
# define FUNC_ISA
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif
/* SHA256 implementation using new instructions
The code is based on Jeffrey Walton's SHA256 implementation:
https://github.com/noloader/SHA-Intrinsics
*/
/*
 * Hardware-accelerated back end: absorb 'len' bytes from q into the
 * state using the x86 SHA extension intrinsics.
 *
 * The buffering logic mirrors SHA256_sw: input accumulates in
 * s->block, and every time BLKSIZE bytes are available one block is
 * compressed; any remainder is left in the buffer. The difference is
 * that the hash words are loaded from s->h into two vector registers
 * once, kept there across all the blocks processed by this call, and
 * stored back to s->h at the end.
 */
FUNC_ISA
static void SHA256_ni_(SHA256_State * s, const unsigned char *q, int len) {
if (s->blkused && s->blkused+len < BLKSIZE) {
/*
* Trivial case: just add to the block.
*/
memcpy(s->block + s->blkused, q, len);
s->blkused += len;
} else {
__m128i STATE0, STATE1;
__m128i MSG, TMP;
__m128i MSG0, MSG1, MSG2, MSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
/* Byte-shuffle mask that reverses the bytes within each 32-bit lane,
 * turning the big-endian message words into host order. */
const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
/* Load initial values */
/* The lane comments below list 32-bit lanes from most to least
 * significant, with A..H standing for s->h[0]..s->h[7]. The words are
 * rearranged into the ABEF / CDGH layout that the code keeps in
 * STATE0 / STATE1 while processing blocks. */
TMP = _mm_loadu_si128((const __m128i*) &s->h[0]);
STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]);
TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */
/*
* We must complete and process at least one block.
*/
while (s->blkused + len >= BLKSIZE) {
memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
q += BLKSIZE - s->blkused;
len -= BLKSIZE - s->blkused;
/* Save current state */
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
/*
 * Each group below handles the four rounds named in its comment.
 * MSG holds four message-schedule words plus the matching four round
 * constants (the _mm_set_epi64x literals are the same values as the
 * k[] table in SHA256_Block, packed four at a time). The first
 * _mm_sha256rnds2_epu32 consumes the low two lanes of MSG; the
 * shuffle by 0x0E then moves the high two lanes down for the second
 * call. MSG0..MSG3 form a rolling window of the message schedule,
 * extended with _mm_sha256msg1_epu32 / _mm_sha256msg2_epu32.
 */
/* Rounds 0-3 */
MSG = _mm_loadu_si128((const __m128i*) (s->block + 0));
MSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 4-7 */
MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16));
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 8-11 */
MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32));
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 12-15 */
MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48));
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 16-19 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 20-23 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 24-27 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 28-31 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 32-35 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 36-39 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
/* Rounds 40-43 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
/* Rounds 44-47 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
MSG0 = _mm_add_epi32(MSG0, TMP);
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
/* Rounds 48-51 */
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
MSG1 = _mm_add_epi32(MSG1, TMP);
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
/* Rounds 52-55 */
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
MSG2 = _mm_add_epi32(MSG2, TMP);
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 56-59 */
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
MSG3 = _mm_add_epi32(MSG3, TMP);
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Rounds 60-63 */
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
/* Combine state */
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
s->blkused = 0;
}
/* Undo the ABEF / CDGH arrangement so the words can be stored back
 * into s->h[0..3] and s->h[4..7] in their natural order. */
TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* HGFE */
/* Save state */
_mm_storeu_si128((__m128i*) &s->h[0], STATE0);
_mm_storeu_si128((__m128i*) &s->h[4], STATE1);
/* Keep the leftover bytes (fewer than BLKSIZE) for the next call. */
memcpy(s->block, q, len);
s->blkused = len;
}
}
/*
 * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
 *
 * This is the function whose address SHA256_Init stores in
 * s->sha256. It carries no FUNC_ISA attribute itself and simply
 * forwards to SHA256_ni_, which does.
 * NOTE(review): the exact nature of the LLVM bug is not visible from
 * this file; see the bug report before removing this indirection.
 */
static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
{
SHA256_ni_(s, q, len);
}
#else /* COMPILER_SUPPORTS_SHA_NI */
/*
 * Stub used when the compiler cannot build the SHA-NI implementation.
 * It exists only so that SHA256_Init's reference to SHA256_ni still
 * links; actually calling it trips the assertion.
 * NOTE(review): this relies on supports_sha_ni() returning false in
 * such builds - confirm where that function is defined.
 */
static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
{
assert(0);
}
#endif /* COMPILER_SUPPORTS_SHA_NI */