From 5d9d075aac141c68cf0fdfa3c2d1fc910559b56b Mon Sep 17 00:00:00 2001 From: "Pavel I. Kryukov" Date: Mon, 19 Feb 2018 00:07:06 +0300 Subject: [PATCH] Add SHA256 implementation with new instructions SHA256-NI code is conditionally enabled if CPU supports SHA extensions. The procedure is based on Jeffrey Walton's SHA256 implementation: https://github.com/noloader/SHA-Intrinsics --- sshsh256.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 251 insertions(+), 1 deletion(-) diff --git a/sshsh256.c b/sshsh256.c index e83266b5..7bb70bd3 100644 --- a/sshsh256.c +++ b/sshsh256.c @@ -5,6 +5,7 @@ */ #include "ssh.h" +#include /* ---------------------------------------------------------------------- * Core SHA256 algorithm: processes 16-word blocks into a message digest. @@ -20,6 +21,7 @@ #define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) ) static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len); +static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len); void SHA256_Core_Init(SHA256_State *s) { s->h[0] = 0x6a09e667; @@ -99,7 +101,10 @@ void SHA256_Init(SHA256_State *s) { SHA256_Core_Init(s); s->blkused = 0; s->lenhi = s->lenlo = 0; - s->sha256 = &SHA256_sw; + if (supports_sha_ni()) + s->sha256 = &SHA256_ni; + else + s->sha256 = &SHA256_sw; } void SHA256_Bytes(SHA256_State *s, const void *p, int len) { @@ -405,3 +410,248 @@ int main(void) { } #endif + +#ifdef COMPILER_SUPPORTS_SHA_NI + +/* + * Set target architecture for Clang and GCC + */ +#if !defined(__clang__) && defined(__GNUC__) +# pragma GCC target("sha") +# pragma GCC target("sse4.1") +#endif + +#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5)) +# define FUNC_ISA __attribute__ ((target("sse4.1,sha"))) +#else +# define FUNC_ISA +#endif + +#include +#include +#include + +#if defined(__clang__) || defined(__GNUC__) +#include +#endif + +/* SHA256 implementation using new instructions + The code is based on Jeffrey Walton's SHA256 implementation: + https://github.com/noloader/SHA-Intrinsics +*/ +FUNC_ISA +static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len) { + if (s->blkused && s->blkused+len < BLKSIZE) { + /* + * Trivial case: just add to the block. + */ + memcpy(s->block + s->blkused, q, len); + s->blkused += len; + } else { + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i MSG0, MSG1, MSG2, MSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + /* Load initial values */ + TMP = _mm_loadu_si128((const __m128i*) &s->h[0]); + STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ + /* + * We must complete and process at least one block. + */ + while (s->blkused + len >= BLKSIZE) { + memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused); + q += BLKSIZE - s->blkused; + len -= BLKSIZE - s->blkused; + + /* Save current state */ + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + /* Rounds 0-3 */ + MSG = _mm_loadu_si128((const __m128i*) (s->block + 0)); + MSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16)); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32)); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48)); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Combine state */ + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + s->blkused = 0; + } + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */ + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */ + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */ + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */ + + /* Save state */ + _mm_storeu_si128((__m128i*) &s->h[0], STATE0); + _mm_storeu_si128((__m128i*) &s->h[4], STATE1); + + memcpy(s->block, q, len); + s->blkused = len; + } +} + +#else /* COMPILER_SUPPORTS_AES_NI */ + +static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len) +{ + assert(0); +} + +#endif /* COMPILER_SUPPORTS_AES_NI */