/* * SHA1 hash algorithm. Used in SSH-2 as a MAC, and the transform is * also used as a `stirring' function for the PuTTY random number * pool. Implemented directly from the specification by Simon * Tatham. */ #include "ssh.h" #include /* ---------------------------------------------------------------------- * Core SHA algorithm: processes 16-word blocks into a message digest. */ #define rol(x,y) ( ((x) << (y)) | (((uint32)x) >> (32-y)) ) static void sha1_sw(SHA_State * s, const unsigned char *q, int len); static void sha1_ni(SHA_State * s, const unsigned char *q, int len); static void SHA_Core_Init(uint32 h[5]) { h[0] = 0x67452301; h[1] = 0xefcdab89; h[2] = 0x98badcfe; h[3] = 0x10325476; h[4] = 0xc3d2e1f0; } void SHATransform(word32 * digest, word32 * block) { word32 w[80]; word32 a, b, c, d, e; int t; #ifdef RANDOM_DIAGNOSTICS { extern int random_diagnostics; if (random_diagnostics) { int i; printf("SHATransform:"); for (i = 0; i < 5; i++) printf(" %08x", digest[i]); printf(" +"); for (i = 0; i < 16; i++) printf(" %08x", block[i]); } } #endif for (t = 0; t < 16; t++) w[t] = block[t]; for (t = 16; t < 80; t++) { word32 tmp = w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16]; w[t] = rol(tmp, 1); } a = digest[0]; b = digest[1]; c = digest[2]; d = digest[3]; e = digest[4]; for (t = 0; t < 20; t++) { word32 tmp = rol(a, 5) + ((b & c) | (d & ~b)) + e + w[t] + 0x5a827999; e = d; d = c; c = rol(b, 30); b = a; a = tmp; } for (t = 20; t < 40; t++) { word32 tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0x6ed9eba1; e = d; d = c; c = rol(b, 30); b = a; a = tmp; } for (t = 40; t < 60; t++) { word32 tmp = rol(a, 5) + ((b & c) | (b & d) | (c & d)) + e + w[t] + 0x8f1bbcdc; e = d; d = c; c = rol(b, 30); b = a; a = tmp; } for (t = 60; t < 80; t++) { word32 tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0xca62c1d6; e = d; d = c; c = rol(b, 30); b = a; a = tmp; } digest[0] += a; digest[1] += b; digest[2] += c; digest[3] += d; digest[4] += e; #ifdef RANDOM_DIAGNOSTICS { extern int random_diagnostics; if (random_diagnostics) { int i; printf(" ="); for (i = 0; i < 5; i++) printf(" %08x", digest[i]); printf("\n"); } } #endif } /* ---------------------------------------------------------------------- * Outer SHA algorithm: take an arbitrary length byte string, * convert it into 16-word blocks with the prescribed padding at * the end, and pass those blocks to the core SHA algorithm. */ static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len); void SHA_Init(SHA_State * s) { SHA_Core_Init(s->h); s->blkused = 0; s->lenhi = s->lenlo = 0; if (supports_sha_ni()) s->sha1 = &sha1_ni; else s->sha1 = &sha1_sw; BinarySink_INIT(s, SHA_BinarySink_write); } static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len) { struct SHA_State *s = BinarySink_DOWNCAST(bs, struct SHA_State); const unsigned char *q = (const unsigned char *) p; uint32 lenw = len; assert(lenw == len); /* * Update the length field. */ s->lenlo += lenw; s->lenhi += (s->lenlo < lenw); (*(s->sha1))(s, q, len); } static void sha1_sw(SHA_State * s, const unsigned char *q, int len) { uint32 wordblock[16]; int i; if (s->blkused && s->blkused + len < 64) { /* * Trivial case: just add to the block. */ memcpy(s->block + s->blkused, q, len); s->blkused += len; } else { /* * We must complete and process at least one block. */ while (s->blkused + len >= 64) { memcpy(s->block + s->blkused, q, 64 - s->blkused); q += 64 - s->blkused; len -= 64 - s->blkused; /* Now process the block. Gather bytes big-endian into words */ for (i = 0; i < 16; i++) { wordblock[i] = (((uint32) s->block[i * 4 + 0]) << 24) | (((uint32) s->block[i * 4 + 1]) << 16) | (((uint32) s->block[i * 4 + 2]) << 8) | (((uint32) s->block[i * 4 + 3]) << 0); } SHATransform(s->h, wordblock); s->blkused = 0; } memcpy(s->block, q, len); s->blkused = len; } } void SHA_Final(SHA_State * s, unsigned char *output) { int i; int pad; unsigned char c[64]; uint32 lenhi, lenlo; if (s->blkused >= 56) pad = 56 + 64 - s->blkused; else pad = 56 - s->blkused; lenhi = (s->lenhi << 3) | (s->lenlo >> (32 - 3)); lenlo = (s->lenlo << 3); memset(c, 0, pad); c[0] = 0x80; put_data(s, &c, pad); put_uint32(s, lenhi); put_uint32(s, lenlo); for (i = 0; i < 5; i++) { output[i * 4] = (s->h[i] >> 24) & 0xFF; output[i * 4 + 1] = (s->h[i] >> 16) & 0xFF; output[i * 4 + 2] = (s->h[i] >> 8) & 0xFF; output[i * 4 + 3] = (s->h[i]) & 0xFF; } } void SHA_Simple(const void *p, int len, unsigned char *output) { SHA_State s; SHA_Init(&s); put_data(&s, p, len); SHA_Final(&s, output); smemclr(&s, sizeof(s)); } /* * Thin abstraction for things where hashes are pluggable. */ struct sha1_hash { SHA_State state; ssh_hash hash; }; static ssh_hash *sha1_new(const struct ssh_hashalg *alg) { struct sha1_hash *h = snew(struct sha1_hash); SHA_Init(&h->state); h->hash.vt = alg; BinarySink_DELEGATE_INIT(&h->hash, &h->state); return &h->hash; } static ssh_hash *sha1_copy(ssh_hash *hashold) { struct sha1_hash *hold, *hnew; ssh_hash *hashnew = sha1_new(hashold->vt); hold = FROMFIELD(hashold, struct sha1_hash, hash); hnew = FROMFIELD(hashnew, struct sha1_hash, hash); hnew->state = hold->state; BinarySink_COPIED(&hnew->state); return hashnew; } static void sha1_free(ssh_hash *hash) { struct sha1_hash *h = FROMFIELD(hash, struct sha1_hash, hash); smemclr(h, sizeof(*h)); sfree(h); } static void sha1_final(ssh_hash *hash, unsigned char *output) { struct sha1_hash *h = FROMFIELD(hash, struct sha1_hash, hash); SHA_Final(&h->state, output); sha1_free(hash); } const struct ssh_hashalg ssh_sha1 = { sha1_new, sha1_copy, sha1_final, sha1_free, 20, "SHA-1" }; /* ---------------------------------------------------------------------- * The above is the SHA-1 algorithm itself. Now we implement the * HMAC wrapper on it. */ struct hmacsha1 { SHA_State sha[3]; ssh2_mac mac; }; static ssh2_mac *hmacsha1_new( const struct ssh2_macalg *alg, ssh2_cipher *cipher) { struct hmacsha1 *ctx = snew(struct hmacsha1); ctx->mac.vt = alg; BinarySink_DELEGATE_INIT(&ctx->mac, &ctx->sha[2]); return &ctx->mac; } static void hmacsha1_free(ssh2_mac *mac) { struct hmacsha1 *ctx = FROMFIELD(mac, struct hmacsha1, mac); smemclr(ctx, sizeof(*ctx)); sfree(ctx); } static void sha1_key_internal(SHA_State *keys, const unsigned char *key, int len) { unsigned char foo[64]; int i; memset(foo, 0x36, 64); for (i = 0; i < len && i < 64; i++) foo[i] ^= key[i]; SHA_Init(&keys[0]); put_data(&keys[0], foo, 64); memset(foo, 0x5C, 64); for (i = 0; i < len && i < 64; i++) foo[i] ^= key[i]; SHA_Init(&keys[1]); put_data(&keys[1], foo, 64); smemclr(foo, 64); /* burn the evidence */ } static void hmacsha1_key(ssh2_mac *mac, const void *key) { struct hmacsha1 *ctx = FROMFIELD(mac, struct hmacsha1, mac); /* Reading the key length out of the ssh2_macalg structure means * this same method can be used for the _buggy variants which use * a shorter key */ sha1_key_internal(ctx->sha, key, ctx->mac.vt->keylen); } static void hmacsha1_start(ssh2_mac *mac) { struct hmacsha1 *ctx = FROMFIELD(mac, struct hmacsha1, mac); ctx->sha[2] = ctx->sha[0]; /* structure copy */ BinarySink_COPIED(&ctx->sha[2]); } static void hmacsha1_genresult(ssh2_mac *mac, unsigned char *hmac) { struct hmacsha1 *ctx = FROMFIELD(mac, struct hmacsha1, mac); SHA_State s; unsigned char intermediate[20]; s = ctx->sha[2]; /* structure copy */ BinarySink_COPIED(&s); SHA_Final(&s, intermediate); s = ctx->sha[1]; /* structure copy */ BinarySink_COPIED(&s); put_data(&s, intermediate, 20); SHA_Final(&s, intermediate); memcpy(hmac, intermediate, ctx->mac.vt->len); smemclr(intermediate, sizeof(intermediate)); } void hmac_sha1_simple(const void *key, int keylen, const void *data, int datalen, unsigned char *output) { SHA_State states[2]; unsigned char intermediate[20]; sha1_key_internal(states, key, keylen); put_data(&states[0], data, datalen); SHA_Final(&states[0], intermediate); put_data(&states[1], intermediate, 20); SHA_Final(&states[1], output); } const struct ssh2_macalg ssh_hmac_sha1 = { hmacsha1_new, hmacsha1_free, hmacsha1_key, hmacsha1_start, hmacsha1_genresult, "hmac-sha1", "hmac-sha1-etm@openssh.com", 20, 20, "HMAC-SHA1" }; const struct ssh2_macalg ssh_hmac_sha1_96 = { hmacsha1_new, hmacsha1_free, hmacsha1_key, hmacsha1_start, hmacsha1_genresult, "hmac-sha1-96", "hmac-sha1-96-etm@openssh.com", 12, 20, "HMAC-SHA1-96" }; const struct ssh2_macalg ssh_hmac_sha1_buggy = { hmacsha1_new, hmacsha1_free, hmacsha1_key, hmacsha1_start, hmacsha1_genresult, "hmac-sha1", NULL, 20, 16, "bug-compatible HMAC-SHA1" }; const struct ssh2_macalg ssh_hmac_sha1_96_buggy = { hmacsha1_new, hmacsha1_free, hmacsha1_key, hmacsha1_start, hmacsha1_genresult, "hmac-sha1-96", NULL, 12, 16, "bug-compatible HMAC-SHA1-96" }; #ifdef COMPILER_SUPPORTS_SHA_NI #if defined _MSC_VER && defined _M_AMD64 # include #endif /* * Set target architecture for Clang and GCC */ #if !defined(__clang__) && defined(__GNUC__) # pragma GCC target("sha") # pragma GCC target("sse4.1") #endif #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5)) # define FUNC_ISA __attribute__ ((target("sse4.1,sha"))) #else # define FUNC_ISA #endif #include #include #include #if defined(__clang__) || defined(__GNUC__) #include #endif /* * Determinators of CPU type */ #if defined(__clang__) || defined(__GNUC__) #include int supports_sha_ni(void) { unsigned int CPUInfo[4]; __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); if (CPUInfo[0] < 7) return 0; __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); return CPUInfo[1] & (1 << 29); /* SHA */ } #else /* defined(__clang__) || defined(__GNUC__) */ int supports_sha_ni(void) { unsigned int CPUInfo[4]; __cpuid(CPUInfo, 0); if (CPUInfo[0] < 7) return 0; __cpuidex(CPUInfo, 7, 0); return CPUInfo[1] & (1 << 29); /* Check SHA */ } #endif /* defined(__clang__) || defined(__GNUC__) */ /* SHA1 implementation using new instructions The code is based on Jeffrey Walton's SHA1 implementation: https://github.com/noloader/SHA-Intrinsics */ FUNC_ISA static void sha1_ni_(SHA_State * s, const unsigned char *q, int len) { if (s->blkused && s->blkused + len < 64) { /* * Trivial case: just add to the block. */ memcpy(s->block + s->blkused, q, len); s->blkused += len; } else { __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); ABCD = _mm_loadu_si128((const __m128i*) s->h); E0 = _mm_set_epi32(s->h[4], 0, 0, 0); ABCD = _mm_shuffle_epi32(ABCD, 0x1B); /* * We must complete and process at least one block. */ while (s->blkused + len >= 64) { __m128i MSG0, MSG1, MSG2, MSG3; memcpy(s->block + s->blkused, q, 64 - s->blkused); q += 64 - s->blkused; len -= 64 - s->blkused; /* Save current state */ ABCD_SAVE = ABCD; E0_SAVE = E0; /* Rounds 0-3 */ MSG0 = _mm_loadu_si128((const __m128i*)(s->block + 0)); MSG0 = _mm_shuffle_epi8(MSG0, MASK); E0 = _mm_add_epi32(E0, MSG0); E1 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); /* Rounds 4-7 */ MSG1 = _mm_loadu_si128((const __m128i*)(s->block + 16)); MSG1 = _mm_shuffle_epi8(MSG1, MASK); E1 = _mm_sha1nexte_epu32(E1, MSG1); E0 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); /* Rounds 8-11 */ MSG2 = _mm_loadu_si128((const __m128i*)(s->block + 32)); MSG2 = _mm_shuffle_epi8(MSG2, MASK); E0 = _mm_sha1nexte_epu32(E0, MSG2); E1 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 12-15 */ MSG3 = _mm_loadu_si128((const __m128i*)(s->block + 48)); MSG3 = _mm_shuffle_epi8(MSG3, MASK); E1 = _mm_sha1nexte_epu32(E1, MSG3); E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 16-19 */ E0 = _mm_sha1nexte_epu32(E0, MSG0); E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 20-23 */ E1 = _mm_sha1nexte_epu32(E1, MSG1); E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 24-27 */ E0 = _mm_sha1nexte_epu32(E0, MSG2); E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 28-31 */ E1 = _mm_sha1nexte_epu32(E1, MSG3); E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 32-35 */ E0 = _mm_sha1nexte_epu32(E0, MSG0); E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 36-39 */ E1 = _mm_sha1nexte_epu32(E1, MSG1); E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 40-43 */ E0 = _mm_sha1nexte_epu32(E0, MSG2); E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 44-47 */ E1 = _mm_sha1nexte_epu32(E1, MSG3); E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 48-51 */ E0 = _mm_sha1nexte_epu32(E0, MSG0); E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 52-55 */ E1 = _mm_sha1nexte_epu32(E1, MSG1); E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 56-59 */ E0 = _mm_sha1nexte_epu32(E0, MSG2); E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 60-63 */ E1 = _mm_sha1nexte_epu32(E1, MSG3); E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 64-67 */ E0 = _mm_sha1nexte_epu32(E0, MSG0); E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 68-71 */ E1 = _mm_sha1nexte_epu32(E1, MSG1); E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 72-75 */ E0 = _mm_sha1nexte_epu32(E0, MSG2); E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); /* Rounds 76-79 */ E1 = _mm_sha1nexte_epu32(E1, MSG3); E0 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); /* Combine state */ E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); s->blkused = 0; } ABCD = _mm_shuffle_epi32(ABCD, 0x1B); /* Save state */ _mm_storeu_si128((__m128i*) s->h, ABCD); s->h[4] = _mm_extract_epi32(E0, 3); memcpy(s->block, q, len); s->blkused = len; } } /* * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980 */ static void sha1_ni(SHA_State * s, const unsigned char *q, int len) { sha1_ni_(s, q, len); } #else /* COMPILER_SUPPORTS_AES_NI */ static void sha1_ni(SHA_State * s, const unsigned char *q, int len) { assert(0); } int supports_sha_ni(void) { return 0; } #endif /* COMPILER_SUPPORTS_AES_NI */