diff --git a/cmake/cmake.h.in b/cmake/cmake.h.in
index f051c759..b6c07d2c 100644
--- a/cmake/cmake.h.in
+++ b/cmake/cmake.h.in
@@ -40,3 +40,11 @@
 #cmakedefine01 HAVE_SO_PEERCRED
 #cmakedefine01 HAVE_PANGO_FONT_FAMILY_IS_MONOSPACE
 #cmakedefine01 HAVE_PANGO_FONT_MAP_LIST_FAMILIES
+
+#cmakedefine01 HAVE_AES_NI
+#cmakedefine01 HAVE_SHA_NI
+#cmakedefine01 HAVE_SHAINTRIN_H
+#cmakedefine01 HAVE_NEON_CRYPTO
+#cmakedefine01 HAVE_NEON_SHA512
+#cmakedefine01 HAVE_NEON_SHA512_INTRINSICS
+#cmakedefine01 USE_ARM64_NEON_H
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 74f86cd4..917614be 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_sources_from_current_dir(crypto
-  aes.c
+  aes-common.c
+  aes-select.c
+  aes-sw.c
   arcfour.c
   argon2.c
   bcrypt.c
@@ -23,8 +25,181 @@ add_sources_from_current_dir(crypto
   pubkey-ppk.c
   pubkey-ssh1.c
   rsa.c
-  sha256.c
-  sha512.c
+  sha256-common.c
+  sha256-select.c
+  sha256-sw.c
+  sha512-common.c
+  sha512-select.c
+  sha512-sw.c
   sha3.c
-  sha1.c
+  sha1-common.c
+  sha1-select.c
+  sha1-sw.c
   xdmauth.c)
+
+include(CheckCSourceCompiles)
+
+function(test_compile_with_flags outvar)
+  cmake_parse_arguments(OPT "" ""
+    "GNU_FLAGS;MSVC_FLAGS;ADD_SOURCES_IF_SUCCESSFUL;TEST_SOURCE" "${ARGN}")
+
+  # Figure out what flags are applicable to this compiler.
+  set(flags)
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR
+     CMAKE_C_COMPILER_ID MATCHES "Clang")
+    set(flags ${OPT_GNU_FLAGS})
+  endif()
+  if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    set(flags ${OPT_MSVC_FLAGS})
+  endif()
+
+  # See if we can compile the provided test program.
+  string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} ${flags})
+  check_c_source_compiles("${OPT_TEST_SOURCE}" "${outvar}")
+
+  if(${outvar} AND OPT_ADD_SOURCES_IF_SUCCESSFUL)
+    # Make an object library that compiles the implementation with the
+    # necessary flags, and add the resulting objects to the crypto
+    # library.
+    set(libname object_lib_${outvar})
+    add_library(${libname} OBJECT ${OPT_ADD_SOURCES_IF_SUCCESSFUL})
+    target_compile_options(${libname} PRIVATE ${flags})
+    target_sources(crypto PRIVATE $<TARGET_OBJECTS:${libname}>)
+  endif()
+
+  # Export the output to the caller's scope, so that further tests can
+  # be based on it.
+  set(${outvar} ${${outvar}} PARENT_SCOPE)
+endfunction()
+
+# ----------------------------------------------------------------------
+# Try to enable x86 intrinsics-based crypto implementations.
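As a concrete illustration of the helper defined above: when one of the checks below succeeds with its GNU-style flags (take HAVE_AES_NI as the example), the effect is roughly what this hand-written CMake would do, with the object-library name derived from the output variable and the sources and flags taken from the call site. This is a sketch for orientation only, not additional build code from the patch:

    add_library(object_lib_HAVE_AES_NI OBJECT aes-ni.c)
    target_compile_options(object_lib_HAVE_AES_NI PRIVATE -msse4.1 -maes)
    target_sources(crypto PRIVATE $<TARGET_OBJECTS:object_lib_HAVE_AES_NI>)
    set(HAVE_AES_NI ${HAVE_AES_NI} PARENT_SCOPE)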
+
+test_compile_with_flags(HAVE_WMMINTRIN_H
+  GNU_FLAGS -msse4.1
+  TEST_SOURCE "
+    #include <wmmintrin.h>
+    #include <smmintrin.h>
+    volatile __m128i r, a, b;
+    int main(void) { r = _mm_xor_si128(a, b); }")
+if(HAVE_WMMINTRIN_H)
+  test_compile_with_flags(HAVE_AES_NI
+    GNU_FLAGS -msse4.1 -maes
+    TEST_SOURCE "
+      #include <wmmintrin.h>
+      #include <smmintrin.h>
+      volatile __m128i r, a, b;
+      int main(void) { r = _mm_aesenc_si128(a, b); }"
+    ADD_SOURCES_IF_SUCCESSFUL aes-ni.c)
+
+  # shaintrin.h doesn't exist on all compilers; sometimes it's folded
+  # into the other headers
+  test_compile_with_flags(HAVE_SHAINTRIN_H
+    GNU_FLAGS -msse4.1 -msha
+    TEST_SOURCE "
+      #include <wmmintrin.h>
+      #include <smmintrin.h>
+      #include <immintrin.h>
+      #include <shaintrin.h>
+      volatile __m128i r, a, b;
+      int main(void) { r = _mm_xor_si128(a, b); }")
+  if(HAVE_SHAINTRIN_H)
+    set(include_shaintrin "#include <shaintrin.h>")
+  else()
+    set(include_shaintrin "")
+  endif()
+
+  test_compile_with_flags(HAVE_SHA_NI
+    GNU_FLAGS -msse4.1 -msha
+    TEST_SOURCE "
+      #include <wmmintrin.h>
+      #include <smmintrin.h>
+      #include <immintrin.h>
+      ${include_shaintrin}
+      volatile __m128i r, a, b, c;
+      int main(void) { r = _mm_sha256rnds2_epu32(a, b, c); }"
+    ADD_SOURCES_IF_SUCCESSFUL sha256-ni.c sha1-ni.c)
+endif()
+
+# ----------------------------------------------------------------------
+# Try to enable Arm Neon intrinsics-based crypto implementations.
+
+# Start by checking which header file we need. ACLE specifies that it
+# ought to be <arm_neon.h>, on both 32- and 64-bit Arm, but Visual
+# Studio for some reason renamed the header to <arm64_neon.h> in
+# 64-bit, and gives an error if you use the standard name. (However,
+# clang-cl does let you use the standard name.)
+test_compile_with_flags(HAVE_ARM_NEON_H
+  MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
+  TEST_SOURCE "
+    #include <arm_neon.h>
+    volatile uint8x16_t r, a, b;
+    int main(void) { r = veorq_u8(a, b); }")
+if(HAVE_ARM_NEON_H)
+  set(neon ON)
+  set(neon_header "arm_neon.h")
+else()
+  test_compile_with_flags(HAVE_ARM64_NEON_H TEST_SOURCE "
+    #include <arm64_neon.h>
+    volatile uint8x16_t r, a, b;
+    int main(void) { r = veorq_u8(a, b); }")
+  if(HAVE_ARM64_NEON_H)
+    set(neon ON)
+    set(neon_header "arm64_neon.h")
+    set(USE_ARM64_NEON_H ON)
+  endif()
+endif()
+
+if(neon)
+  # If we have _some_ NEON header, look for the individual things we
+  # can enable with it.
+
+  # The 'crypto' architecture extension includes support for AES,
+  # SHA-1, and SHA-256.
+  test_compile_with_flags(HAVE_NEON_CRYPTO
+    GNU_FLAGS -march=armv8-a+crypto
+    MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS
+    TEST_SOURCE "
+      #include <${neon_header}>
+      volatile uint8x16_t r, a, b;
+      volatile uint32x4_t s, x, y, z;
+      int main(void) { r = vaeseq_u8(a, b); s = vsha256hq_u32(x, y, z); }"
+    ADD_SOURCES_IF_SUCCESSFUL aes-neon.c sha256-neon.c sha1-neon.c)
+
+  # The 'sha3' architecture extension, despite the name, includes
+  # support for SHA-512 (from the SHA-2 standard) as well as SHA-3
+  # proper.
+  #
+  # Versions of clang up to and including clang 12 support this
+  # extension in assembly language, but not the ACLE intrinsics for
+  # it. So we check both.
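Whichever of the two checks just below succeeds, the result is folded into the single HAVE_NEON_SHA512 flag and exported to the parent scope; the #cmakedefine01 lines added to cmake/cmake.h.in above then turn each exported flag into a 0/1 macro in the generated header, which C code tests with plain #if in the same style as the IF_NI/IF_NEON selector macros in aes-select.c later in this patch. A minimal illustrative sketch of that flow, not part of the patch (the extern identifier shown is hypothetical):

    /* Fragment of the generated header, assuming the ACLE-intrinsics
     * check failed but the inline-assembler fallback check passed: */
    #define HAVE_NEON_SHA512 1
    #define HAVE_NEON_SHA512_INTRINSICS 0

    /* A consumer can then compile in the accelerated implementation
     * whenever some form of SHA-512 NEON support exists at all: */
    #if HAVE_NEON_SHA512
    extern const ssh_hashalg ssh_sha512_neon;   /* hypothetical identifier */
    #endif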
+ test_compile_with_flags(HAVE_NEON_SHA512_INTRINSICS + GNU_FLAGS -march=armv8.2-a+crypto+sha3 + TEST_SOURCE " + #include <${neon_header}> + volatile uint64x2_t r, a, b; + int main(void) { r = vsha512su0q_u64(a, b); }" + ADD_SOURCES_IF_SUCCESSFUL sha512-neon.c) + if(HAVE_NEON_SHA512_INTRINSICS) + set(HAVE_NEON_SHA512 ON) + else() + test_compile_with_flags(HAVE_NEON_SHA512_ASM + GNU_FLAGS -march=armv8.2-a+crypto+sha3 + TEST_SOURCE " + #include <${neon_header}> + volatile uint64x2_t r, a; + int main(void) { __asm__(\"sha512su0 %0.2D,%1.2D\" : \"+w\" (r) : \"w\" (a)); }" + ADD_SOURCES_IF_SUCCESSFUL sha512-neon.c) + if(HAVE_NEON_SHA512_ASM) + set(HAVE_NEON_SHA512 ON) + endif() + endif() +endif() + +set(HAVE_AES_NI ${HAVE_AES_NI} PARENT_SCOPE) +set(HAVE_SHA_NI ${HAVE_SHA_NI} PARENT_SCOPE) +set(HAVE_SHAINTRIN_H ${HAVE_SHAINTRIN_H} PARENT_SCOPE) +set(HAVE_NEON_CRYPTO ${HAVE_NEON_CRYPTO} PARENT_SCOPE) +set(HAVE_NEON_SHA512 ${HAVE_NEON_SHA512} PARENT_SCOPE) +set(HAVE_NEON_SHA512_INTRINSICS ${HAVE_NEON_SHA512_INTRINSICS} PARENT_SCOPE) +set(USE_ARM64_NEON_H ${USE_ARM64_NEON_H} PARENT_SCOPE) diff --git a/crypto/aes-common.c b/crypto/aes-common.c new file mode 100644 index 00000000..e1c41ddf --- /dev/null +++ b/crypto/aes-common.c @@ -0,0 +1,14 @@ +/* + * Common variable definitions across all the AES implementations. + */ + +#include "ssh.h" +#include "aes.h" + +const uint8_t aes_key_setup_round_constants[10] = { + /* The first few powers of X in GF(2^8), used during key setup. + * This can safely be a lookup table without side channel risks, + * because key setup iterates through it once in a standard way + * regardless of the key. */ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, +}; diff --git a/crypto/aes-neon.c b/crypto/aes-neon.c new file mode 100644 index 00000000..d47d2fd3 --- /dev/null +++ b/crypto/aes-neon.c @@ -0,0 +1,294 @@ +/* ---------------------------------------------------------------------- + * Hardware-accelerated implementation of AES using Arm NEON. + */ + +#include "ssh.h" +#include "aes.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool aes_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform AES detection function, + * because it has to be implemented by asking the operating system + * rather than directly querying the CPU. + * + * That's because Arm systems commonly have multiple cores that + * are not all alike, so any method of querying whether NEON + * crypto instructions work on the _current_ CPU - even one as + * crude as just trying one and catching the SIGILL - wouldn't + * give an answer that you could still rely on the first time the + * OS migrated your process to another CPU. + */ + return platform_aes_neon_available(); +} + +/* + * Core NEON encrypt/decrypt functions, one per length and direction. + */ + +#define NEON_CIPHER(len, repmacro) \ + static inline uint8x16_t aes_neon_##len##_e( \ + uint8x16_t v, const uint8x16_t *keysched) \ + { \ + repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \ + v = vaeseq_u8(v, *keysched++); \ + return veorq_u8(v, *keysched); \ + } \ + static inline uint8x16_t aes_neon_##len##_d( \ + uint8x16_t v, const uint8x16_t *keysched) \ + { \ + repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \ + v = vaesdq_u8(v, *keysched++); \ + return veorq_u8(v, *keysched); \ + } + +NEON_CIPHER(128, REP9) +NEON_CIPHER(192, REP11) +NEON_CIPHER(256, REP13) + +/* + * The main key expansion. 
+ */ +static void aes_neon_key_expand( + const unsigned char *key, size_t key_words, + uint8x16_t *keysched_e, uint8x16_t *keysched_d) +{ + size_t rounds = key_words + 6; + size_t sched_words = (rounds + 1) * 4; + + /* + * Store the key schedule as 32-bit integers during expansion, so + * that it's easy to refer back to individual previous words. We + * collect them into the final uint8x16_t form at the end. + */ + uint32_t sched[MAXROUNDKEYS * 4]; + + unsigned rconpos = 0; + + for (size_t i = 0; i < sched_words; i++) { + if (i < key_words) { + sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); + } else { + uint32_t temp = sched[i - 1]; + + bool rotate_and_round_constant = (i % key_words == 0); + bool sub = rotate_and_round_constant || + (key_words == 8 && i % 8 == 4); + + if (rotate_and_round_constant) + temp = (temp << 24) | (temp >> 8); + + if (sub) { + uint32x4_t v32 = vdupq_n_u32(temp); + uint8x16_t v8 = vreinterpretq_u8_u32(v32); + v8 = vaeseq_u8(v8, vdupq_n_u8(0)); + v32 = vreinterpretq_u32_u8(v8); + temp = vget_lane_u32(vget_low_u32(v32), 0); + } + + if (rotate_and_round_constant) { + assert(rconpos < lenof(aes_key_setup_round_constants)); + temp ^= aes_key_setup_round_constants[rconpos++]; + } + + sched[i] = sched[i - key_words] ^ temp; + } + } + + /* + * Combine the key schedule words into uint8x16_t vectors and + * store them in the output context. + */ + for (size_t round = 0; round <= rounds; round++) + keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round)); + + smemclr(sched, sizeof(sched)); + + /* + * Now prepare the modified keys for the inverse cipher. + */ + for (size_t eround = 0; eround <= rounds; eround++) { + size_t dround = rounds - eround; + uint8x16_t rkey = keysched_e[eround]; + if (eround && dround) /* neither first nor last */ + rkey = vaesimcq_u8(rkey); + keysched_d[dround] = rkey; + } +} + +/* + * Auxiliary routine to reverse the byte order of a vector, so that + * the SDCTR IV can be made big-endian for feeding to the cipher. + * + * In fact we don't need to reverse the vector _all_ the way; we leave + * the two lanes in MSW,LSW order, because that makes no difference to + * the efficiency of the increment. That way we only have to reverse + * bytes within each lane in this function. + */ +static inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v) +{ + return vrev64q_u8(v); +} + +/* + * Auxiliary routine to increment the 128-bit counter used in SDCTR + * mode. There's no instruction to treat a 128-bit vector as a single + * long integer, so instead we have to increment the bottom half + * unconditionally, and the top half if the bottom half started off as + * all 1s (in which case there was about to be a carry). + */ +static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in) +{ +#ifdef __aarch64__ + /* There will be a carry if the low 64 bits are all 1s. */ + uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF); + uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1); + + /* Make a word whose bottom half is unconditionally all 1s, and + * the top half is 'carry', i.e. all 0s most of the time but all + * 1s if we need to increment the top half. Then that word is what + * we need to _subtract_ from the input counter. */ + uint64x2_t subtrahend = vcombine_u64(carry, all1); +#else + /* AArch32 doesn't have comparisons that operate on a 64-bit lane, + * so we start by comparing each 32-bit half of the low 64 bits + * _separately_ to all-1s. 
*/ + uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF); + uint32x2_t carry = vceq_u32( + vget_high_u32(vreinterpretq_u32_u8(in)), all1); + + /* Swap the 32-bit words of the compare output, and AND with the + * unswapped version. Now carry is all 1s iff the bottom half of + * the input counter was all 1s, and all 0s otherwise. */ + carry = vand_u32(carry, vrev64_u32(carry)); + + /* Now make the vector to subtract in the same way as above. */ + uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1)); +#endif + + return vreinterpretq_u8_u64( + vsubq_u64(vreinterpretq_u64_u8(in), subtrahend)); +} + +/* + * The SSH interface and the cipher modes. + */ + +typedef struct aes_neon_context aes_neon_context; +struct aes_neon_context { + uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; + + ssh_cipher ciph; +}; + +static ssh_cipher *aes_neon_new(const ssh_cipheralg *alg) +{ + const struct aes_extra *extra = (const struct aes_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + aes_neon_context *ctx = snew(aes_neon_context); + ctx->ciph.vt = alg; + return &ctx->ciph; +} + +static void aes_neon_free(ssh_cipher *ciph) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + smemclr(ctx, sizeof(*ctx)); + sfree(ctx); +} + +static void aes_neon_setkey(ssh_cipher *ciph, const void *vkey) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + const unsigned char *key = (const unsigned char *)vkey; + + aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32, + ctx->keysched_e, ctx->keysched_d); +} + +static void aes_neon_setiv_cbc(ssh_cipher *ciph, const void *iv) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + ctx->iv = vld1q_u8(iv); +} + +static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint8x16_t counter = vld1q_u8(iv); + ctx->iv = aes_neon_sdctr_reverse(counter); +} + +typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched); + +static inline void aes_cbc_neon_encrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t plaintext = vld1q_u8(blk); + uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv); + uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e); + vst1q_u8(blk, ciphertext); + ctx->iv = ciphertext; + } +} + +static inline void aes_cbc_neon_decrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t ciphertext = vld1q_u8(blk); + uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d); + uint8x16_t plaintext = veorq_u8(decrypted, ctx->iv); + vst1q_u8(blk, plaintext); + ctx->iv = ciphertext; + } +} + +static inline void aes_sdctr_neon( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv); + uint8x16_t keystream = encrypt(counter, ctx->keysched_e); + uint8x16_t input = vld1q_u8(blk); + uint8x16_t output = veorq_u8(input, keystream); + vst1q_u8(blk, 
output); + ctx->iv = aes_neon_sdctr_increment(ctx->iv); + } +} + +#define NEON_ENC_DEC(len) \ + static void aes##len##_neon_cbc_encrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \ + static void aes##len##_neon_cbc_decrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \ + static void aes##len##_neon_sdctr( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ + +NEON_ENC_DEC(128) +NEON_ENC_DEC(192) +NEON_ENC_DEC(256) + +AES_EXTRA(_neon); +AES_ALL_VTABLES(_neon, "NEON accelerated"); diff --git a/crypto/aes-ni.c b/crypto/aes-ni.c new file mode 100644 index 00000000..22348de4 --- /dev/null +++ b/crypto/aes-ni.c @@ -0,0 +1,281 @@ +/* + * Hardware-accelerated implementation of AES using x86 AES-NI. + */ + +#include "ssh.h" +#include "aes.h" + +#include +#include + +#if defined(__clang__) || defined(__GNUC__) +#include +#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3]) +#else +#define GET_CPU_ID(out) __cpuid(out, 1) +#endif + +static bool aes_ni_available(void) +{ + /* + * Determine if AES is available on this CPU, by checking that + * both AES itself and SSE4.1 are supported. + */ + unsigned int CPUInfo[4]; + GET_CPU_ID(CPUInfo); + return (CPUInfo[2] & (1 << 25)) && (CPUInfo[2] & (1 << 19)); +} + +/* + * Core AES-NI encrypt/decrypt functions, one per length and direction. + */ + +#define NI_CIPHER(len, dir, dirlong, repmacro) \ + static inline __m128i aes_ni_##len##_##dir( \ + __m128i v, const __m128i *keysched) \ + { \ + v = _mm_xor_si128(v, *keysched++); \ + repmacro(v = _mm_aes##dirlong##_si128(v, *keysched++);); \ + return _mm_aes##dirlong##last_si128(v, *keysched); \ + } + +NI_CIPHER(128, e, enc, REP9) +NI_CIPHER(128, d, dec, REP9) +NI_CIPHER(192, e, enc, REP11) +NI_CIPHER(192, d, dec, REP11) +NI_CIPHER(256, e, enc, REP13) +NI_CIPHER(256, d, dec, REP13) + +/* + * The main key expansion. + */ +static void aes_ni_key_expand( + const unsigned char *key, size_t key_words, + __m128i *keysched_e, __m128i *keysched_d) +{ + size_t rounds = key_words + 6; + size_t sched_words = (rounds + 1) * 4; + + /* + * Store the key schedule as 32-bit integers during expansion, so + * that it's easy to refer back to individual previous words. We + * collect them into the final __m128i form at the end. + */ + uint32_t sched[MAXROUNDKEYS * 4]; + + unsigned rconpos = 0; + + for (size_t i = 0; i < sched_words; i++) { + if (i < key_words) { + sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); + } else { + uint32_t temp = sched[i - 1]; + + bool rotate_and_round_constant = (i % key_words == 0); + bool only_sub = (key_words == 8 && i % 8 == 4); + + if (rotate_and_round_constant) { + __m128i v = _mm_setr_epi32(0,temp,0,0); + v = _mm_aeskeygenassist_si128(v, 0); + temp = _mm_extract_epi32(v, 1); + + assert(rconpos < lenof(aes_key_setup_round_constants)); + temp ^= aes_key_setup_round_constants[rconpos++]; + } else if (only_sub) { + __m128i v = _mm_setr_epi32(0,temp,0,0); + v = _mm_aeskeygenassist_si128(v, 0); + temp = _mm_extract_epi32(v, 0); + } + + sched[i] = sched[i - key_words] ^ temp; + } + } + + /* + * Combine the key schedule words into __m128i vectors and store + * them in the output context. 
+ */ + for (size_t round = 0; round <= rounds; round++) + keysched_e[round] = _mm_setr_epi32( + sched[4*round ], sched[4*round+1], + sched[4*round+2], sched[4*round+3]); + + smemclr(sched, sizeof(sched)); + + /* + * Now prepare the modified keys for the inverse cipher. + */ + for (size_t eround = 0; eround <= rounds; eround++) { + size_t dround = rounds - eround; + __m128i rkey = keysched_e[eround]; + if (eround && dround) /* neither first nor last */ + rkey = _mm_aesimc_si128(rkey); + keysched_d[dround] = rkey; + } +} + +/* + * Auxiliary routine to increment the 128-bit counter used in SDCTR + * mode. + */ +static inline __m128i aes_ni_sdctr_increment(__m128i v) +{ + const __m128i ONE = _mm_setr_epi32(1,0,0,0); + const __m128i ZERO = _mm_setzero_si128(); + + /* Increment the low-order 64 bits of v */ + v = _mm_add_epi64(v, ONE); + /* Check if they've become zero */ + __m128i cmp = _mm_cmpeq_epi64(v, ZERO); + /* If so, the low half of cmp is all 1s. Pack that into the high + * half of addend with zero in the low half. */ + __m128i addend = _mm_unpacklo_epi64(ZERO, cmp); + /* And subtract that from v, which increments the high 64 bits iff + * the low 64 wrapped round. */ + v = _mm_sub_epi64(v, addend); + + return v; +} + +/* + * Auxiliary routine to reverse the byte order of a vector, so that + * the SDCTR IV can be made big-endian for feeding to the cipher. + */ +static inline __m128i aes_ni_sdctr_reverse(__m128i v) +{ + v = _mm_shuffle_epi8( + v, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + return v; +} + +/* + * The SSH interface and the cipher modes. + */ + +typedef struct aes_ni_context aes_ni_context; +struct aes_ni_context { + __m128i keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; + + void *pointer_to_free; + ssh_cipher ciph; +}; + +static ssh_cipher *aes_ni_new(const ssh_cipheralg *alg) +{ + const struct aes_extra *extra = (const struct aes_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + /* + * The __m128i variables in the context structure need to be + * 16-byte aligned, but not all malloc implementations that this + * code has to work with will guarantee to return a 16-byte + * aligned pointer. So we over-allocate, manually realign the + * pointer ourselves, and store the original one inside the + * context so we know how to free it later. 
+ */ + void *allocation = smalloc(sizeof(aes_ni_context) + 15); + uintptr_t alloc_address = (uintptr_t)allocation; + uintptr_t aligned_address = (alloc_address + 15) & ~15; + aes_ni_context *ctx = (aes_ni_context *)aligned_address; + + ctx->ciph.vt = alg; + ctx->pointer_to_free = allocation; + return &ctx->ciph; +} + +static void aes_ni_free(ssh_cipher *ciph) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + void *allocation = ctx->pointer_to_free; + smemclr(ctx, sizeof(*ctx)); + sfree(allocation); +} + +static void aes_ni_setkey(ssh_cipher *ciph, const void *vkey) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + const unsigned char *key = (const unsigned char *)vkey; + + aes_ni_key_expand(key, ctx->ciph.vt->real_keybits / 32, + ctx->keysched_e, ctx->keysched_d); +} + +static void aes_ni_setiv_cbc(ssh_cipher *ciph, const void *iv) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + ctx->iv = _mm_loadu_si128(iv); +} + +static void aes_ni_setiv_sdctr(ssh_cipher *ciph, const void *iv) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + __m128i counter = _mm_loadu_si128(iv); + ctx->iv = aes_ni_sdctr_reverse(counter); +} + +typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched); + +static inline void aes_cbc_ni_encrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i plaintext = _mm_loadu_si128((const __m128i *)blk); + __m128i cipher_input = _mm_xor_si128(plaintext, ctx->iv); + __m128i ciphertext = encrypt(cipher_input, ctx->keysched_e); + _mm_storeu_si128((__m128i *)blk, ciphertext); + ctx->iv = ciphertext; + } +} + +static inline void aes_cbc_ni_decrypt( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn decrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i ciphertext = _mm_loadu_si128((const __m128i *)blk); + __m128i decrypted = decrypt(ciphertext, ctx->keysched_d); + __m128i plaintext = _mm_xor_si128(decrypted, ctx->iv); + _mm_storeu_si128((__m128i *)blk, plaintext); + ctx->iv = ciphertext; + } +} + +static inline void aes_sdctr_ni( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i counter = aes_ni_sdctr_reverse(ctx->iv); + __m128i keystream = encrypt(counter, ctx->keysched_e); + __m128i input = _mm_loadu_si128((const __m128i *)blk); + __m128i output = _mm_xor_si128(input, keystream); + _mm_storeu_si128((__m128i *)blk, output); + ctx->iv = aes_ni_sdctr_increment(ctx->iv); + } +} + +#define NI_ENC_DEC(len) \ + static void aes##len##_ni_cbc_encrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_ni_encrypt(ciph, vblk, blklen, aes_ni_##len##_e); } \ + static void aes##len##_ni_cbc_decrypt( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_cbc_ni_decrypt(ciph, vblk, blklen, aes_ni_##len##_d); } \ + static void aes##len##_ni_sdctr( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ + +NI_ENC_DEC(128) +NI_ENC_DEC(192) +NI_ENC_DEC(256) + +AES_EXTRA(_ni); +AES_ALL_VTABLES(_ni, "AES-NI accelerated"); diff --git a/crypto/aes-select.c 
b/crypto/aes-select.c new file mode 100644 index 00000000..f0c5031f --- /dev/null +++ b/crypto/aes-select.c @@ -0,0 +1,89 @@ +/* + * Top-level vtables to select an AES implementation. + */ + +#include +#include + +#include "putty.h" +#include "ssh.h" +#include "aes.h" + +static ssh_cipher *aes_select(const ssh_cipheralg *alg) +{ + const ssh_cipheralg *const *real_algs = (const ssh_cipheralg **)alg->extra; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh_cipheralg *alg = real_algs[i]; + const struct aes_extra *alg_extra = + (const struct aes_extra *)alg->extra; + if (check_availability(alg_extra)) + return ssh_cipher_new(alg); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only AES, which is + * always available. */ + unreachable("aes_select ran off the end of its list"); +} + +#if HAVE_AES_NI +#define IF_NI(...) __VA_ARGS__ +#else +#define IF_NI(...) +#endif + +#if HAVE_NEON_CRYPTO +#define IF_NEON(...) __VA_ARGS__ +#else +#define IF_NEON(...) +#endif + +#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits) \ + static const ssh_cipheralg * \ + ssh_aes ## bits ## _ ## mode_c ## _impls[] = { \ + IF_NI(&ssh_aes ## bits ## _ ## mode_c ## _ni,) \ + IF_NEON(&ssh_aes ## bits ## _ ## mode_c ## _neon,) \ + &ssh_aes ## bits ## _ ## mode_c ## _sw, \ + NULL, \ + }; \ + const ssh_cipheralg ssh_aes ## bits ## _ ## mode_c = { \ + .new = aes_select, \ + .ssh2_id = "aes" #bits "-" mode_protocol, \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .text_name = "AES-" #bits " " mode_display \ + " (dummy selector vtable)", \ + .extra = ssh_aes ## bits ## _ ## mode_c ## _impls, \ + } + +AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 128); +AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 192); +AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 256); +AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 128); +AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 192); +AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 256); + +static const ssh_cipheralg ssh_rijndael_lysator = { + /* Same as aes256_cbc, but with a different protocol ID */ + .new = aes_select, + .ssh2_id = "rijndael-cbc@lysator.liu.se", + .blksize = 16, + .real_keybits = 256, + .padded_keybytes = 256/8, + .text_name = "AES-256 CBC (dummy selector vtable)", + .extra = ssh_aes256_cbc_impls, +}; + +static const ssh_cipheralg *const aes_list[] = { + &ssh_aes256_sdctr, + &ssh_aes256_cbc, + &ssh_rijndael_lysator, + &ssh_aes192_sdctr, + &ssh_aes192_cbc, + &ssh_aes128_sdctr, + &ssh_aes128_cbc, +}; + +const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list }; diff --git a/crypto/aes.c b/crypto/aes-sw.c similarity index 59% rename from crypto/aes.c rename to crypto/aes-sw.c index a7ca1117..f8512388 100644 --- a/crypto/aes.c +++ b/crypto/aes-sw.c @@ -1,247 +1,4 @@ /* - * Implementation of AES. - */ - -#include -#include - -#include "ssh.h" -#include "mpint_i.h" /* we reuse the BignumInt system */ - -/* - * Start by deciding whether we can support hardware AES at all. 
- */ -#define HW_AES_NONE 0 -#define HW_AES_NI 1 -#define HW_AES_NEON 2 - -#ifdef _FORCE_AES_NI -# define HW_AES HW_AES_NI -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_AES HW_AES_NI -# endif -#elif defined(__GNUC__) -# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_AES HW_AES_NI -# endif -#elif defined (_MSC_VER) -# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729 -# define HW_AES HW_AES_NI -# endif -#endif - -#ifdef _FORCE_AES_NEON -# define HW_AES HW_AES_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_CRYPTO - /* If the Arm crypto extension is available already, we can - * support NEON AES without having to enable anything by hand */ -# define HW_AES HW_AES_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_AES HW_AES_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#elif defined _MSC_VER -# if defined _M_ARM64 -# define HW_AES HW_AES_NEON - /* 64-bit Visual Studio uses the header in place - * of the standard */ -# define USE_ARM64_NEON_H -# elif defined _M_ARM -# define HW_AES HW_AES_NEON - /* 32-bit Visual Studio uses the right header name, but requires - * this #define to enable a set of intrinsic definitions that - * do not omit one of the parameters for vaes[ed]q_u8 */ -# define _ARM_USE_NEW_NEON_INTRINSICS -# endif -#endif - -#if defined _FORCE_SOFTWARE_AES || !defined HW_AES -# undef HW_AES -# define HW_AES HW_AES_NONE -#endif - -#if HW_AES == HW_AES_NI -#define HW_NAME_SUFFIX " (AES-NI accelerated)" -#elif HW_AES == HW_AES_NEON -#define HW_NAME_SUFFIX " (NEON accelerated)" -#else -#define HW_NAME_SUFFIX " (!NONEXISTENT ACCELERATED VERSION!)" -#endif - -/* - * Vtable collection for AES. For each SSH-level cipher id (i.e. - * combination of key length and cipher mode), we provide three - * vtables: one for the pure software implementation, one using - * hardware acceleration (if available), and a top-level one which is - * never actually instantiated, and only contains a new() method whose - * job is to decide which of the other two to return an actual - * instance of. 
- */ - -static ssh_cipher *aes_select(const ssh_cipheralg *alg); -static ssh_cipher *aes_sw_new(const ssh_cipheralg *alg); -static void aes_sw_free(ssh_cipher *); -static void aes_sw_setiv_cbc(ssh_cipher *, const void *iv); -static void aes_sw_setiv_sdctr(ssh_cipher *, const void *iv); -static void aes_sw_setkey(ssh_cipher *, const void *key); -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg); -static void aes_hw_free(ssh_cipher *); -static void aes_hw_setiv_cbc(ssh_cipher *, const void *iv); -static void aes_hw_setiv_sdctr(ssh_cipher *, const void *iv); -static void aes_hw_setkey(ssh_cipher *, const void *key); - -struct aes_extra { - const ssh_cipheralg *sw, *hw; -}; - -#define VTABLES_INNER(cid, pid, bits, name, encsuffix, \ - decsuffix, setivsuffix, flagsval) \ - static void cid##_sw##encsuffix(ssh_cipher *, void *blk, int len); \ - static void cid##_sw##decsuffix(ssh_cipher *, void *blk, int len); \ - const ssh_cipheralg ssh_##cid##_sw = { \ - .new = aes_sw_new, \ - .free = aes_sw_free, \ - .setiv = aes_sw_##setivsuffix, \ - .setkey = aes_sw_setkey, \ - .encrypt = cid##_sw##encsuffix, \ - .decrypt = cid##_sw##decsuffix, \ - .ssh2_id = pid, \ - .blksize = 16, \ - .real_keybits = bits, \ - .padded_keybytes = bits/8, \ - .flags = flagsval, \ - .text_name = name " (unaccelerated)", \ - }; \ - \ - static void cid##_hw##encsuffix(ssh_cipher *, void *blk, int len); \ - static void cid##_hw##decsuffix(ssh_cipher *, void *blk, int len); \ - const ssh_cipheralg ssh_##cid##_hw = { \ - .new = aes_hw_new, \ - .free = aes_hw_free, \ - .setiv = aes_hw_##setivsuffix, \ - .setkey = aes_hw_setkey, \ - .encrypt = cid##_hw##encsuffix, \ - .decrypt = cid##_hw##decsuffix, \ - .ssh2_id = pid, \ - .blksize = 16, \ - .real_keybits = bits, \ - .padded_keybytes = bits/8, \ - .flags = flagsval, \ - .text_name = name HW_NAME_SUFFIX, \ - }; \ - \ - static const struct aes_extra extra_##cid = { \ - &ssh_##cid##_sw, &ssh_##cid##_hw }; \ - \ - const ssh_cipheralg ssh_##cid = { \ - .new = aes_select, \ - .ssh2_id = pid, \ - .blksize = 16, \ - .real_keybits = bits, \ - .padded_keybytes = bits/8, \ - .flags = flagsval, \ - .text_name = name " (dummy selector vtable)", \ - .extra = &extra_##cid \ - }; \ - -#define VTABLES(keylen) \ - VTABLES_INNER(aes ## keylen ## _cbc, "aes" #keylen "-cbc", \ - keylen, "AES-" #keylen " CBC", _encrypt, _decrypt, \ - setiv_cbc, SSH_CIPHER_IS_CBC) \ - VTABLES_INNER(aes ## keylen ## _sdctr, "aes" #keylen "-ctr", \ - keylen, "AES-" #keylen " SDCTR",,, setiv_sdctr, 0) - -VTABLES(128) -VTABLES(192) -VTABLES(256) - -static const ssh_cipheralg ssh_rijndael_lysator = { - /* Same as aes256_cbc, but with a different protocol ID */ - .new = aes_select, - .ssh2_id = "rijndael-cbc@lysator.liu.se", - .blksize = 16, - .real_keybits = 256, - .padded_keybytes = 256/8, - .flags = 0, - .text_name = "AES-256 CBC (dummy selector vtable)", - .extra = &extra_aes256_cbc, -}; - -static const ssh_cipheralg *const aes_list[] = { - &ssh_aes256_sdctr, - &ssh_aes256_cbc, - &ssh_rijndael_lysator, - &ssh_aes192_sdctr, - &ssh_aes192_cbc, - &ssh_aes128_sdctr, - &ssh_aes128_cbc, -}; - -const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list }; - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool aes_hw_available(void); - -/* - * The top-level selection function, caching the results of - * aes_hw_available() so it only has to run once. 
- */ -static bool aes_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = aes_hw_available(); - initialised = true; - } - return hw_available; -} - -static ssh_cipher *aes_select(const ssh_cipheralg *alg) -{ - const struct aes_extra *extra = (const struct aes_extra *)alg->extra; - const ssh_cipheralg *real_alg = - aes_hw_available_cached() ? extra->hw : extra->sw; - - return ssh_cipher_new(real_alg); -} - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -#define REP2(x) x x -#define REP4(x) REP2(REP2(x)) -#define REP8(x) REP2(REP4(x)) -#define REP9(x) REP8(x) x -#define REP11(x) REP8(x) REP2(x) x -#define REP13(x) REP8(x) REP4(x) x - -static const uint8_t key_setup_round_constants[] = { - /* The first few powers of X in GF(2^8), used during key setup. - * This can safely be a lookup table without side channel risks, - * because key setup iterates through it once in a standard way - * regardless of the key. */ - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, -}; - -#define MAXROUNDKEYS 15 - -/* ---------------------------------------------------------------------- * Software implementation of AES. * * This implementation uses a bit-sliced representation. Instead of @@ -257,6 +14,16 @@ static const uint8_t key_setup_round_constants[] = { * ops you get 64 S-box lookups, not just one. */ +#include "ssh.h" +#include "aes.h" +#include "mpint_i.h" /* we reuse the BignumInt system */ + +static bool aes_sw_available(void) +{ + /* Software AES is always available */ + return true; +} + #define SLICE_PARALLELISM (BIGNUM_INT_BYTES / 2) #ifdef BITSLICED_DEBUG @@ -922,8 +689,8 @@ static void aes_sliced_key_setup( } if (rotate_and_round_constant) { - assert(rconpos < lenof(key_setup_round_constants)); - uint8_t rcon = key_setup_round_constants[rconpos++]; + assert(rconpos < lenof(aes_key_setup_round_constants)); + uint8_t rcon = aes_key_setup_round_constants[rconpos++]; for (size_t i = 0; i < 8; i++) slices[i] ^= 1 & (rcon >> i); } @@ -1255,13 +1022,13 @@ static inline void aes_sdctr_sw( } #define SW_ENC_DEC(len) \ - static void aes##len##_cbc_sw_encrypt( \ + static void aes##len##_sw_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_cbc_sw_encrypt(ciph, vblk, blklen); } \ - static void aes##len##_cbc_sw_decrypt( \ + static void aes##len##_sw_cbc_decrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_cbc_sw_decrypt(ciph, vblk, blklen); } \ - static void aes##len##_sdctr_sw( \ + static void aes##len##_sw_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_sdctr_sw(ciph, vblk, blklen); } @@ -1269,644 +1036,5 @@ SW_ENC_DEC(128) SW_ENC_DEC(192) SW_ENC_DEC(256) -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of AES using x86 AES-NI. 
- */ - -#if HW_AES == HW_AES_NI - -/* - * Set target architecture for Clang and GCC - */ -#if !defined(__clang__) && defined(__GNUC__) -# pragma GCC target("aes") -# pragma GCC target("sse4.1") -#endif - -#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) -# define FUNC_ISA __attribute__ ((target("sse4.1,aes"))) -#else -# define FUNC_ISA -#endif - -#include -#include - -#if defined(__clang__) || defined(__GNUC__) -#include -#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3]) -#else -#define GET_CPU_ID(out) __cpuid(out, 1) -#endif - -bool aes_hw_available(void) -{ - /* - * Determine if AES is available on this CPU, by checking that - * both AES itself and SSE4.1 are supported. - */ - unsigned int CPUInfo[4]; - GET_CPU_ID(CPUInfo); - return (CPUInfo[2] & (1 << 25)) && (CPUInfo[2] & (1 << 19)); -} - -/* - * Core AES-NI encrypt/decrypt functions, one per length and direction. - */ - -#define NI_CIPHER(len, dir, dirlong, repmacro) \ - static FUNC_ISA inline __m128i aes_ni_##len##_##dir( \ - __m128i v, const __m128i *keysched) \ - { \ - v = _mm_xor_si128(v, *keysched++); \ - repmacro(v = _mm_aes##dirlong##_si128(v, *keysched++);); \ - return _mm_aes##dirlong##last_si128(v, *keysched); \ - } - -NI_CIPHER(128, e, enc, REP9) -NI_CIPHER(128, d, dec, REP9) -NI_CIPHER(192, e, enc, REP11) -NI_CIPHER(192, d, dec, REP11) -NI_CIPHER(256, e, enc, REP13) -NI_CIPHER(256, d, dec, REP13) - -/* - * The main key expansion. - */ -static FUNC_ISA void aes_ni_key_expand( - const unsigned char *key, size_t key_words, - __m128i *keysched_e, __m128i *keysched_d) -{ - size_t rounds = key_words + 6; - size_t sched_words = (rounds + 1) * 4; - - /* - * Store the key schedule as 32-bit integers during expansion, so - * that it's easy to refer back to individual previous words. We - * collect them into the final __m128i form at the end. - */ - uint32_t sched[MAXROUNDKEYS * 4]; - - unsigned rconpos = 0; - - for (size_t i = 0; i < sched_words; i++) { - if (i < key_words) { - sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); - } else { - uint32_t temp = sched[i - 1]; - - bool rotate_and_round_constant = (i % key_words == 0); - bool only_sub = (key_words == 8 && i % 8 == 4); - - if (rotate_and_round_constant) { - __m128i v = _mm_setr_epi32(0,temp,0,0); - v = _mm_aeskeygenassist_si128(v, 0); - temp = _mm_extract_epi32(v, 1); - - assert(rconpos < lenof(key_setup_round_constants)); - temp ^= key_setup_round_constants[rconpos++]; - } else if (only_sub) { - __m128i v = _mm_setr_epi32(0,temp,0,0); - v = _mm_aeskeygenassist_si128(v, 0); - temp = _mm_extract_epi32(v, 0); - } - - sched[i] = sched[i - key_words] ^ temp; - } - } - - /* - * Combine the key schedule words into __m128i vectors and store - * them in the output context. - */ - for (size_t round = 0; round <= rounds; round++) - keysched_e[round] = _mm_setr_epi32( - sched[4*round ], sched[4*round+1], - sched[4*round+2], sched[4*round+3]); - - smemclr(sched, sizeof(sched)); - - /* - * Now prepare the modified keys for the inverse cipher. - */ - for (size_t eround = 0; eround <= rounds; eround++) { - size_t dround = rounds - eround; - __m128i rkey = keysched_e[eround]; - if (eround && dround) /* neither first nor last */ - rkey = _mm_aesimc_si128(rkey); - keysched_d[dround] = rkey; - } -} - -/* - * Auxiliary routine to increment the 128-bit counter used in SDCTR - * mode. 
- */ -static FUNC_ISA inline __m128i aes_ni_sdctr_increment(__m128i v) -{ - const __m128i ONE = _mm_setr_epi32(1,0,0,0); - const __m128i ZERO = _mm_setzero_si128(); - - /* Increment the low-order 64 bits of v */ - v = _mm_add_epi64(v, ONE); - /* Check if they've become zero */ - __m128i cmp = _mm_cmpeq_epi64(v, ZERO); - /* If so, the low half of cmp is all 1s. Pack that into the high - * half of addend with zero in the low half. */ - __m128i addend = _mm_unpacklo_epi64(ZERO, cmp); - /* And subtract that from v, which increments the high 64 bits iff - * the low 64 wrapped round. */ - v = _mm_sub_epi64(v, addend); - - return v; -} - -/* - * Auxiliary routine to reverse the byte order of a vector, so that - * the SDCTR IV can be made big-endian for feeding to the cipher. - */ -static FUNC_ISA inline __m128i aes_ni_sdctr_reverse(__m128i v) -{ - v = _mm_shuffle_epi8( - v, _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); - return v; -} - -/* - * The SSH interface and the cipher modes. - */ - -typedef struct aes_ni_context aes_ni_context; -struct aes_ni_context { - __m128i keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; - - void *pointer_to_free; - ssh_cipher ciph; -}; - -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg) -{ - if (!aes_hw_available_cached()) - return NULL; - - /* - * The __m128i variables in the context structure need to be - * 16-byte aligned, but not all malloc implementations that this - * code has to work with will guarantee to return a 16-byte - * aligned pointer. So we over-allocate, manually realign the - * pointer ourselves, and store the original one inside the - * context so we know how to free it later. - */ - void *allocation = smalloc(sizeof(aes_ni_context) + 15); - uintptr_t alloc_address = (uintptr_t)allocation; - uintptr_t aligned_address = (alloc_address + 15) & ~15; - aes_ni_context *ctx = (aes_ni_context *)aligned_address; - - ctx->ciph.vt = alg; - ctx->pointer_to_free = allocation; - return &ctx->ciph; -} - -static void aes_hw_free(ssh_cipher *ciph) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - void *allocation = ctx->pointer_to_free; - smemclr(ctx, sizeof(*ctx)); - sfree(allocation); -} - -static void aes_hw_setkey(ssh_cipher *ciph, const void *vkey) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - const unsigned char *key = (const unsigned char *)vkey; - - aes_ni_key_expand(key, ctx->ciph.vt->real_keybits / 32, - ctx->keysched_e, ctx->keysched_d); -} - -static FUNC_ISA void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - ctx->iv = _mm_loadu_si128(iv); -} - -static FUNC_ISA void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - __m128i counter = _mm_loadu_si128(iv); - ctx->iv = aes_ni_sdctr_reverse(counter); -} - -typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched); - -static FUNC_ISA inline void aes_cbc_ni_encrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - __m128i plaintext = _mm_loadu_si128((const __m128i *)blk); - __m128i cipher_input = _mm_xor_si128(plaintext, ctx->iv); - __m128i ciphertext = encrypt(cipher_input, ctx->keysched_e); - _mm_storeu_si128((__m128i *)blk, ciphertext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void 
aes_cbc_ni_decrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn decrypt) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - __m128i ciphertext = _mm_loadu_si128((const __m128i *)blk); - __m128i decrypted = decrypt(ciphertext, ctx->keysched_d); - __m128i plaintext = _mm_xor_si128(decrypted, ctx->iv); - _mm_storeu_si128((__m128i *)blk, plaintext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void aes_sdctr_ni( - ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) -{ - aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - __m128i counter = aes_ni_sdctr_reverse(ctx->iv); - __m128i keystream = encrypt(counter, ctx->keysched_e); - __m128i input = _mm_loadu_si128((const __m128i *)blk); - __m128i output = _mm_xor_si128(input, keystream); - _mm_storeu_si128((__m128i *)blk, output); - ctx->iv = aes_ni_sdctr_increment(ctx->iv); - } -} - -#define NI_ENC_DEC(len) \ - static FUNC_ISA void aes##len##_cbc_hw_encrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_ni_encrypt(ciph, vblk, blklen, aes_ni_##len##_e); } \ - static FUNC_ISA void aes##len##_cbc_hw_decrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_ni_decrypt(ciph, vblk, blklen, aes_ni_##len##_d); } \ - static FUNC_ISA void aes##len##_sdctr_hw( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ - -NI_ENC_DEC(128) -NI_ENC_DEC(192) -NI_ENC_DEC(256) - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of AES using Arm NEON. - */ - -#elif HW_AES == HW_AES_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the AES intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). - */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,crypto"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool aes_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform AES detection function, - * because it has to be implemented by asking the operating system - * rather than directly querying the CPU. - * - * That's because Arm systems commonly have multiple cores that - * are not all alike, so any method of querying whether NEON - * crypto instructions work on the _current_ CPU - even one as - * crude as just trying one and catching the SIGILL - wouldn't - * give an answer that you could still rely on the first time the - * OS migrated your process to another CPU. - */ - return platform_aes_hw_available(); -} - -/* - * Core NEON encrypt/decrypt functions, one per length and direction. 
- */ - -#define NEON_CIPHER(len, repmacro) \ - static FUNC_ISA inline uint8x16_t aes_neon_##len##_e( \ - uint8x16_t v, const uint8x16_t *keysched) \ - { \ - repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \ - v = vaeseq_u8(v, *keysched++); \ - return veorq_u8(v, *keysched); \ - } \ - static FUNC_ISA inline uint8x16_t aes_neon_##len##_d( \ - uint8x16_t v, const uint8x16_t *keysched) \ - { \ - repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \ - v = vaesdq_u8(v, *keysched++); \ - return veorq_u8(v, *keysched); \ - } - -NEON_CIPHER(128, REP9) -NEON_CIPHER(192, REP11) -NEON_CIPHER(256, REP13) - -/* - * The main key expansion. - */ -static FUNC_ISA void aes_neon_key_expand( - const unsigned char *key, size_t key_words, - uint8x16_t *keysched_e, uint8x16_t *keysched_d) -{ - size_t rounds = key_words + 6; - size_t sched_words = (rounds + 1) * 4; - - /* - * Store the key schedule as 32-bit integers during expansion, so - * that it's easy to refer back to individual previous words. We - * collect them into the final uint8x16_t form at the end. - */ - uint32_t sched[MAXROUNDKEYS * 4]; - - unsigned rconpos = 0; - - for (size_t i = 0; i < sched_words; i++) { - if (i < key_words) { - sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i); - } else { - uint32_t temp = sched[i - 1]; - - bool rotate_and_round_constant = (i % key_words == 0); - bool sub = rotate_and_round_constant || - (key_words == 8 && i % 8 == 4); - - if (rotate_and_round_constant) - temp = (temp << 24) | (temp >> 8); - - if (sub) { - uint32x4_t v32 = vdupq_n_u32(temp); - uint8x16_t v8 = vreinterpretq_u8_u32(v32); - v8 = vaeseq_u8(v8, vdupq_n_u8(0)); - v32 = vreinterpretq_u32_u8(v8); - temp = vget_lane_u32(vget_low_u32(v32), 0); - } - - if (rotate_and_round_constant) { - assert(rconpos < lenof(key_setup_round_constants)); - temp ^= key_setup_round_constants[rconpos++]; - } - - sched[i] = sched[i - key_words] ^ temp; - } - } - - /* - * Combine the key schedule words into uint8x16_t vectors and - * store them in the output context. - */ - for (size_t round = 0; round <= rounds; round++) - keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round)); - - smemclr(sched, sizeof(sched)); - - /* - * Now prepare the modified keys for the inverse cipher. - */ - for (size_t eround = 0; eround <= rounds; eround++) { - size_t dround = rounds - eround; - uint8x16_t rkey = keysched_e[eround]; - if (eround && dround) /* neither first nor last */ - rkey = vaesimcq_u8(rkey); - keysched_d[dround] = rkey; - } -} - -/* - * Auxiliary routine to reverse the byte order of a vector, so that - * the SDCTR IV can be made big-endian for feeding to the cipher. - * - * In fact we don't need to reverse the vector _all_ the way; we leave - * the two lanes in MSW,LSW order, because that makes no difference to - * the efficiency of the increment. That way we only have to reverse - * bytes within each lane in this function. - */ -static FUNC_ISA inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v) -{ - return vrev64q_u8(v); -} - -/* - * Auxiliary routine to increment the 128-bit counter used in SDCTR - * mode. There's no instruction to treat a 128-bit vector as a single - * long integer, so instead we have to increment the bottom half - * unconditionally, and the top half if the bottom half started off as - * all 1s (in which case there was about to be a carry). - */ -static FUNC_ISA inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in) -{ -#ifdef __aarch64__ - /* There will be a carry if the low 64 bits are all 1s. 
*/ - uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF); - uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1); - - /* Make a word whose bottom half is unconditionally all 1s, and - * the top half is 'carry', i.e. all 0s most of the time but all - * 1s if we need to increment the top half. Then that word is what - * we need to _subtract_ from the input counter. */ - uint64x2_t subtrahend = vcombine_u64(carry, all1); -#else - /* AArch32 doesn't have comparisons that operate on a 64-bit lane, - * so we start by comparing each 32-bit half of the low 64 bits - * _separately_ to all-1s. */ - uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF); - uint32x2_t carry = vceq_u32( - vget_high_u32(vreinterpretq_u32_u8(in)), all1); - - /* Swap the 32-bit words of the compare output, and AND with the - * unswapped version. Now carry is all 1s iff the bottom half of - * the input counter was all 1s, and all 0s otherwise. */ - carry = vand_u32(carry, vrev64_u32(carry)); - - /* Now make the vector to subtract in the same way as above. */ - uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1)); -#endif - - return vreinterpretq_u8_u64( - vsubq_u64(vreinterpretq_u64_u8(in), subtrahend)); -} - -/* - * The SSH interface and the cipher modes. - */ - -typedef struct aes_neon_context aes_neon_context; -struct aes_neon_context { - uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv; - - ssh_cipher ciph; -}; - -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg) -{ - if (!aes_hw_available_cached()) - return NULL; - - aes_neon_context *ctx = snew(aes_neon_context); - ctx->ciph.vt = alg; - return &ctx->ciph; -} - -static void aes_hw_free(ssh_cipher *ciph) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - smemclr(ctx, sizeof(*ctx)); - sfree(ctx); -} - -static void aes_hw_setkey(ssh_cipher *ciph, const void *vkey) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - const unsigned char *key = (const unsigned char *)vkey; - - aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32, - ctx->keysched_e, ctx->keysched_d); -} - -static FUNC_ISA void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - ctx->iv = vld1q_u8(iv); -} - -static FUNC_ISA void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - uint8x16_t counter = vld1q_u8(iv); - ctx->iv = aes_neon_sdctr_reverse(counter); -} - -typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched); - -static FUNC_ISA inline void aes_cbc_neon_encrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - uint8x16_t plaintext = vld1q_u8(blk); - uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv); - uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e); - vst1q_u8(blk, ciphertext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void aes_cbc_neon_decrypt( - ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - uint8x16_t ciphertext = vld1q_u8(blk); - uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d); - uint8x16_t plaintext = veorq_u8(decrypted, 
ctx->iv); - vst1q_u8(blk, plaintext); - ctx->iv = ciphertext; - } -} - -static FUNC_ISA inline void aes_sdctr_neon( - ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) -{ - aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); - - for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; - blk < finish; blk += 16) { - uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv); - uint8x16_t keystream = encrypt(counter, ctx->keysched_e); - uint8x16_t input = vld1q_u8(blk); - uint8x16_t output = veorq_u8(input, keystream); - vst1q_u8(blk, output); - ctx->iv = aes_neon_sdctr_increment(ctx->iv); - } -} - -#define NEON_ENC_DEC(len) \ - static FUNC_ISA void aes##len##_cbc_hw_encrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \ - static FUNC_ISA void aes##len##_cbc_hw_decrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \ - static FUNC_ISA void aes##len##_sdctr_hw( \ - ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ - -NEON_ENC_DEC(128) -NEON_ENC_DEC(192) -NEON_ENC_DEC(256) - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated AES. In this - * case, aes_hw_new returns NULL (though it should also never be - * selected by aes_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_AES == HW_AES_NONE - -bool aes_hw_available(void) -{ - return false; -} - -static ssh_cipher *aes_hw_new(const ssh_cipheralg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void aes_hw_free(ssh_cipher *ciph) STUB_BODY -static void aes_hw_setkey(ssh_cipher *ciph, const void *key) STUB_BODY -static void aes_hw_setiv_cbc(ssh_cipher *ciph, const void *iv) STUB_BODY -static void aes_hw_setiv_sdctr(ssh_cipher *ciph, const void *iv) STUB_BODY -#define STUB_ENC_DEC(len) \ - static void aes##len##_cbc_hw_encrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY \ - static void aes##len##_cbc_hw_decrypt( \ - ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY \ - static void aes##len##_sdctr_hw( \ - ssh_cipher *ciph, void *vblk, int blklen) STUB_BODY - -STUB_ENC_DEC(128) -STUB_ENC_DEC(192) -STUB_ENC_DEC(256) - -#endif /* HW_AES */ +AES_EXTRA(_sw); +AES_ALL_VTABLES(_sw, "unaccelerated"); diff --git a/crypto/aes.h b/crypto/aes.h new file mode 100644 index 00000000..1960713a --- /dev/null +++ b/crypto/aes.h @@ -0,0 +1,109 @@ +/* + * Definitions likely to be helpful to multiple AES implementations. + */ + +/* + * The 'extra' structure used by AES implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct aes_extra_mutable; +struct aes_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. 
*/ + struct aes_extra_mutable *mut; +}; +struct aes_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct aes_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macros to define vtables for AES variants. There are a lot of + * these, because of the cross product between cipher modes, key + * sizes, and assorted HW/SW implementations, so it's worth spending + * some effort here to reduce the boilerplate in the sub-files. + */ + +#define AES_EXTRA(impl_c) \ + static struct aes_extra_mutable aes ## impl_c ## _extra_mut; \ + static const struct aes_extra aes ## impl_c ## _extra = { \ + .check_available = aes ## impl_c ## _available, \ + .mut = &aes ## impl_c ## _extra_mut, \ + } + +#define AES_CBC_VTABLE(impl_c, impl_display, bits) \ + const ssh_cipheralg ssh_aes ## bits ## _cbc ## impl_c = { \ + .new = aes ## impl_c ## _new, \ + .free = aes ## impl_c ## _free, \ + .setiv = aes ## impl_c ## _setiv_cbc, \ + .setkey = aes ## impl_c ## _setkey, \ + .encrypt = aes ## bits ## impl_c ## _cbc_encrypt, \ + .decrypt = aes ## bits ## impl_c ## _cbc_decrypt, \ + .ssh2_id = "aes" #bits "-cbc", \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .flags = SSH_CIPHER_IS_CBC, \ + .text_name = "AES-" #bits " CBC (" impl_display ")", \ + .extra = &aes ## impl_c ## _extra, \ + } + +#define AES_SDCTR_VTABLE(impl_c, impl_display, bits) \ + const ssh_cipheralg ssh_aes ## bits ## _sdctr ## impl_c = { \ + .new = aes ## impl_c ## _new, \ + .free = aes ## impl_c ## _free, \ + .setiv = aes ## impl_c ## _setiv_sdctr, \ + .setkey = aes ## impl_c ## _setkey, \ + .encrypt = aes ## bits ## impl_c ## _sdctr, \ + .decrypt = aes ## bits ## impl_c ## _sdctr, \ + .ssh2_id = "aes" #bits "-ctr", \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .flags = 0, \ + .text_name = "AES-" #bits " SDCTR (" impl_display ")", \ + .extra = &aes ## impl_c ## _extra, \ + } + +#define AES_ALL_VTABLES(impl_c, impl_display) \ + AES_CBC_VTABLE(impl_c, impl_display, 128); \ + AES_CBC_VTABLE(impl_c, impl_display, 192); \ + AES_CBC_VTABLE(impl_c, impl_display, 256); \ + AES_SDCTR_VTABLE(impl_c, impl_display, 128); \ + AES_SDCTR_VTABLE(impl_c, impl_display, 192); \ + AES_SDCTR_VTABLE(impl_c, impl_display, 256) + +/* + * Macros to repeat a piece of code particular numbers of times that + * correspond to 1 fewer than the number of AES rounds. (Because the + * last round is different.) + */ +#define REP2(x) x x +#define REP4(x) REP2(REP2(x)) +#define REP8(x) REP2(REP4(x)) +#define REP9(x) REP8(x) x +#define REP11(x) REP8(x) REP2(x) x +#define REP13(x) REP8(x) REP4(x) x + +/* + * The round constants used in key schedule expansion. + */ +extern const uint8_t aes_key_setup_round_constants[10]; + +/* + * The largest number of round keys ever needed. + */ +#define MAXROUNDKEYS 15 diff --git a/crypto/sha1-common.c b/crypto/sha1-common.c new file mode 100644 index 00000000..bf1db67a --- /dev/null +++ b/crypto/sha1-common.c @@ -0,0 +1,10 @@ +/* + * Common variable definitions across all the SHA-1 implementations. 
+ */ + +#include "ssh.h" +#include "sha1.h" + +const uint32_t sha1_initial_state[5] = { + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0, +}; diff --git a/crypto/sha1-neon.c b/crypto/sha1-neon.c new file mode 100644 index 00000000..99045714 --- /dev/null +++ b/crypto/sha1-neon.c @@ -0,0 +1,190 @@ +/* + * Hardware-accelerated implementation of SHA-1 using Arm NEON. + */ + +#include "ssh.h" +#include "sha1.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool sha1_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform detection function (see + * explanation in aes-neon.c). + */ + return platform_sha1_neon_available(); +} + +typedef struct sha1_neon_core sha1_neon_core; +struct sha1_neon_core { + uint32x4_t abcd; + uint32_t e; +}; + +static inline uint32x4_t sha1_neon_load_input(const uint8_t *p) +{ + return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); +} + +static inline uint32x4_t sha1_neon_schedule_update( + uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) +{ + return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1); +} + +/* + * SHA-1 has three different kinds of round, differing in whether they + * use the Ch, Maj or Par functions defined above. Each one uses a + * separate NEON instruction, so we define three inline functions for + * the different round types using this macro. + * + * The two batches of Par-type rounds also use a different constant, + * but that's passed in as an operand, so we don't need a fourth + * inline function just for that. + */ +#define SHA1_NEON_ROUND_FN(type) \ + static inline sha1_neon_core sha1_neon_round4_##type( \ + sha1_neon_core old, uint32x4_t sched, uint32x4_t constant) \ + { \ + sha1_neon_core new; \ + uint32x4_t round_input = vaddq_u32(sched, constant); \ + new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input); \ + new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0)); \ + return new; \ + } +SHA1_NEON_ROUND_FN(c) +SHA1_NEON_ROUND_FN(p) +SHA1_NEON_ROUND_FN(m) + +static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p) +{ + uint32x4_t constant, s0, s1, s2, s3; + sha1_neon_core cr = *core; + + constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT); + s0 = sha1_neon_load_input(p); + cr = sha1_neon_round4_c(cr, s0, constant); + s1 = sha1_neon_load_input(p + 16); + cr = sha1_neon_round4_c(cr, s1, constant); + s2 = sha1_neon_load_input(p + 32); + cr = sha1_neon_round4_c(cr, s2, constant); + s3 = sha1_neon_load_input(p + 48); + cr = sha1_neon_round4_c(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_c(cr, s0, constant); + + constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = sha1_neon_round4_p(cr, s1, constant); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_p(cr, s2, constant); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_p(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_p(cr, s0, constant); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = sha1_neon_round4_p(cr, s1, constant); + + constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_m(cr, s2, constant); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_m(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_m(cr, s0, constant); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = 
sha1_neon_round4_m(cr, s1, constant); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_m(cr, s2, constant); + + constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_p(cr, s3, constant); + s0 = sha1_neon_schedule_update(s0, s1, s2, s3); + cr = sha1_neon_round4_p(cr, s0, constant); + s1 = sha1_neon_schedule_update(s1, s2, s3, s0); + cr = sha1_neon_round4_p(cr, s1, constant); + s2 = sha1_neon_schedule_update(s2, s3, s0, s1); + cr = sha1_neon_round4_p(cr, s2, constant); + s3 = sha1_neon_schedule_update(s3, s0, s1, s2); + cr = sha1_neon_round4_p(cr, s3, constant); + + core->abcd = vaddq_u32(core->abcd, cr.abcd); + core->e += cr.e; +} + +typedef struct sha1_neon { + sha1_neon_core core; + sha1_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_neon; + +static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha1_neon_new(const ssh_hashalg *alg) +{ + const struct sha1_extra *extra = (const struct sha1_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha1_neon *s = snew(sha1_neon); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_neon_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha1_neon_reset(ssh_hash *hash) +{ + sha1_neon *s = container_of(hash, sha1_neon, hash); + + s->core.abcd = vld1q_u32(sha1_initial_state); + s->core.e = sha1_initial_state[4]; + + sha1_block_setup(&s->blk); +} + +static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha1_neon *copy = container_of(hcopy, sha1_neon, hash); + sha1_neon *orig = container_of(horig, sha1_neon, hash); + + *copy = *orig; /* structure copy */ + + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha1_neon_free(ssh_hash *hash) +{ + sha1_neon *s = container_of(hash, sha1_neon, hash); + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len) +{ + sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon); + + while (len > 0) + if (sha1_block_write(&s->blk, &vp, &len)) + sha1_neon_block(&s->core, s->blk.block); +} + +static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest) +{ + sha1_neon *s = container_of(hash, sha1_neon, hash); + + sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); + vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd))); + PUT_32BIT_MSB_FIRST(digest + 16, s->core.e); +} + +SHA1_VTABLE(neon, "NEON accelerated"); diff --git a/crypto/sha1-ni.c b/crypto/sha1-ni.c new file mode 100644 index 00000000..04e6386b --- /dev/null +++ b/crypto/sha1-ni.c @@ -0,0 +1,325 @@ +/* + * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI. 
+ */ + +#include "ssh.h" +#include "sha1.h" + +#include +#include +#include +#if HAVE_SHAINTRIN_H +#include +#endif + +#if defined(__clang__) || defined(__GNUC__) +#include +#define GET_CPU_ID_0(out) \ + __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3]) +#define GET_CPU_ID_7(out) \ + __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3]) +#else +#define GET_CPU_ID_0(out) __cpuid(out, 0) +#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) +#endif + +static bool sha1_ni_available(void) +{ + unsigned int CPUInfo[4]; + GET_CPU_ID_0(CPUInfo); + if (CPUInfo[0] < 7) + return false; + + GET_CPU_ID_7(CPUInfo); + return CPUInfo[1] & (1 << 29); /* Check SHA */ +} + +/* SHA1 implementation using new instructions + The code is based on Jeffrey Walton's SHA1 implementation: + https://github.com/noloader/SHA-Intrinsics +*/ +static inline void sha1_ni_block(__m128i *core, const uint8_t *p) +{ + __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3; + const __m128i MASK = _mm_set_epi64x( + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + + const __m128i *block = (const __m128i *)p; + + /* Load initial values */ + ABCD = core[0]; + E0 = core[1]; + + /* Rounds 0-3 */ + MSG0 = _mm_loadu_si128(block); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128(block + 1); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128(block + 2); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 16-19 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 20-23 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 24-27 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 28-31 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 32-35 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 36-39 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* 
Rounds 40-43 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 44-47 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 48-51 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 52-55 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 56-59 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 60-63 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 64-67 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 68-71 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 72-75 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + + /* Rounds 76-79 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + + /* Combine state */ + core[0] = _mm_add_epi32(ABCD, core[0]); + core[1] = _mm_sha1nexte_epu32(E0, core[1]); +} + +typedef struct sha1_ni { + /* + * core[0] stores the first four words of the SHA-1 state. core[1] + * stores just the fifth word, in the vector lane at the highest + * address. + */ + __m128i core[2]; + sha1_block blk; + void *pointer_to_free; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_ni; + +static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len); + +static sha1_ni *sha1_ni_alloc(void) +{ + /* + * The __m128i variables in the context structure need to be + * 16-byte aligned, but not all malloc implementations that this + * code has to work with will guarantee to return a 16-byte + * aligned pointer. So we over-allocate, manually realign the + * pointer ourselves, and store the original one inside the + * context so we know how to free it later. 
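The rounding arithmetic used immediately below is the usual add-then-mask idiom: adding 15 and clearing the low four bits advances any address to the next multiple of 16, and leaves it unchanged if it is already aligned. A standalone illustration of just that arithmetic, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 0x1000 stays put; 0x1001 and 0x100f both round up to 0x1010. */
        uintptr_t addrs[] = { 0x1000, 0x1001, 0x100f, 0x1010 };
        for (unsigned i = 0; i < 4; i++) {
            uintptr_t aligned = (addrs[i] + 15) & ~(uintptr_t)15;
            printf("%lx -> %lx\n", (unsigned long)addrs[i],
                   (unsigned long)aligned);
        }
        return 0;
    }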
+ */ + void *allocation = smalloc(sizeof(sha1_ni) + 15); + uintptr_t alloc_address = (uintptr_t)allocation; + uintptr_t aligned_address = (alloc_address + 15) & ~15; + sha1_ni *s = (sha1_ni *)aligned_address; + s->pointer_to_free = allocation; + return s; +} + +static ssh_hash *sha1_ni_new(const ssh_hashalg *alg) +{ + const struct sha1_extra *extra = (const struct sha1_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha1_ni *s = sha1_ni_alloc(); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_ni_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha1_ni_reset(ssh_hash *hash) +{ + sha1_ni *s = container_of(hash, sha1_ni, hash); + + /* Initialise the core vectors in their storage order */ + s->core[0] = _mm_set_epi64x( + 0x67452301efcdab89ULL, 0x98badcfe10325476ULL); + s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0); + + sha1_block_setup(&s->blk); +} + +static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha1_ni *copy = container_of(hcopy, sha1_ni, hash); + sha1_ni *orig = container_of(horig, sha1_ni, hash); + + void *ptf_save = copy->pointer_to_free; + *copy = *orig; /* structure copy */ + copy->pointer_to_free = ptf_save; + + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha1_ni_free(ssh_hash *hash) +{ + sha1_ni *s = container_of(hash, sha1_ni, hash); + + void *ptf = s->pointer_to_free; + smemclr(s, sizeof(*s)); + sfree(ptf); +} + +static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len) +{ + sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni); + + while (len > 0) + if (sha1_block_write(&s->blk, &vp, &len)) + sha1_ni_block(s->core, s->blk.block); +} + +static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest) +{ + sha1_ni *s = container_of(hash, sha1_ni, hash); + + sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); + + /* Rearrange the first vector into its output order */ + __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B); + + /* Byte-swap it into the output endianness */ + const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); + abcd = _mm_shuffle_epi8(abcd, mask); + + /* And store it */ + _mm_storeu_si128((__m128i *)digest, abcd); + + /* Finally, store the leftover word */ + uint32_t e = _mm_extract_epi32(s->core[1], 3); + PUT_32BIT_MSB_FIRST(digest + 16, e); +} + +SHA1_VTABLE(ni, "SHA-NI accelerated"); diff --git a/crypto/sha1-select.c b/crypto/sha1-select.c new file mode 100644 index 00000000..1e8a6ce9 --- /dev/null +++ b/crypto/sha1-select.c @@ -0,0 +1,44 @@ +/* + * Top-level vtables to select a SHA-1 implementation. + */ + +#include +#include + +#include "putty.h" +#include "ssh.h" +#include "sha1.h" + +static ssh_hash *sha1_select(const ssh_hashalg *alg) +{ + static const ssh_hashalg *const real_algs[] = { +#if HAVE_SHA_NI + &ssh_sha1_ni, +#endif +#if HAVE_NEON_CRYPTO + &ssh_sha1_neon, +#endif + &ssh_sha1_sw, + NULL, + }; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh_hashalg *alg = real_algs[i]; + const struct sha1_extra *alg_extra = + (const struct sha1_extra *)alg->extra; + if (check_availability(alg_extra)) + return ssh_hash_new(alg); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only SHA-1, which + * is always available. 
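Callers never name the accelerated vtables directly; they go through the top-level ssh_sha1 object, whose new() method is this selector. An illustrative usage sketch, not part of the patch, assuming PuTTY's usual ssh_hash_new / put_data / ssh_hash_final helpers:

    static void sha1_digest_example(const void *data, size_t len,
                                    unsigned char digest[20])
    {
        ssh_hash *h = ssh_hash_new(&ssh_sha1); /* sha1_select picks NI/NEON/sw */
        put_data(h, data, len);
        ssh_hash_final(h, digest);             /* emits 20 bytes and frees h */
    }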
*/ + unreachable("sha1_select ran off the end of its list"); +} + +const ssh_hashalg ssh_sha1 = { + .new = sha1_select, + .hlen = 20, + .blocklen = 64, + HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"), +}; diff --git a/crypto/sha1-sw.c b/crypto/sha1-sw.c new file mode 100644 index 00000000..905d97f3 --- /dev/null +++ b/crypto/sha1-sw.c @@ -0,0 +1,155 @@ +/* + * Software implementation of SHA-1. + */ + +#include "ssh.h" +#include "sha1.h" + +static bool sha1_sw_available(void) +{ + /* Software SHA-1 is always available */ + return true; +} + +static inline uint32_t rol(uint32_t x, unsigned y) +{ + return (x << (31 & y)) | (x >> (31 & -y)); +} + +static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z) +{ + return (x ^ y ^ z); +} + +static inline void sha1_sw_round( + unsigned round_index, const uint32_t *schedule, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e, + uint32_t f, uint32_t constant) +{ + *e = rol(*a, 5) + f + *e + schedule[round_index] + constant; + *b = rol(*b, 30); +} + +static void sha1_sw_block(uint32_t *core, const uint8_t *block) +{ + uint32_t w[SHA1_ROUNDS]; + uint32_t a,b,c,d,e; + + for (size_t t = 0; t < 16; t++) + w[t] = GET_32BIT_MSB_FIRST(block + 4*t); + + for (size_t t = 16; t < SHA1_ROUNDS; t++) + w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1); + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; + + size_t t = 0; + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e; + + smemclr(w, sizeof(w)); +} + +typedef struct sha1_sw { + uint32_t core[5]; + sha1_block blk; + 
BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_sw; + +static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha1_sw_new(const ssh_hashalg *alg) +{ + sha1_sw *s = snew(sha1_sw); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha1_sw_reset(ssh_hash *hash) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + + memcpy(s->core, sha1_initial_state, sizeof(s->core)); + sha1_block_setup(&s->blk); +} + +static void sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha1_sw *copy = container_of(hcopy, sha1_sw, hash); + sha1_sw *orig = container_of(horig, sha1_sw, hash); + + memcpy(copy, orig, sizeof(*copy)); + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha1_sw_free(ssh_hash *hash) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len) +{ + sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw); + + while (len > 0) + if (sha1_block_write(&s->blk, &vp, &len)) + sha1_sw_block(s->core, s->blk.block); +} + +static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + + sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); + for (size_t i = 0; i < 5; i++) + PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); +} + +SHA1_VTABLE(sw, "unaccelerated"); diff --git a/crypto/sha1.c b/crypto/sha1.c deleted file mode 100644 index 536d474f..00000000 --- a/crypto/sha1.c +++ /dev/null @@ -1,933 +0,0 @@ -/* - * SHA-1 algorithm as described at - * - * http://csrc.nist.gov/cryptval/shs.html - */ - -#include "ssh.h" -#include - -/* - * Start by deciding whether we can support hardware SHA at all. - */ -#define HW_SHA1_NONE 0 -#define HW_SHA1_NI 1 -#define HW_SHA1_NEON 2 - -#ifdef _FORCE_SHA_NI -# define HW_SHA1 HW_SHA1_NI -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA1 HW_SHA1_NI -# endif -#elif defined(__GNUC__) -# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA1 HW_SHA1_NI -# endif -#elif defined (_MSC_VER) -# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729 -# define HW_SHA1 HW_SHA1_NI -# endif -#endif - -#ifdef _FORCE_SHA_NEON -# define HW_SHA1 HW_SHA1_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_CRYPTO - /* If the Arm crypto extension is available already, we can - * support NEON SHA without having to enable anything by hand */ -# define HW_SHA1 HW_SHA1_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_SHA1 HW_SHA1_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#elif defined _MSC_VER - /* Visual Studio supports the crypto extension when targeting - * AArch64, but as of VS2017, the AArch32 header doesn't quite - * manage it (declaring the shae/shad intrinsics without a round - * key operand). 
*/ -# if defined _M_ARM64 -# define HW_SHA1 HW_SHA1_NEON -# if defined _M_ARM64 -# define USE_ARM64_NEON_H /* unusual header name in this case */ -# endif -# endif -#endif - -#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1 -# undef HW_SHA1 -# define HW_SHA1 HW_SHA1_NONE -#endif - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool sha1_hw_available(void); - -/* - * The top-level selection function, caching the results of - * sha1_hw_available() so it only has to run once. - */ -static bool sha1_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = sha1_hw_available(); - initialised = true; - } - return hw_available; -} - -static ssh_hash *sha1_select(const ssh_hashalg *alg) -{ - const ssh_hashalg *real_alg = - sha1_hw_available_cached() ? &ssh_sha1_hw : &ssh_sha1_sw; - - return ssh_hash_new(real_alg); -} - -const ssh_hashalg ssh_sha1 = { - .new = sha1_select, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"), -}; - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -static const uint32_t sha1_initial_state[] = { - 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0, -}; - -#define SHA1_ROUNDS_PER_STAGE 20 -#define SHA1_STAGE0_CONSTANT 0x5a827999 -#define SHA1_STAGE1_CONSTANT 0x6ed9eba1 -#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc -#define SHA1_STAGE3_CONSTANT 0xca62c1d6 -#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE) - -typedef struct sha1_block sha1_block; -struct sha1_block { - uint8_t block[64]; - size_t used; - uint64_t len; -}; - -static inline void sha1_block_setup(sha1_block *blk) -{ - blk->used = 0; - blk->len = 0; -} - -static inline bool sha1_block_write( - sha1_block *blk, const void **vdata, size_t *len) -{ - size_t blkleft = sizeof(blk->block) - blk->used; - size_t chunk = *len < blkleft ? *len : blkleft; - - const uint8_t *p = *vdata; - memcpy(blk->block + blk->used, p, chunk); - *vdata = p + chunk; - *len -= chunk; - blk->used += chunk; - blk->len += chunk; - - if (blk->used == sizeof(blk->block)) { - blk->used = 0; - return true; - } - - return false; -} - -static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs) -{ - uint64_t final_len = blk->len << 3; - size_t pad = 1 + (63 & (55 - blk->used)); - - put_byte(bs, 0x80); - for (size_t i = 1; i < pad; i++) - put_byte(bs, 0); - put_uint64(bs, final_len); - - assert(blk->used == 0 && "Should have exactly hit a block boundary"); -} - -/* ---------------------------------------------------------------------- - * Software implementation of SHA-1. 
- */ - -static inline uint32_t rol(uint32_t x, unsigned y) -{ - return (x << (31 & y)) | (x >> (31 & -y)); -} - -static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) -{ - return if0 ^ (ctrl & (if1 ^ if0)); -} - -static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & y) | (z & (x | y)); -} - -static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z) -{ - return (x ^ y ^ z); -} - -static inline void sha1_sw_round( - unsigned round_index, const uint32_t *schedule, - uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e, - uint32_t f, uint32_t constant) -{ - *e = rol(*a, 5) + f + *e + schedule[round_index] + constant; - *b = rol(*b, 30); -} - -static void sha1_sw_block(uint32_t *core, const uint8_t *block) -{ - uint32_t w[SHA1_ROUNDS]; - uint32_t a,b,c,d,e; - - for (size_t t = 0; t < 16; t++) - w[t] = GET_32BIT_MSB_FIRST(block + 4*t); - - for (size_t t = 16; t < SHA1_ROUNDS; t++) - w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1); - - a = core[0]; b = core[1]; c = core[2]; d = core[3]; - e = core[4]; - - size_t t = 0; - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT); - } - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT); - } - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT); - } - for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { - sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT); - sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT); - } - - core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e; - - smemclr(w, sizeof(w)); -} - -typedef struct sha1_sw { - uint32_t core[5]; - sha1_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha1_sw; - -static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha1_sw_new(const ssh_hashalg *alg) -{ - sha1_sw *s = snew(sha1_sw); - - s->hash.vt = alg; - BinarySink_INIT(s, sha1_sw_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha1_sw_reset(ssh_hash *hash) -{ - sha1_sw *s = container_of(hash, sha1_sw, hash); - - memcpy(s->core, sha1_initial_state, sizeof(s->core)); - sha1_block_setup(&s->blk); -} - -static void 
sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha1_sw *copy = container_of(hcopy, sha1_sw, hash); - sha1_sw *orig = container_of(horig, sha1_sw, hash); - - memcpy(copy, orig, sizeof(*copy)); - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha1_sw_free(ssh_hash *hash) -{ - sha1_sw *s = container_of(hash, sha1_sw, hash); - - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len) -{ - sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw); - - while (len > 0) - if (sha1_block_write(&s->blk, &vp, &len)) - sha1_sw_block(s->core, s->blk.block); -} - -static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest) -{ - sha1_sw *s = container_of(hash, sha1_sw, hash); - - sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); - for (size_t i = 0; i < 5; i++) - PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); -} - -const ssh_hashalg ssh_sha1_sw = { - .new = sha1_sw_new, - .reset = sha1_sw_reset, - .copyfrom = sha1_sw_copyfrom, - .digest = sha1_sw_digest, - .free = sha1_sw_free, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI. - */ - -#if HW_SHA1 == HW_SHA1_NI - -/* - * Set target architecture for Clang and GCC - */ - -#if defined(__clang__) || defined(__GNUC__) -# define FUNC_ISA __attribute__ ((target("sse4.1,sha"))) -#if !defined(__clang__) -# pragma GCC target("sha") -# pragma GCC target("sse4.1") -#endif -#else -# define FUNC_ISA -#endif - -#include -#include -#include -#if defined(__clang__) || defined(__GNUC__) -#include -#endif - -#if defined(__clang__) || defined(__GNUC__) -#include -#define GET_CPU_ID_0(out) \ - __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3]) -#define GET_CPU_ID_7(out) \ - __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3]) -#else -#define GET_CPU_ID_0(out) __cpuid(out, 0) -#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) -#endif - -static bool sha1_hw_available(void) -{ - unsigned int CPUInfo[4]; - GET_CPU_ID_0(CPUInfo); - if (CPUInfo[0] < 7) - return false; - - GET_CPU_ID_7(CPUInfo); - return CPUInfo[1] & (1 << 29); /* Check SHA */ -} - -/* SHA1 implementation using new instructions - The code is based on Jeffrey Walton's SHA1 implementation: - https://github.com/noloader/SHA-Intrinsics -*/ -FUNC_ISA -static inline void sha1_ni_block(__m128i *core, const uint8_t *p) -{ - __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3; - const __m128i MASK = _mm_set_epi64x( - 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); - - const __m128i *block = (const __m128i *)p; - - /* Load initial values */ - ABCD = core[0]; - E0 = core[1]; - - /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128(block); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128(block + 1); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128(block + 2); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128(block + 3); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = 
_mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 16-19 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 20-23 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 24-27 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 28-31 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 32-35 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 36-39 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 40-43 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 44-47 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 48-51 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 52-55 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 56-59 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 60-63 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 64-67 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 68-71 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 72-75 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = 
_mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - - /* Rounds 76-79 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - - /* Combine state */ - core[0] = _mm_add_epi32(ABCD, core[0]); - core[1] = _mm_sha1nexte_epu32(E0, core[1]); -} - -typedef struct sha1_ni { - /* - * core[0] stores the first four words of the SHA-1 state. core[1] - * stores just the fifth word, in the vector lane at the highest - * address. - */ - __m128i core[2]; - sha1_block blk; - void *pointer_to_free; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha1_ni; - -static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len); - -static sha1_ni *sha1_ni_alloc(void) -{ - /* - * The __m128i variables in the context structure need to be - * 16-byte aligned, but not all malloc implementations that this - * code has to work with will guarantee to return a 16-byte - * aligned pointer. So we over-allocate, manually realign the - * pointer ourselves, and store the original one inside the - * context so we know how to free it later. - */ - void *allocation = smalloc(sizeof(sha1_ni) + 15); - uintptr_t alloc_address = (uintptr_t)allocation; - uintptr_t aligned_address = (alloc_address + 15) & ~15; - sha1_ni *s = (sha1_ni *)aligned_address; - s->pointer_to_free = allocation; - return s; -} - -static ssh_hash *sha1_ni_new(const ssh_hashalg *alg) -{ - if (!sha1_hw_available_cached()) - return NULL; - - sha1_ni *s = sha1_ni_alloc(); - - s->hash.vt = alg; - BinarySink_INIT(s, sha1_ni_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -FUNC_ISA static void sha1_ni_reset(ssh_hash *hash) -{ - sha1_ni *s = container_of(hash, sha1_ni, hash); - - /* Initialise the core vectors in their storage order */ - s->core[0] = _mm_set_epi64x( - 0x67452301efcdab89ULL, 0x98badcfe10325476ULL); - s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0); - - sha1_block_setup(&s->blk); -} - -static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha1_ni *copy = container_of(hcopy, sha1_ni, hash); - sha1_ni *orig = container_of(horig, sha1_ni, hash); - - void *ptf_save = copy->pointer_to_free; - *copy = *orig; /* structure copy */ - copy->pointer_to_free = ptf_save; - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha1_ni_free(ssh_hash *hash) -{ - sha1_ni *s = container_of(hash, sha1_ni, hash); - - void *ptf = s->pointer_to_free; - smemclr(s, sizeof(*s)); - sfree(ptf); -} - -static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len) -{ - sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni); - - while (len > 0) - if (sha1_block_write(&s->blk, &vp, &len)) - sha1_ni_block(s->core, s->blk.block); -} - -FUNC_ISA static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest) -{ - sha1_ni *s = container_of(hash, sha1_ni, hash); - - sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); - - /* Rearrange the first vector into its output order */ - __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B); - - /* Byte-swap it into the output endianness */ - const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); - abcd = _mm_shuffle_epi8(abcd, mask); - - /* And store it */ - _mm_storeu_si128((__m128i *)digest, abcd); - - /* Finally, store the leftover word */ - uint32_t e = _mm_extract_epi32(s->core[1], 3); - PUT_32BIT_MSB_FIRST(digest + 16, e); -} - -const ssh_hashalg ssh_sha1_hw = { - .new = sha1_ni_new, - .reset = sha1_ni_reset, - .copyfrom = sha1_ni_copyfrom, - .digest = sha1_ni_digest, - .free = sha1_ni_free, - 
.hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-1 using Arm NEON. - */ - -#elif HW_SHA1 == HW_SHA1_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the SHA intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). - */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,crypto"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool sha1_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform detection function (see - * explanation in sshaes.c). - */ - return platform_sha1_hw_available(); -} - -typedef struct sha1_neon_core sha1_neon_core; -struct sha1_neon_core { - uint32x4_t abcd; - uint32_t e; -}; - -FUNC_ISA -static inline uint32x4_t sha1_neon_load_input(const uint8_t *p) -{ - return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); -} - -FUNC_ISA -static inline uint32x4_t sha1_neon_schedule_update( - uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) -{ - return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1); -} - -/* - * SHA-1 has three different kinds of round, differing in whether they - * use the Ch, Maj or Par functions defined above. Each one uses a - * separate NEON instruction, so we define three inline functions for - * the different round types using this macro. - * - * The two batches of Par-type rounds also use a different constant, - * but that's passed in as an operand, so we don't need a fourth - * inline function just for that. 
- */ -#define SHA1_NEON_ROUND_FN(type) \ - FUNC_ISA static inline sha1_neon_core sha1_neon_round4_##type( \ - sha1_neon_core old, uint32x4_t sched, uint32x4_t constant) \ - { \ - sha1_neon_core new; \ - uint32x4_t round_input = vaddq_u32(sched, constant); \ - new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input); \ - new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0)); \ - return new; \ - } -SHA1_NEON_ROUND_FN(c) -SHA1_NEON_ROUND_FN(p) -SHA1_NEON_ROUND_FN(m) - -FUNC_ISA -static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p) -{ - uint32x4_t constant, s0, s1, s2, s3; - sha1_neon_core cr = *core; - - constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT); - s0 = sha1_neon_load_input(p); - cr = sha1_neon_round4_c(cr, s0, constant); - s1 = sha1_neon_load_input(p + 16); - cr = sha1_neon_round4_c(cr, s1, constant); - s2 = sha1_neon_load_input(p + 32); - cr = sha1_neon_round4_c(cr, s2, constant); - s3 = sha1_neon_load_input(p + 48); - cr = sha1_neon_round4_c(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_c(cr, s0, constant); - - constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_p(cr, s1, constant); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_p(cr, s2, constant); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_p(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_p(cr, s0, constant); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_p(cr, s1, constant); - - constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_m(cr, s2, constant); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_m(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_m(cr, s0, constant); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_m(cr, s1, constant); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_m(cr, s2, constant); - - constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_p(cr, s3, constant); - s0 = sha1_neon_schedule_update(s0, s1, s2, s3); - cr = sha1_neon_round4_p(cr, s0, constant); - s1 = sha1_neon_schedule_update(s1, s2, s3, s0); - cr = sha1_neon_round4_p(cr, s1, constant); - s2 = sha1_neon_schedule_update(s2, s3, s0, s1); - cr = sha1_neon_round4_p(cr, s2, constant); - s3 = sha1_neon_schedule_update(s3, s0, s1, s2); - cr = sha1_neon_round4_p(cr, s3, constant); - - core->abcd = vaddq_u32(core->abcd, cr.abcd); - core->e += cr.e; -} - -typedef struct sha1_neon { - sha1_neon_core core; - sha1_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha1_neon; - -static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha1_neon_new(const ssh_hashalg *alg) -{ - if (!sha1_hw_available_cached()) - return NULL; - - sha1_neon *s = snew(sha1_neon); - - s->hash.vt = alg; - BinarySink_INIT(s, sha1_neon_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha1_neon_reset(ssh_hash *hash) -{ - sha1_neon *s = container_of(hash, sha1_neon, hash); - - s->core.abcd = vld1q_u32(sha1_initial_state); - s->core.e = sha1_initial_state[4]; - - sha1_block_setup(&s->blk); -} - -static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha1_neon *copy = 
container_of(hcopy, sha1_neon, hash); - sha1_neon *orig = container_of(horig, sha1_neon, hash); - - *copy = *orig; /* structure copy */ - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha1_neon_free(ssh_hash *hash) -{ - sha1_neon *s = container_of(hash, sha1_neon, hash); - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len) -{ - sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon); - - while (len > 0) - if (sha1_block_write(&s->blk, &vp, &len)) - sha1_neon_block(&s->core, s->blk.block); -} - -static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha1_neon *s = container_of(hash, sha1_neon, hash); - - sha1_block_pad(&s->blk, BinarySink_UPCAST(s)); - vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd))); - PUT_32BIT_MSB_FIRST(digest + 16, s->core.e); -} - -const ssh_hashalg ssh_sha1_hw = { - .new = sha1_neon_new, - .reset = sha1_neon_reset, - .copyfrom = sha1_neon_copyfrom, - .digest = sha1_neon_digest, - .free = sha1_neon_free, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated SHA-1. In this - * case, sha1_hw_new returns NULL (though it should also never be - * selected by sha1_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_SHA1 == HW_SHA1_NONE - -static bool sha1_hw_available(void) -{ - return false; -} - -static ssh_hash *sha1_stub_new(const ssh_hashalg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void sha1_stub_reset(ssh_hash *hash) STUB_BODY -static void sha1_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY -static void sha1_stub_free(ssh_hash *hash) STUB_BODY -static void sha1_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY - -const ssh_hashalg ssh_sha1_hw = { - .new = sha1_stub_new, - .reset = sha1_stub_reset, - .copyfrom = sha1_stub_copyfrom, - .digest = sha1_stub_digest, - .free = sha1_stub_free, - .hlen = 20, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-1", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -#endif /* HW_SHA1 */ diff --git a/crypto/sha1.h b/crypto/sha1.h new file mode 100644 index 00000000..2cdba0d4 --- /dev/null +++ b/crypto/sha1.h @@ -0,0 +1,109 @@ +/* + * Definitions likely to be helpful to multiple SHA-1 implementations. + */ + +/* + * The 'extra' structure used by SHA-1 implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct sha1_extra_mutable; +struct sha1_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. */ + struct sha1_extra_mutable *mut; +}; +struct sha1_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct sha1_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macro to define a SHA-1 vtable together with its 'extra' + * structure. 
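As a reading aid: with the definition below, an invocation such as SHA1_VTABLE(sw, "unaccelerated") (used at the end of sha1-sw.c) expands to roughly the following, paraphrased rather than literal preprocessor output:

    static struct sha1_extra_mutable sha1_sw_extra_mut;
    static const struct sha1_extra sha1_sw_extra = {
        .check_available = sha1_sw_available,
        .mut = &sha1_sw_extra_mut,
    };
    const ssh_hashalg ssh_sha1_sw = {
        .new = sha1_sw_new,
        .reset = sha1_sw_reset,
        .copyfrom = sha1_sw_copyfrom,
        .digest = sha1_sw_digest,
        .free = sha1_sw_free,
        .hlen = 20,
        .blocklen = 64,
        HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"),
        .extra = &sha1_sw_extra,
    };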
+ */ +#define SHA1_VTABLE(impl_c, impl_display) \ + static struct sha1_extra_mutable sha1_ ## impl_c ## _extra_mut; \ + static const struct sha1_extra sha1_ ## impl_c ## _extra = { \ + .check_available = sha1_ ## impl_c ## _available, \ + .mut = &sha1_ ## impl_c ## _extra_mut, \ + }; \ + const ssh_hashalg ssh_sha1_ ## impl_c = { \ + .new = sha1_ ## impl_c ## _new, \ + .reset = sha1_ ## impl_c ## _reset, \ + .copyfrom = sha1_ ## impl_c ## _copyfrom, \ + .digest = sha1_ ## impl_c ## _digest, \ + .free = sha1_ ## impl_c ## _free, \ + .hlen = 20, \ + .blocklen = 64, \ + HASHALG_NAMES_ANNOTATED("SHA-1", impl_display), \ + .extra = &sha1_ ## impl_c ## _extra, \ + } + +extern const uint32_t sha1_initial_state[5]; + +#define SHA1_ROUNDS_PER_STAGE 20 +#define SHA1_STAGE0_CONSTANT 0x5a827999 +#define SHA1_STAGE1_CONSTANT 0x6ed9eba1 +#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc +#define SHA1_STAGE3_CONSTANT 0xca62c1d6 +#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE) + +typedef struct sha1_block sha1_block; +struct sha1_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha1_block_setup(sha1_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha1_block_write( + sha1_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? *len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} diff --git a/crypto/sha256-common.c b/crypto/sha256-common.c new file mode 100644 index 00000000..52904c08 --- /dev/null +++ b/crypto/sha256-common.c @@ -0,0 +1,30 @@ +/* + * Common variable definitions across all the SHA-256 implementations. + */ + +#include "ssh.h" +#include "sha256.h" + +const uint32_t sha256_initial_state[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +}; + +const uint32_t sha256_round_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; diff --git a/crypto/sha256-neon.c b/crypto/sha256-neon.c new file mode 100644 index 00000000..87d24d0c --- /dev/null +++ b/crypto/sha256-neon.c @@ -0,0 +1,162 @@ +/* + * Hardware-accelerated implementation of SHA-256 using Arm NEON. 
+ */ + +#include "ssh.h" +#include "sha256.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool sha256_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform detection function (see + * explanation in aes-neon.c). + */ + return platform_sha256_neon_available(); +} + +typedef struct sha256_neon_core sha256_neon_core; +struct sha256_neon_core { + uint32x4_t abcd, efgh; +}; + +static inline uint32x4_t sha256_neon_load_input(const uint8_t *p) +{ + return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); +} + +static inline uint32x4_t sha256_neon_schedule_update( + uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) +{ + return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1); +} + +static inline sha256_neon_core sha256_neon_round4( + sha256_neon_core old, uint32x4_t sched, unsigned round) +{ + sha256_neon_core new; + + uint32x4_t round_input = vaddq_u32( + sched, vld1q_u32(sha256_round_constants + round)); + new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input); + new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input); + return new; +} + +static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p) +{ + uint32x4_t s0, s1, s2, s3; + sha256_neon_core cr = *core; + + s0 = sha256_neon_load_input(p); + cr = sha256_neon_round4(cr, s0, 0); + s1 = sha256_neon_load_input(p+16); + cr = sha256_neon_round4(cr, s1, 4); + s2 = sha256_neon_load_input(p+32); + cr = sha256_neon_round4(cr, s2, 8); + s3 = sha256_neon_load_input(p+48); + cr = sha256_neon_round4(cr, s3, 12); + s0 = sha256_neon_schedule_update(s0, s1, s2, s3); + cr = sha256_neon_round4(cr, s0, 16); + s1 = sha256_neon_schedule_update(s1, s2, s3, s0); + cr = sha256_neon_round4(cr, s1, 20); + s2 = sha256_neon_schedule_update(s2, s3, s0, s1); + cr = sha256_neon_round4(cr, s2, 24); + s3 = sha256_neon_schedule_update(s3, s0, s1, s2); + cr = sha256_neon_round4(cr, s3, 28); + s0 = sha256_neon_schedule_update(s0, s1, s2, s3); + cr = sha256_neon_round4(cr, s0, 32); + s1 = sha256_neon_schedule_update(s1, s2, s3, s0); + cr = sha256_neon_round4(cr, s1, 36); + s2 = sha256_neon_schedule_update(s2, s3, s0, s1); + cr = sha256_neon_round4(cr, s2, 40); + s3 = sha256_neon_schedule_update(s3, s0, s1, s2); + cr = sha256_neon_round4(cr, s3, 44); + s0 = sha256_neon_schedule_update(s0, s1, s2, s3); + cr = sha256_neon_round4(cr, s0, 48); + s1 = sha256_neon_schedule_update(s1, s2, s3, s0); + cr = sha256_neon_round4(cr, s1, 52); + s2 = sha256_neon_schedule_update(s2, s3, s0, s1); + cr = sha256_neon_round4(cr, s2, 56); + s3 = sha256_neon_schedule_update(s3, s0, s1, s2); + cr = sha256_neon_round4(cr, s3, 60); + + core->abcd = vaddq_u32(core->abcd, cr.abcd); + core->efgh = vaddq_u32(core->efgh, cr.efgh); +} + +typedef struct sha256_neon { + sha256_neon_core core; + sha256_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha256_neon; + +static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha256_neon_new(const ssh_hashalg *alg) +{ + const struct sha256_extra *extra = (const struct sha256_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha256_neon *s = snew(sha256_neon); + + s->hash.vt = alg; + BinarySink_INIT(s, sha256_neon_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha256_neon_reset(ssh_hash *hash) +{ + sha256_neon *s = container_of(hash, sha256_neon, hash); + + s->core.abcd = vld1q_u32(sha256_initial_state); + s->core.efgh = vld1q_u32(sha256_initial_state + 4); + + 
+    sha256_block_setup(&s->blk);
+}
+
+static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
+    sha256_neon *orig = container_of(horig, sha256_neon, hash);
+
+    *copy = *orig;                     /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+}
+
+static void sha256_neon_free(ssh_hash *hash)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_neon_block(&s->core, s->blk.block);
+}
+
+static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    vst1q_u8(digest,      vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
+    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
+}
+
+SHA256_VTABLE(neon, "NEON accelerated");
diff --git a/crypto/sha256-ni.c b/crypto/sha256-ni.c
new file mode 100644
index 00000000..530fa433
--- /dev/null
+++ b/crypto/sha256-ni.c
@@ -0,0 +1,342 @@
+/*
+ * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
+ */
+
+#include "ssh.h"
+#include "sha256.h"
+
+#include <wmmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+#if HAVE_SHAINTRIN_H
+#include <shaintrin.h>
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+#include <cpuid.h>
+#define GET_CPU_ID_0(out) \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out) \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha256_ni_available(void)
+{
+    unsigned int CPUInfo[4];
+    GET_CPU_ID_0(CPUInfo);
+    if (CPUInfo[0] < 7)
+        return false;
+
+    GET_CPU_ID_7(CPUInfo);
+    return CPUInfo[1] & (1 << 29); /* Check SHA */
+}
+
+/* SHA256 implementation using new instructions
+   The code is based on Jeffrey Walton's SHA256 implementation:
+   https://github.com/noloader/SHA-Intrinsics
+*/
+static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
+{
+    __m128i STATE0, STATE1;
+    __m128i MSG, TMP;
+    __m128i MSG0, MSG1, MSG2, MSG3;
+    const __m128i *block = (const __m128i *)p;
+    const __m128i MASK = _mm_set_epi64x(
+        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
+
+    /* Load initial values */
+    STATE0 = core[0];
+    STATE1 = core[1];
+
+    /* Rounds 0-3 */
+    MSG = _mm_loadu_si128(block);
+    MSG0 = _mm_shuffle_epi8(MSG, MASK);
+    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
+                            0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+    /* Rounds 4-7 */
+    MSG1 = _mm_loadu_si128(block + 1);
+    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
+                            0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
+
+    /* Rounds 8-11 */
+    MSG2 = _mm_loadu_si128(block + 2);
+    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
+                            0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 =
_mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = 
_mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Combine state */ + core[0] = _mm_add_epi32(STATE0, core[0]); + core[1] = _mm_add_epi32(STATE1, core[1]); +} + +typedef struct sha256_ni { + /* + * These two vectors store the 8 words of the SHA-256 state, but + * not in the same order they appear in the spec: the first word + * holds A,B,E,F and the second word C,D,G,H. + */ + __m128i core[2]; + sha256_block blk; + void *pointer_to_free; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha256_ni; + +static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len); + +static sha256_ni *sha256_ni_alloc(void) +{ + /* + * The __m128i variables in the context structure need to be + * 16-byte aligned, but not all malloc implementations that this + * code has to work with will guarantee to return a 16-byte + * aligned pointer. So we over-allocate, manually realign the + * pointer ourselves, and store the original one inside the + * context so we know how to free it later. 
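+     *
+     * (On toolchains where they are known to be available, C11's
+     * aligned_alloc or the _mm_malloc family could provide the alignment
+     * directly; this code does not assume either exists.)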
+     */
+    void *allocation = smalloc(sizeof(sha256_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha256_ni *s = (sha256_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
+}
+
+static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
+{
+    const struct sha256_extra *extra = (const struct sha256_extra *)alg->extra;
+    if (!check_availability(extra))
+        return NULL;
+
+    sha256_ni *s = sha256_ni_alloc();
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+
+    return &s->hash;
+}
+
+static void sha256_ni_reset(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
+    s->core[1] = _mm_set_epi64x(
+        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
+
+    sha256_block_setup(&s->blk);
+}
+
+static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
+    sha256_ni *orig = container_of(horig, sha256_ni, hash);
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *orig;                     /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+}
+
+static void sha256_ni_free(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_ni_block(s->core, s->blk.block);
+}
+
+static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the words into the output order */
+    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
+    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
+    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
+    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
+
+    /* Byte-swap them into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    dcba = _mm_shuffle_epi8(dcba, mask);
+    hgfe = _mm_shuffle_epi8(hgfe, mask);
+
+    /* And store them */
+    __m128i *output = (__m128i *)digest;
+    _mm_storeu_si128(output, dcba);
+    _mm_storeu_si128(output+1, hgfe);
+}
+
+SHA256_VTABLE(ni, "SHA-NI accelerated");
diff --git a/crypto/sha256-select.c b/crypto/sha256-select.c
new file mode 100644
index 00000000..78e5b7e4
--- /dev/null
+++ b/crypto/sha256-select.c
@@ -0,0 +1,44 @@
+/*
+ * Top-level vtables to select a SHA-256 implementation.
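+ *
+ * Callers are not expected to name the individual backends; they just
+ * construct a hash from the top-level vtable, roughly
+ *
+ *     ssh_hash *h = ssh_hash_new(&ssh_sha256);
+ *
+ * and sha256_select() below forwards to the first backend in its list
+ * whose availability check passes.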
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "putty.h"
+#include "ssh.h"
+#include "sha256.h"
+
+static ssh_hash *sha256_select(const ssh_hashalg *alg)
+{
+    static const ssh_hashalg *const real_algs[] = {
+#if HAVE_SHA_NI
+        &ssh_sha256_ni,
+#endif
+#if HAVE_NEON_CRYPTO
+        &ssh_sha256_neon,
+#endif
+        &ssh_sha256_sw,
+        NULL,
+    };
+
+    for (size_t i = 0; real_algs[i]; i++) {
+        const ssh_hashalg *alg = real_algs[i];
+        const struct sha256_extra *alg_extra =
+            (const struct sha256_extra *)alg->extra;
+        if (check_availability(alg_extra))
+            return ssh_hash_new(alg);
+    }
+
+    /* We should never reach the NULL at the end of the list, because
+     * the last non-NULL entry should be software-only SHA-256, which
+     * is always available. */
+    unreachable("sha256_select ran off the end of its list");
+}
+
+const ssh_hashalg ssh_sha256 = {
+    .new = sha256_select,
+    .hlen = 32,
+    .blocklen = 64,
+    HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
+};
diff --git a/crypto/sha256-sw.c b/crypto/sha256-sw.c
new file mode 100644
index 00000000..82a116c6
--- /dev/null
+++ b/crypto/sha256-sw.c
@@ -0,0 +1,157 @@
+/*
+ * Software implementation of SHA-256.
+ */
+
+#include "ssh.h"
+#include "sha256.h"
+
+static bool sha256_sw_available(void)
+{
+    /* Software SHA-256 is always available */
+    return true;
+}
+
+static inline uint32_t ror(uint32_t x, unsigned y)
+{
+    return (x << (31 & -y)) | (x >> (31 & y));
+}
+
+static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
+{
+    return if0 ^ (ctrl & (if1 ^ if0));
+}
+
+static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | (z & (x | y));
+}
+
+static inline uint32_t Sigma_0(uint32_t x)
+{
+    return ror(x,2) ^ ror(x,13) ^ ror(x,22);
+}
+
+static inline uint32_t Sigma_1(uint32_t x)
+{
+    return ror(x,6) ^ ror(x,11) ^ ror(x,25);
+}
+
+static inline uint32_t sigma_0(uint32_t x)
+{
+    return ror(x,7) ^ ror(x,18) ^ (x >> 3);
+}
+
+static inline uint32_t sigma_1(uint32_t x)
+{
+    return ror(x,17) ^ ror(x,19) ^ (x >> 10);
+}
+
+static inline void sha256_sw_round(
+    unsigned round_index, const uint32_t *schedule,
+    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+    uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
+{
+    uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
+        sha256_round_constants[round_index] + schedule[round_index];
+
+    uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
+
+    *d += t1;
+    *h = t1 + t2;
+}
+
+static void sha256_sw_block(uint32_t *core, const uint8_t *block)
+{
+    uint32_t w[SHA256_ROUNDS];
+    uint32_t a,b,c,d,e,f,g,h;
+
+    for (size_t t = 0; t < 16; t++)
+        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
+
+    for (size_t t = 16; t < SHA256_ROUNDS; t++)
+        w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];
+
+    a = core[0]; b = core[1]; c = core[2]; d = core[3];
+    e = core[4]; f = core[5]; g = core[6]; h = core[7];
+
+    for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
+        sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
+        sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
+        sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
+        sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
+        sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
+        sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
+        sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
+        sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
+    }
+
+    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
+    core[4] += e; core[5] += f; core[6] += g; core[7] += h;
+
+    smemclr(w, sizeof(w));
+}
+
+typedef struct sha256_sw {
+    uint32_t core[8];
+    sha256_block blk;
+    BinarySink_IMPLEMENTATION;
ssh_hash hash; +} sha256_sw; + +static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) +{ + sha256_sw *s = snew(sha256_sw); + + s->hash.vt = alg; + BinarySink_INIT(s, sha256_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha256_sw_reset(ssh_hash *hash) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + + memcpy(s->core, sha256_initial_state, sizeof(s->core)); + sha256_block_setup(&s->blk); +} + +static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha256_sw *copy = container_of(hcopy, sha256_sw, hash); + sha256_sw *orig = container_of(horig, sha256_sw, hash); + + memcpy(copy, orig, sizeof(*copy)); + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha256_sw_free(ssh_hash *hash) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len) +{ + sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw); + + while (len > 0) + if (sha256_block_write(&s->blk, &vp, &len)) + sha256_sw_block(s->core, s->blk.block); +} + +static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + + sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); + for (size_t i = 0; i < 8; i++) + PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); +} + +SHA256_VTABLE(sw, "unaccelerated"); diff --git a/crypto/sha256.c b/crypto/sha256.c deleted file mode 100644 index 206a976c..00000000 --- a/crypto/sha256.c +++ /dev/null @@ -1,939 +0,0 @@ -/* - * SHA-256 algorithm as described at - * - * http://csrc.nist.gov/cryptval/shs.html - */ - -#include "ssh.h" -#include - -/* - * Start by deciding whether we can support hardware SHA at all. - */ -#define HW_SHA256_NONE 0 -#define HW_SHA256_NI 1 -#define HW_SHA256_NEON 2 - -#ifdef _FORCE_SHA_NI -# define HW_SHA256 HW_SHA256_NI -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA256 HW_SHA256_NI -# endif -#elif defined(__GNUC__) -# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \ - (defined(__x86_64__) || defined(__i386)) -# define HW_SHA256 HW_SHA256_NI -# endif -#elif defined (_MSC_VER) -# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729 -# define HW_SHA256 HW_SHA256_NI -# endif -#endif - -#ifdef _FORCE_SHA_NEON -# define HW_SHA256 HW_SHA256_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_CRYPTO - /* If the Arm crypto extension is available already, we can - * support NEON SHA without having to enable anything by hand */ -# define HW_SHA256 HW_SHA256_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_SHA256 HW_SHA256_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#elif defined _MSC_VER - /* Visual Studio supports the crypto extension when targeting - * AArch64, but as of VS2017, the AArch32 header doesn't quite - * manage it (declaring the shae/shad intrinsics without a round - * key operand). 
*/ -# if defined _M_ARM64 -# define HW_SHA256 HW_SHA256_NEON -# if defined _M_ARM64 -# define USE_ARM64_NEON_H /* unusual header name in this case */ -# endif -# endif -#endif - -#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256 -# undef HW_SHA256 -# define HW_SHA256 HW_SHA256_NONE -#endif - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool sha256_hw_available(void); - -/* - * The top-level selection function, caching the results of - * sha256_hw_available() so it only has to run once. - */ -static bool sha256_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = sha256_hw_available(); - initialised = true; - } - return hw_available; -} - -static ssh_hash *sha256_select(const ssh_hashalg *alg) -{ - const ssh_hashalg *real_alg = - sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw; - - return ssh_hash_new(real_alg); -} - -const ssh_hashalg ssh_sha256 = { - .new = sha256_select, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"), -}; - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -static const uint32_t sha256_initial_state[] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, -}; - -static const uint32_t sha256_round_constants[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#define SHA256_ROUNDS 64 - -typedef struct sha256_block sha256_block; -struct sha256_block { - uint8_t block[64]; - size_t used; - uint64_t len; -}; - -static inline void sha256_block_setup(sha256_block *blk) -{ - blk->used = 0; - blk->len = 0; -} - -static inline bool sha256_block_write( - sha256_block *blk, const void **vdata, size_t *len) -{ - size_t blkleft = sizeof(blk->block) - blk->used; - size_t chunk = *len < blkleft ? *len : blkleft; - - const uint8_t *p = *vdata; - memcpy(blk->block + blk->used, p, chunk); - *vdata = p + chunk; - *len -= chunk; - blk->used += chunk; - blk->len += chunk; - - if (blk->used == sizeof(blk->block)) { - blk->used = 0; - return true; - } - - return false; -} - -static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) -{ - uint64_t final_len = blk->len << 3; - size_t pad = 1 + (63 & (55 - blk->used)); - - put_byte(bs, 0x80); - for (size_t i = 1; i < pad; i++) - put_byte(bs, 0); - put_uint64(bs, final_len); - - assert(blk->used == 0 && "Should have exactly hit a block boundary"); -} - -/* ---------------------------------------------------------------------- - * Software implementation of SHA-256. 
- */ - -static inline uint32_t ror(uint32_t x, unsigned y) -{ - return (x << (31 & -y)) | (x >> (31 & y)); -} - -static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) -{ - return if0 ^ (ctrl & (if1 ^ if0)); -} - -static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & y) | (z & (x | y)); -} - -static inline uint32_t Sigma_0(uint32_t x) -{ - return ror(x,2) ^ ror(x,13) ^ ror(x,22); -} - -static inline uint32_t Sigma_1(uint32_t x) -{ - return ror(x,6) ^ ror(x,11) ^ ror(x,25); -} - -static inline uint32_t sigma_0(uint32_t x) -{ - return ror(x,7) ^ ror(x,18) ^ (x >> 3); -} - -static inline uint32_t sigma_1(uint32_t x) -{ - return ror(x,17) ^ ror(x,19) ^ (x >> 10); -} - -static inline void sha256_sw_round( - unsigned round_index, const uint32_t *schedule, - uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, - uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h) -{ - uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + - sha256_round_constants[round_index] + schedule[round_index]; - - uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); - - *d += t1; - *h = t1 + t2; -} - -static void sha256_sw_block(uint32_t *core, const uint8_t *block) -{ - uint32_t w[SHA256_ROUNDS]; - uint32_t a,b,c,d,e,f,g,h; - - for (size_t t = 0; t < 16; t++) - w[t] = GET_32BIT_MSB_FIRST(block + 4*t); - - for (size_t t = 16; t < SHA256_ROUNDS; t++) - w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16]; - - a = core[0]; b = core[1]; c = core[2]; d = core[3]; - e = core[4]; f = core[5]; g = core[6]; h = core[7]; - - for (size_t t = 0; t < SHA256_ROUNDS; t += 8) { - sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); - sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); - sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); - sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); - sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); - sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); - sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); - sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); - } - - core[0] += a; core[1] += b; core[2] += c; core[3] += d; - core[4] += e; core[5] += f; core[6] += g; core[7] += h; - - smemclr(w, sizeof(w)); -} - -typedef struct sha256_sw { - uint32_t core[8]; - sha256_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha256_sw; - -static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) -{ - sha256_sw *s = snew(sha256_sw); - - s->hash.vt = alg; - BinarySink_INIT(s, sha256_sw_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha256_sw_reset(ssh_hash *hash) -{ - sha256_sw *s = container_of(hash, sha256_sw, hash); - - memcpy(s->core, sha256_initial_state, sizeof(s->core)); - sha256_block_setup(&s->blk); -} - -static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha256_sw *copy = container_of(hcopy, sha256_sw, hash); - sha256_sw *orig = container_of(horig, sha256_sw, hash); - - memcpy(copy, orig, sizeof(*copy)); - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha256_sw_free(ssh_hash *hash) -{ - sha256_sw *s = container_of(hash, sha256_sw, hash); - - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len) -{ - sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw); - - while (len > 0) - if (sha256_block_write(&s->blk, &vp, &len)) - sha256_sw_block(s->core, s->blk.block); -} - -static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest) -{ - sha256_sw *s = 
container_of(hash, sha256_sw, hash); - - sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); - for (size_t i = 0; i < 8; i++) - PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); -} - -const ssh_hashalg ssh_sha256_sw = { - .new = sha256_sw_new, - .reset = sha256_sw_reset, - .copyfrom = sha256_sw_copyfrom, - .digest = sha256_sw_digest, - .free = sha256_sw_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI. - */ - -#if HW_SHA256 == HW_SHA256_NI - -/* - * Set target architecture for Clang and GCC - */ -#if defined(__clang__) || defined(__GNUC__) -# define FUNC_ISA __attribute__ ((target("sse4.1,sha"))) -#if !defined(__clang__) -# pragma GCC target("sha") -# pragma GCC target("sse4.1") -#endif -#else -# define FUNC_ISA -#endif - -#include -#include -#include -#if defined(__clang__) || defined(__GNUC__) -#include -#endif - -#if defined(__clang__) || defined(__GNUC__) -#include -#define GET_CPU_ID_0(out) \ - __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3]) -#define GET_CPU_ID_7(out) \ - __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3]) -#else -#define GET_CPU_ID_0(out) __cpuid(out, 0) -#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) -#endif - -static bool sha256_hw_available(void) -{ - unsigned int CPUInfo[4]; - GET_CPU_ID_0(CPUInfo); - if (CPUInfo[0] < 7) - return false; - - GET_CPU_ID_7(CPUInfo); - return CPUInfo[1] & (1 << 29); /* Check SHA */ -} - -/* SHA256 implementation using new instructions - The code is based on Jeffrey Walton's SHA256 implementation: - https://github.com/noloader/SHA-Intrinsics -*/ -FUNC_ISA -static inline void sha256_ni_block(__m128i *core, const uint8_t *p) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i MSG0, MSG1, MSG2, MSG3; - const __m128i *block = (const __m128i *)p; - const __m128i MASK = _mm_set_epi64x( - 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - /* Load initial values */ - STATE0 = core[0]; - STATE1 = core[1]; - - /* Rounds 0-3 */ - MSG = _mm_loadu_si128(block); - MSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128(block + 1); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128(block + 2); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128(block + 3); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - 
MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 16-19 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 20-23 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 24-27 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 28-31 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 32-35 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 36-39 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 40-43 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 44-47 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* 
Rounds 48-51 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( - 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 52-55 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( - 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 56-59 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( - 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 60-63 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( - 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Combine state */ - core[0] = _mm_add_epi32(STATE0, core[0]); - core[1] = _mm_add_epi32(STATE1, core[1]); -} - -typedef struct sha256_ni { - /* - * These two vectors store the 8 words of the SHA-256 state, but - * not in the same order they appear in the spec: the first word - * holds A,B,E,F and the second word C,D,G,H. - */ - __m128i core[2]; - sha256_block blk; - void *pointer_to_free; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha256_ni; - -static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len); - -static sha256_ni *sha256_ni_alloc(void) -{ - /* - * The __m128i variables in the context structure need to be - * 16-byte aligned, but not all malloc implementations that this - * code has to work with will guarantee to return a 16-byte - * aligned pointer. So we over-allocate, manually realign the - * pointer ourselves, and store the original one inside the - * context so we know how to free it later. 
- */ - void *allocation = smalloc(sizeof(sha256_ni) + 15); - uintptr_t alloc_address = (uintptr_t)allocation; - uintptr_t aligned_address = (alloc_address + 15) & ~15; - sha256_ni *s = (sha256_ni *)aligned_address; - s->pointer_to_free = allocation; - return s; -} - -static ssh_hash *sha256_ni_new(const ssh_hashalg *alg) -{ - if (!sha256_hw_available_cached()) - return NULL; - - sha256_ni *s = sha256_ni_alloc(); - - s->hash.vt = alg; - BinarySink_INIT(s, sha256_ni_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - - return &s->hash; -} - -FUNC_ISA static void sha256_ni_reset(ssh_hash *hash) -{ - sha256_ni *s = container_of(hash, sha256_ni, hash); - - /* Initialise the core vectors in their storage order */ - s->core[0] = _mm_set_epi64x( - 0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL); - s->core[1] = _mm_set_epi64x( - 0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL); - - sha256_block_setup(&s->blk); -} - -static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha256_ni *copy = container_of(hcopy, sha256_ni, hash); - sha256_ni *orig = container_of(horig, sha256_ni, hash); - - void *ptf_save = copy->pointer_to_free; - *copy = *orig; /* structure copy */ - copy->pointer_to_free = ptf_save; - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha256_ni_free(ssh_hash *hash) -{ - sha256_ni *s = container_of(hash, sha256_ni, hash); - - void *ptf = s->pointer_to_free; - smemclr(s, sizeof(*s)); - sfree(ptf); -} - -static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len) -{ - sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni); - - while (len > 0) - if (sha256_block_write(&s->blk, &vp, &len)) - sha256_ni_block(s->core, s->blk.block); -} - -FUNC_ISA static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest) -{ - sha256_ni *s = container_of(hash, sha256_ni, hash); - - sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); - - /* Rearrange the words into the output order */ - __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B); - __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1); - __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0); - __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8); - - /* Byte-swap them into the output endianness */ - const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); - dcba = _mm_shuffle_epi8(dcba, mask); - hgfe = _mm_shuffle_epi8(hgfe, mask); - - /* And store them */ - __m128i *output = (__m128i *)digest; - _mm_storeu_si128(output, dcba); - _mm_storeu_si128(output+1, hgfe); -} - -const ssh_hashalg ssh_sha256_hw = { - .new = sha256_ni_new, - .reset = sha256_ni_reset, - .copyfrom = sha256_ni_copyfrom, - .digest = sha256_ni_digest, - .free = sha256_ni_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-256 using Arm NEON. - */ - -#elif HW_SHA256 == HW_SHA256_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the SHA intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). 
- */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,crypto"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool sha256_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform detection function (see - * explanation in sshaes.c). - */ - return platform_sha256_hw_available(); -} - -typedef struct sha256_neon_core sha256_neon_core; -struct sha256_neon_core { - uint32x4_t abcd, efgh; -}; - -FUNC_ISA -static inline uint32x4_t sha256_neon_load_input(const uint8_t *p) -{ - return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p))); -} - -FUNC_ISA -static inline uint32x4_t sha256_neon_schedule_update( - uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1) -{ - return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1); -} - -FUNC_ISA -static inline sha256_neon_core sha256_neon_round4( - sha256_neon_core old, uint32x4_t sched, unsigned round) -{ - sha256_neon_core new; - - uint32x4_t round_input = vaddq_u32( - sched, vld1q_u32(sha256_round_constants + round)); - new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input); - new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input); - return new; -} - -FUNC_ISA -static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p) -{ - uint32x4_t s0, s1, s2, s3; - sha256_neon_core cr = *core; - - s0 = sha256_neon_load_input(p); - cr = sha256_neon_round4(cr, s0, 0); - s1 = sha256_neon_load_input(p+16); - cr = sha256_neon_round4(cr, s1, 4); - s2 = sha256_neon_load_input(p+32); - cr = sha256_neon_round4(cr, s2, 8); - s3 = sha256_neon_load_input(p+48); - cr = sha256_neon_round4(cr, s3, 12); - s0 = sha256_neon_schedule_update(s0, s1, s2, s3); - cr = sha256_neon_round4(cr, s0, 16); - s1 = sha256_neon_schedule_update(s1, s2, s3, s0); - cr = sha256_neon_round4(cr, s1, 20); - s2 = sha256_neon_schedule_update(s2, s3, s0, s1); - cr = sha256_neon_round4(cr, s2, 24); - s3 = sha256_neon_schedule_update(s3, s0, s1, s2); - cr = sha256_neon_round4(cr, s3, 28); - s0 = sha256_neon_schedule_update(s0, s1, s2, s3); - cr = sha256_neon_round4(cr, s0, 32); - s1 = sha256_neon_schedule_update(s1, s2, s3, s0); - cr = sha256_neon_round4(cr, s1, 36); - s2 = sha256_neon_schedule_update(s2, s3, s0, s1); - cr = sha256_neon_round4(cr, s2, 40); - s3 = sha256_neon_schedule_update(s3, s0, s1, s2); - cr = sha256_neon_round4(cr, s3, 44); - s0 = sha256_neon_schedule_update(s0, s1, s2, s3); - cr = sha256_neon_round4(cr, s0, 48); - s1 = sha256_neon_schedule_update(s1, s2, s3, s0); - cr = sha256_neon_round4(cr, s1, 52); - s2 = sha256_neon_schedule_update(s2, s3, s0, s1); - cr = sha256_neon_round4(cr, s2, 56); - s3 = sha256_neon_schedule_update(s3, s0, s1, s2); - cr = sha256_neon_round4(cr, s3, 60); - - core->abcd = vaddq_u32(core->abcd, cr.abcd); - core->efgh = vaddq_u32(core->efgh, cr.efgh); -} - -typedef struct sha256_neon { - sha256_neon_core core; - sha256_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha256_neon; - -static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha256_neon_new(const ssh_hashalg *alg) -{ - if (!sha256_hw_available_cached()) - return NULL; - - sha256_neon *s = snew(sha256_neon); - - s->hash.vt = alg; - BinarySink_INIT(s, sha256_neon_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha256_neon_reset(ssh_hash *hash) -{ - sha256_neon *s = container_of(hash, sha256_neon, hash); - - 
s->core.abcd = vld1q_u32(sha256_initial_state); - s->core.efgh = vld1q_u32(sha256_initial_state + 4); - - sha256_block_setup(&s->blk); -} - -static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha256_neon *copy = container_of(hcopy, sha256_neon, hash); - sha256_neon *orig = container_of(horig, sha256_neon, hash); - - *copy = *orig; /* structure copy */ - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha256_neon_free(ssh_hash *hash) -{ - sha256_neon *s = container_of(hash, sha256_neon, hash); - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len) -{ - sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon); - - while (len > 0) - if (sha256_block_write(&s->blk, &vp, &len)) - sha256_neon_block(&s->core, s->blk.block); -} - -static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha256_neon *s = container_of(hash, sha256_neon, hash); - - sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); - vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd))); - vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh))); -} - -const ssh_hashalg ssh_sha256_hw = { - .new = sha256_neon_new, - .reset = sha256_neon_reset, - .copyfrom = sha256_neon_copyfrom, - .digest = sha256_neon_digest, - .free = sha256_neon_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"), -}; - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated SHA-256. In this - * case, sha256_hw_new returns NULL (though it should also never be - * selected by sha256_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_SHA256 == HW_SHA256_NONE - -static bool sha256_hw_available(void) -{ - return false; -} - -static ssh_hash *sha256_stub_new(const ssh_hashalg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void sha256_stub_reset(ssh_hash *hash) STUB_BODY -static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY -static void sha256_stub_free(ssh_hash *hash) STUB_BODY -static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY - -const ssh_hashalg ssh_sha256_hw = { - .new = sha256_stub_new, - .reset = sha256_stub_reset, - .copyfrom = sha256_stub_copyfrom, - .digest = sha256_stub_digest, - .free = sha256_stub_free, - .hlen = 32, - .blocklen = 64, - HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -#endif /* HW_SHA256 */ diff --git a/crypto/sha256.h b/crypto/sha256.h new file mode 100644 index 00000000..e6ca7564 --- /dev/null +++ b/crypto/sha256.h @@ -0,0 +1,105 @@ +/* + * Definitions likely to be helpful to multiple SHA-256 implementations. + */ + +/* + * The 'extra' structure used by SHA-256 implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct sha256_extra_mutable; +struct sha256_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. 
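+     * Keeping the mutable flags out of line means the 'extra' structure
+     * itself can be declared const (as SHA256_VTABLE below does), while
+     * check_availability() still has somewhere to cache its answer, e.g.
+     *
+     *     if (!check_availability(extra)) return NULL;
+     *
+     * in each implementation's new() method.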
*/ + struct sha256_extra_mutable *mut; +}; +struct sha256_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct sha256_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macro to define a SHA-256 vtable together with its 'extra' + * structure. + */ +#define SHA256_VTABLE(impl_c, impl_display) \ + static struct sha256_extra_mutable sha256_ ## impl_c ## _extra_mut; \ + static const struct sha256_extra sha256_ ## impl_c ## _extra = { \ + .check_available = sha256_ ## impl_c ## _available, \ + .mut = &sha256_ ## impl_c ## _extra_mut, \ + }; \ + const ssh_hashalg ssh_sha256_ ## impl_c = { \ + .new = sha256_ ## impl_c ## _new, \ + .reset = sha256_ ## impl_c ## _reset, \ + .copyfrom = sha256_ ## impl_c ## _copyfrom, \ + .digest = sha256_ ## impl_c ## _digest, \ + .free = sha256_ ## impl_c ## _free, \ + .hlen = 32, \ + .blocklen = 64, \ + HASHALG_NAMES_ANNOTATED("SHA-256", impl_display), \ + .extra = &sha256_ ## impl_c ## _extra, \ + } + +extern const uint32_t sha256_initial_state[8]; +extern const uint32_t sha256_round_constants[64]; + +#define SHA256_ROUNDS 64 + +typedef struct sha256_block sha256_block; +struct sha256_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha256_block_setup(sha256_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha256_block_write( + sha256_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? *len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} diff --git a/crypto/sha512-common.c b/crypto/sha512-common.c new file mode 100644 index 00000000..89ac136c --- /dev/null +++ b/crypto/sha512-common.c @@ -0,0 +1,71 @@ +/* + * Common variable definitions across all the SHA-512 implementations. 
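+ *
+ * SHA-384 shares SHA-512's compression function and round constants,
+ * differing only in its initial state and in truncating the output, so
+ * sha384_initial_state lives here alongside the SHA-512 tables.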
+ */ + +#include "ssh.h" +#include "sha512.h" + +const uint64_t sha512_initial_state[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL, +}; + +const uint64_t sha384_initial_state[8] = { + 0xcbbb9d5dc1059ed8ULL, + 0x629a292a367cd507ULL, + 0x9159015a3070dd17ULL, + 0x152fecd8f70e5939ULL, + 0x67332667ffc00b31ULL, + 0x8eb44a8768581511ULL, + 0xdb0c2e0d64f98fa7ULL, + 0x47b5481dbefa4fa4ULL, +}; + +const uint64_t sha512_round_constants[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, +}; diff --git a/crypto/sha512-neon.c b/crypto/sha512-neon.c new file mode 100644 index 00000000..849a79d7 --- /dev/null +++ b/crypto/sha512-neon.c @@ -0,0 +1,329 @@ +/* + * Hardware-accelerated implementation of SHA-512 using Arm NEON. + */ + +#include "ssh.h" +#include "sha512.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +static bool sha512_neon_available(void) +{ + /* + * For Arm, we delegate to a per-platform detection function (see + * explanation in aes-neon.c). + */ + return platform_sha512_neon_available(); +} + +#if !HAVE_NEON_SHA512_INTRINSICS +/* + * clang 12 and before do not provide the SHA-512 NEON intrinsics, but + * do provide assembler support for the underlying instructions. So I + * define the intrinsic functions myself, using inline assembler. 
+ */ +static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y) +{ + __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y)); + return x; +} +static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y, + uint64x2_t z) +{ + __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z)); + return x; +} +static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y, + uint64x2_t z) +{ + __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); + return x; +} +static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y, + uint64x2_t z) +{ + __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); + return x; +} +#endif /* HAVE_NEON_SHA512_INTRINSICS */ + +typedef struct sha512_neon_core sha512_neon_core; +struct sha512_neon_core { + uint64x2_t ab, cd, ef, gh; +}; + +static inline uint64x2_t sha512_neon_load_input(const uint8_t *p) +{ + return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p))); +} + +static inline uint64x2_t sha512_neon_schedule_update( + uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1) +{ + /* + * vsha512su0q_u64() takes words from a long way back in the + * schedule and performs the sigma_0 half of the computation of + * the next two 64-bit message-schedule words. + * + * vsha512su1q_u64() combines the result of that with the sigma_1 + * steps, to output the finished version of those two words. The + * total amount of input data it requires fits nicely into three + * 128-bit vector registers, but one of those registers is + * misaligned compared to the 128-bit chunks that the message + * schedule is stored in. So we use vextq_u64 to make one of its + * input words out of the second half of m4 and the first half of + * m3. + */ + return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1)); +} + +static inline void sha512_neon_round2( + unsigned round_index, uint64x2_t schedule_words, + uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh) +{ + /* + * vsha512hq_u64 performs the Sigma_1 and Ch half of the + * computation of two rounds of SHA-512 (including feeding back + * one of the outputs from the first of those half-rounds into the + * second one). + * + * vsha512h2q_u64 combines the result of that with the Sigma_0 and + * Maj steps, and outputs one 128-bit vector that replaces the gh + * piece of the input hash state, and a second that updates cd by + * addition. + * + * Similarly to vsha512su1q_u64 above, some of the input registers + * expected by these instructions are misaligned by 64 bits + * relative to the chunks we've divided the hash state into, so we + * have to start by making 'de' and 'fg' words out of our input + * cd,ef,gh, using vextq_u64. + * + * Also, one of the inputs to vsha512hq_u64 is expected to contain + * the results of summing gh + two round constants + two words of + * message schedule, but the two words of the message schedule + * have to be the opposite way round in the vector register from + * the way that vsha512su1q_u64 output them. Hence, there's + * another vextq_u64 in here that swaps the two halves of the + * initial_sum vector register. + * + * (This also means that I don't have to prepare a specially + * reordered version of the sha512_round_constants[] array: as + * long as I'm unavoidably doing a swap at run time _anyway_, I + * can load from the normally ordered version of that array, and + * just take care to fold in that data _before_ the swap rather + * than after.) 
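+     *
+     * Note also that sha512_neon_block() below never copies the hash
+     * state between round pairs: successive calls to this function
+     * simply rotate which of ab, cd, ef and gh is passed in each
+     * parameter position.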
+ */ + + /* Load two round constants, with the first one in the low half */ + uint64x2_t round_constants = vld1q_u64( + sha512_round_constants + round_index); + + /* Add schedule words to round constants */ + uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants); + + /* Swap that sum around so the word used in the first of the two + * rounds is in the _high_ half of the vector, matching where h + * lives in the gh vector */ + uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1); + + /* Add gh to that, now that they're matching ways round */ + uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh); + + /* Make the misaligned de and fg words */ + uint64x2_t de = vextq_u64(*cd, *ef, 1); + uint64x2_t fg = vextq_u64(*ef, *gh, 1); + + /* Now we're ready to put all the pieces together. The output from + * vsha512h2q_u64 can be used directly as the new gh, and the + * output from vsha512hq_u64 is simultaneously the intermediate + * value passed to h2 and the thing you have to add on to cd. */ + uint64x2_t intermed = vsha512hq_u64(sum, fg, de); + *gh = vsha512h2q_u64(intermed, *cd, *ab); + *cd = vaddq_u64(*cd, intermed); +} + +static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p) +{ + uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7; + + uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh; + + s0 = sha512_neon_load_input(p + 16*0); + sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_load_input(p + 16*1); + sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_load_input(p + 16*2); + sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_load_input(p + 16*3); + sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_load_input(p + 16*4); + sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_load_input(p + 16*5); + sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_load_input(p + 16*6); + sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_load_input(p + 16*7); + sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(42, 
s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab); + s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); + sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh); + s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); + sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef); + s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); + sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd); + s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); + sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab); + s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); + sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh); + s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); + sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef); + s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); + sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd); + s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); + sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab); + + core->ab = vaddq_u64(core->ab, ab); + core->cd = vaddq_u64(core->cd, cd); + core->ef = vaddq_u64(core->ef, ef); + core->gh = vaddq_u64(core->gh, gh); +} + +typedef struct sha512_neon { + sha512_neon_core core; + sha512_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha512_neon; + +static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha512_neon_new(const ssh_hashalg *alg) +{ + const struct sha512_extra *extra = (const struct sha512_extra *)alg->extra; + if (!check_availability(extra)) + return NULL; + + sha512_neon *s = snew(sha512_neon); + + s->hash.vt = alg; + BinarySink_INIT(s, sha512_neon_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha512_neon_reset(ssh_hash *hash) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + const struct sha512_extra *extra = + (const struct sha512_extra *)hash->vt->extra; + + s->core.ab = vld1q_u64(extra->initial_state); + s->core.cd = vld1q_u64(extra->initial_state+2); + s->core.ef = vld1q_u64(extra->initial_state+4); + s->core.gh = vld1q_u64(extra->initial_state+6); + + sha512_block_setup(&s->blk); +} + +static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha512_neon *copy = container_of(hcopy, sha512_neon, hash); + sha512_neon *orig = container_of(horig, sha512_neon, hash); + + *copy = *orig; /* structure copy */ + + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha512_neon_free(ssh_hash *hash) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + smemclr(s, sizeof(*s)); + sfree(s); 
+} + +static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len) +{ + sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon); + + while (len > 0) + if (sha512_block_write(&s->blk, &vp, &len)) + sha512_neon_block(&s->core, s->blk.block); +} + +static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + + sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); + + vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); + vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); + vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); + vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh))); +} + +static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest) +{ + sha512_neon *s = container_of(hash, sha512_neon, hash); + + sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); + + vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); + vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); + vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); +} + +SHA512_VTABLES(neon, "NEON accelerated"); diff --git a/crypto/sha512-select.c b/crypto/sha512-select.c new file mode 100644 index 00000000..ecd567bd --- /dev/null +++ b/crypto/sha512-select.c @@ -0,0 +1,61 @@ +/* + * Top-level vtables to select a SHA-512 implementation. + */ + +#include +#include + +#include "putty.h" +#include "ssh.h" +#include "sha512.h" + +static const ssh_hashalg *const real_sha512_algs[] = { +#if HAVE_NEON_SHA512 + &ssh_sha512_neon, +#endif + &ssh_sha512_sw, + NULL, +}; + +static const ssh_hashalg *const real_sha384_algs[] = { +#if HAVE_NEON_SHA512 + &ssh_sha384_neon, +#endif + &ssh_sha384_sw, + NULL, +}; + +static ssh_hash *sha512_select(const ssh_hashalg *alg) +{ + const ssh_hashalg *const *real_algs = + (const ssh_hashalg *const *)alg->extra; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh_hashalg *alg = real_algs[i]; + const struct sha512_extra *alg_extra = + (const struct sha512_extra *)alg->extra; + if (check_availability(alg_extra)) + return ssh_hash_new(alg); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only SHA-512, which + * is always available. */ + unreachable("sha512_select ran off the end of its list"); +} + +const ssh_hashalg ssh_sha512 = { + .new = sha512_select, + .hlen = 64, + .blocklen = 128, + HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"), + .extra = real_sha512_algs, +}; + +const ssh_hashalg ssh_sha384 = { + .new = sha512_select, + .hlen = 48, + .blocklen = 128, + HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"), + .extra = real_sha384_algs, +}; diff --git a/crypto/sha512-sw.c b/crypto/sha512-sw.c new file mode 100644 index 00000000..9e47bbb9 --- /dev/null +++ b/crypto/sha512-sw.c @@ -0,0 +1,168 @@ +/* + * Software implementation of SHA-512. 
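Before the portable implementation below, note how the selector just above gets used. Nothing outside the test harnesses (testcrypt, testsc) needs to name ssh_sha512_sw or ssh_sha512_neon directly: ordinary callers keep instantiating the top-level ssh_sha512 / ssh_sha384 vtables, and sha512_select() walks the priority-ordered list, calling each entry's check_availability() (whose answer is cached in the shared mutable substructure) until one succeeds, with the software implementation as the guaranteed last resort. A minimal caller sketch, assuming the usual ssh_hash / BinarySink helpers from ssh.h and marshal.h, with data/len standing for whatever the caller wants hashed:

    unsigned char digest[64];
    ssh_hash *h = ssh_hash_new(&ssh_sha512);  /* selects NEON or software */
    put_data(h, data, len);                   /* ssh_hash is a BinarySink */
    ssh_hash_final(h, digest);                /* writes 64 bytes and frees h */
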
+ */ + +#include "ssh.h" +#include "sha512.h" + +static bool sha512_sw_available(void) +{ + /* Software SHA-512 is always available */ + return true; +} + +static inline uint64_t ror(uint64_t x, unsigned y) +{ + return (x << (63 & -y)) | (x >> (63 & y)); +} + +static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint64_t Sigma_0(uint64_t x) +{ + return ror(x,28) ^ ror(x,34) ^ ror(x,39); +} + +static inline uint64_t Sigma_1(uint64_t x) +{ + return ror(x,14) ^ ror(x,18) ^ ror(x,41); +} + +static inline uint64_t sigma_0(uint64_t x) +{ + return ror(x,1) ^ ror(x,8) ^ (x >> 7); +} + +static inline uint64_t sigma_1(uint64_t x) +{ + return ror(x,19) ^ ror(x,61) ^ (x >> 6); +} + +static inline void sha512_sw_round( + unsigned round_index, const uint64_t *schedule, + uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, + uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h) +{ + uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + + sha512_round_constants[round_index] + schedule[round_index]; + + uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); + + *d += t1; + *h = t1 + t2; +} + +static void sha512_sw_block(uint64_t *core, const uint8_t *block) +{ + uint64_t w[SHA512_ROUNDS]; + uint64_t a,b,c,d,e,f,g,h; + + int t; + + for (t = 0; t < 16; t++) + w[t] = GET_64BIT_MSB_FIRST(block + 8*t); + + for (t = 16; t < SHA512_ROUNDS; t++) + w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]); + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; f = core[5]; g = core[6]; h = core[7]; + + for (t = 0; t < SHA512_ROUNDS; t+=8) { + sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); + sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); + sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); + sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); + sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); + sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); + sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); + sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; + core[4] += e; core[5] += f; core[6] += g; core[7] += h; + + smemclr(w, sizeof(w)); +} + +typedef struct sha512_sw { + uint64_t core[8]; + sha512_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha512_sw; + +static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha512_sw_new(const ssh_hashalg *alg) +{ + sha512_sw *s = snew(sha512_sw); + + s->hash.vt = alg; + BinarySink_INIT(s, sha512_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static void sha512_sw_reset(ssh_hash *hash) +{ + sha512_sw *s = container_of(hash, sha512_sw, hash); + const struct sha512_extra *extra = + (const struct sha512_extra *)hash->vt->extra; + + memcpy(s->core, extra->initial_state, sizeof(s->core)); + sha512_block_setup(&s->blk); +} + +static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) +{ + sha512_sw *copy = container_of(hcopy, sha512_sw, hash); + sha512_sw *orig = container_of(horig, sha512_sw, hash); + + memcpy(copy, orig, sizeof(*copy)); + BinarySink_COPIED(copy); + BinarySink_DELEGATE_INIT(©->hash, copy); +} + +static void sha512_sw_free(ssh_hash *hash) +{ + sha512_sw *s = container_of(hash, sha512_sw, hash); + + smemclr(s, sizeof(*s)); + sfree(s); +} + +static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len) +{ + sha512_sw *s = 
BinarySink_DOWNCAST(bs, sha512_sw); + + while (len > 0) + if (sha512_block_write(&s->blk, &vp, &len)) + sha512_sw_block(s->core, s->blk.block); +} + +static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest) +{ + sha512_sw *s = container_of(hash, sha512_sw, hash); + + sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); + for (size_t i = 0; i < hash->vt->hlen / 8; i++) + PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]); +} + +/* + * This implementation doesn't need separate digest methods for + * SHA-384 and SHA-512, because the above implementation reads the + * hash length out of the vtable. + */ +#define sha384_sw_digest sha512_sw_digest + +SHA512_VTABLES(sw, "unaccelerated"); diff --git a/crypto/sha512.c b/crypto/sha512.c deleted file mode 100644 index cba7f38d..00000000 --- a/crypto/sha512.c +++ /dev/null @@ -1,836 +0,0 @@ -/* - * SHA-512 algorithm as described at - * - * http://csrc.nist.gov/cryptval/shs.html - * - * Modifications made for SHA-384 also - */ - -#include -#include "ssh.h" - -/* - * Start by deciding whether we can support hardware SHA at all. - */ -#define HW_SHA512_NONE 0 -#define HW_SHA512_NEON 1 - -#ifdef _FORCE_SHA512_NEON -# define HW_SHA512 HW_SHA512_NEON -#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* Arm can potentially support both endiannesses, but this code - * hasn't been tested on anything but little. If anyone wants to - * run big-endian, they'll need to fix it first. */ -#elif defined __ARM_FEATURE_SHA512 - /* If the Arm SHA-512 extension is available already, we can - * support NEON SHA without having to enable anything by hand */ -# define HW_SHA512 HW_SHA512_NEON -#elif defined(__clang__) -# if __has_attribute(target) && __has_include() && \ - (defined(__aarch64__)) - /* clang can enable the crypto extension in AArch64 using - * __attribute__((target)) */ -# define HW_SHA512 HW_SHA512_NEON -# define USE_CLANG_ATTR_TARGET_AARCH64 -# endif -#endif - -#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512 -# undef HW_SHA512 -# define HW_SHA512 HW_SHA512_NONE -#endif - -/* - * The actual query function that asks if hardware acceleration is - * available. - */ -static bool sha512_hw_available(void); - -/* - * The top-level selection function, caching the results of - * sha512_hw_available() so it only has to run once. - */ -static bool sha512_hw_available_cached(void) -{ - static bool initialised = false; - static bool hw_available; - if (!initialised) { - hw_available = sha512_hw_available(); - initialised = true; - } - return hw_available; -} - -struct sha512_select_options { - const ssh_hashalg *hw, *sw; -}; - -static ssh_hash *sha512_select(const ssh_hashalg *alg) -{ - const struct sha512_select_options *options = - (const struct sha512_select_options *)alg->extra; - - const ssh_hashalg *real_alg = - sha512_hw_available_cached() ? 
options->hw : options->sw; - - return ssh_hash_new(real_alg); -} - -const struct sha512_select_options ssh_sha512_select_options = { - &ssh_sha512_hw, &ssh_sha512_sw, -}; -const struct sha512_select_options ssh_sha384_select_options = { - &ssh_sha384_hw, &ssh_sha384_sw, -}; - -const ssh_hashalg ssh_sha512 = { - .new = sha512_select, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"), - .extra = &ssh_sha512_select_options, -}; - -const ssh_hashalg ssh_sha384 = { - .new = sha512_select, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"), - .extra = &ssh_sha384_select_options, -}; - -/* ---------------------------------------------------------------------- - * Definitions likely to be helpful to multiple implementations. - */ - -static const uint64_t sha512_initial_state[] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL, -}; - -static const uint64_t sha384_initial_state[] = { - 0xcbbb9d5dc1059ed8ULL, - 0x629a292a367cd507ULL, - 0x9159015a3070dd17ULL, - 0x152fecd8f70e5939ULL, - 0x67332667ffc00b31ULL, - 0x8eb44a8768581511ULL, - 0xdb0c2e0d64f98fa7ULL, - 0x47b5481dbefa4fa4ULL, -}; - -static const uint64_t sha512_round_constants[] = { - 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, - 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, - 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, - 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, - 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, - 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, - 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, - 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, - 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, - 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, - 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, - 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, - 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, - 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, - 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, - 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, - 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, - 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, - 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, - 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, - 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, - 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, - 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, - 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, - 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, - 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, - 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, - 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, - 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, - 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, - 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, - 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, - 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, - 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, - 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, - 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, - 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, - 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, - 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, - 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, -}; - -#define SHA512_ROUNDS 80 - -typedef struct sha512_block sha512_block; -struct sha512_block { - uint8_t block[128]; - size_t used; - uint64_t lenhi, lenlo; -}; - -static inline void sha512_block_setup(sha512_block *blk) -{ - blk->used = 0; - 
blk->lenhi = blk->lenlo = 0; -} - -static inline bool sha512_block_write( - sha512_block *blk, const void **vdata, size_t *len) -{ - size_t blkleft = sizeof(blk->block) - blk->used; - size_t chunk = *len < blkleft ? *len : blkleft; - - const uint8_t *p = *vdata; - memcpy(blk->block + blk->used, p, chunk); - *vdata = p + chunk; - *len -= chunk; - blk->used += chunk; - - size_t chunkbits = chunk << 3; - - blk->lenlo += chunkbits; - blk->lenhi += (blk->lenlo < chunkbits); - - if (blk->used == sizeof(blk->block)) { - blk->used = 0; - return true; - } - - return false; -} - -static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs) -{ - uint64_t final_lenhi = blk->lenhi; - uint64_t final_lenlo = blk->lenlo; - size_t pad = 127 & (111 - blk->used); - - put_byte(bs, 0x80); - put_padding(bs, pad, 0); - put_uint64(bs, final_lenhi); - put_uint64(bs, final_lenlo); - - assert(blk->used == 0 && "Should have exactly hit a block boundary"); -} - -/* ---------------------------------------------------------------------- - * Software implementation of SHA-512. - */ - -static inline uint64_t ror(uint64_t x, unsigned y) -{ - return (x << (63 & -y)) | (x >> (63 & y)); -} - -static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0) -{ - return if0 ^ (ctrl & (if1 ^ if0)); -} - -static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) -{ - return (x & y) | (z & (x | y)); -} - -static inline uint64_t Sigma_0(uint64_t x) -{ - return ror(x,28) ^ ror(x,34) ^ ror(x,39); -} - -static inline uint64_t Sigma_1(uint64_t x) -{ - return ror(x,14) ^ ror(x,18) ^ ror(x,41); -} - -static inline uint64_t sigma_0(uint64_t x) -{ - return ror(x,1) ^ ror(x,8) ^ (x >> 7); -} - -static inline uint64_t sigma_1(uint64_t x) -{ - return ror(x,19) ^ ror(x,61) ^ (x >> 6); -} - -static inline void sha512_sw_round( - unsigned round_index, const uint64_t *schedule, - uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, - uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h) -{ - uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + - sha512_round_constants[round_index] + schedule[round_index]; - - uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); - - *d += t1; - *h = t1 + t2; -} - -static void sha512_sw_block(uint64_t *core, const uint8_t *block) -{ - uint64_t w[SHA512_ROUNDS]; - uint64_t a,b,c,d,e,f,g,h; - - int t; - - for (t = 0; t < 16; t++) - w[t] = GET_64BIT_MSB_FIRST(block + 8*t); - - for (t = 16; t < SHA512_ROUNDS; t++) - w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]); - - a = core[0]; b = core[1]; c = core[2]; d = core[3]; - e = core[4]; f = core[5]; g = core[6]; h = core[7]; - - for (t = 0; t < SHA512_ROUNDS; t+=8) { - sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); - sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); - sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); - sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); - sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); - sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); - sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); - sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); - } - - core[0] += a; core[1] += b; core[2] += c; core[3] += d; - core[4] += e; core[5] += f; core[6] += g; core[7] += h; - - smemclr(w, sizeof(w)); -} - -typedef struct sha512_sw { - uint64_t core[8]; - sha512_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha512_sw; - -static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha512_sw_new(const ssh_hashalg *alg) -{ - sha512_sw *s = snew(sha512_sw); - - s->hash.vt = alg; - 
BinarySink_INIT(s, sha512_sw_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha512_sw_reset(ssh_hash *hash) -{ - sha512_sw *s = container_of(hash, sha512_sw, hash); - - /* The 'extra' field in the ssh_hashalg indicates which - * initialisation vector we're using */ - memcpy(s->core, hash->vt->extra, sizeof(s->core)); - sha512_block_setup(&s->blk); -} - -static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha512_sw *copy = container_of(hcopy, sha512_sw, hash); - sha512_sw *orig = container_of(horig, sha512_sw, hash); - - memcpy(copy, orig, sizeof(*copy)); - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha512_sw_free(ssh_hash *hash) -{ - sha512_sw *s = container_of(hash, sha512_sw, hash); - - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len) -{ - sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw); - - while (len > 0) - if (sha512_block_write(&s->blk, &vp, &len)) - sha512_sw_block(s->core, s->blk.block); -} - -static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest) -{ - sha512_sw *s = container_of(hash, sha512_sw, hash); - - sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); - for (size_t i = 0; i < hash->vt->hlen / 8; i++) - PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]); -} - -const ssh_hashalg ssh_sha512_sw = { - .new = sha512_sw_new, - .reset = sha512_sw_reset, - .copyfrom = sha512_sw_copyfrom, - .digest = sha512_sw_digest, - .free = sha512_sw_free, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"), - .extra = sha512_initial_state, -}; - -const ssh_hashalg ssh_sha384_sw = { - .new = sha512_sw_new, - .reset = sha512_sw_reset, - .copyfrom = sha512_sw_copyfrom, - .digest = sha512_sw_digest, - .free = sha512_sw_free, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"), - .extra = sha384_initial_state, -}; - -/* ---------------------------------------------------------------------- - * Hardware-accelerated implementation of SHA-512 using Arm NEON. - */ - -#if HW_SHA512 == HW_SHA512_NEON - -/* - * Manually set the target architecture, if we decided above that we - * need to. - */ -#ifdef USE_CLANG_ATTR_TARGET_AARCH64 -/* - * A spot of cheating: redefine some ACLE feature macros before - * including arm_neon.h. Otherwise we won't get the SHA intrinsics - * defined by that header, because it will be looking at the settings - * for the whole translation unit rather than the ones we're going to - * put on some particular functions using __attribute__((target)). - */ -#define __ARM_NEON 1 -#define __ARM_FEATURE_CRYPTO 1 -#define FUNC_ISA __attribute__ ((target("neon,sha3"))) -#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */ - -#ifndef FUNC_ISA -#define FUNC_ISA -#endif - -#ifdef USE_ARM64_NEON_H -#include -#else -#include -#endif - -static bool sha512_hw_available(void) -{ - /* - * For Arm, we delegate to a per-platform detection function (see - * explanation in sshaes.c). - */ - return platform_sha512_hw_available(); -} - -#if defined __clang__ -/* - * As of 2020-12-24, I've found that clang doesn't provide the SHA-512 - * NEON intrinsics. So I define my own set using inline assembler, and - * use #define to effectively rename them over the top of the standard - * names. - * - * The aim of that #define technique is that it should avoid a build - * failure if these intrinsics _are_ defined in . 
- * Obviously it would be better in that situation to switch back to - * using the real intrinsics, but until I see a version of clang that - * supports them, I won't know what version number to test in the - * ifdef. - */ -static inline FUNC_ISA -uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) { - __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y)); - return x; -} -static inline FUNC_ISA -uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) { - __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z)); - return x; -} -static inline FUNC_ISA -uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) { - __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); - return x; -} -static inline FUNC_ISA -uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) { - __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z)); - return x; -} -#undef vsha512su0q_u64 -#define vsha512su0q_u64 vsha512su0q_u64_asm -#undef vsha512su1q_u64 -#define vsha512su1q_u64 vsha512su1q_u64_asm -#undef vsha512hq_u64 -#define vsha512hq_u64 vsha512hq_u64_asm -#undef vsha512h2q_u64 -#define vsha512h2q_u64 vsha512h2q_u64_asm -#endif /* defined __clang__ */ - -typedef struct sha512_neon_core sha512_neon_core; -struct sha512_neon_core { - uint64x2_t ab, cd, ef, gh; -}; - -FUNC_ISA -static inline uint64x2_t sha512_neon_load_input(const uint8_t *p) -{ - return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p))); -} - -FUNC_ISA -static inline uint64x2_t sha512_neon_schedule_update( - uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1) -{ - /* - * vsha512su0q_u64() takes words from a long way back in the - * schedule and performs the sigma_0 half of the computation of - * the next two 64-bit message-schedule words. - * - * vsha512su1q_u64() combines the result of that with the sigma_1 - * steps, to output the finished version of those two words. The - * total amount of input data it requires fits nicely into three - * 128-bit vector registers, but one of those registers is - * misaligned compared to the 128-bit chunks that the message - * schedule is stored in. So we use vextq_u64 to make one of its - * input words out of the second half of m4 and the first half of - * m3. - */ - return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1)); -} - -FUNC_ISA -static inline void sha512_neon_round2( - unsigned round_index, uint64x2_t schedule_words, - uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh) -{ - /* - * vsha512hq_u64 performs the Sigma_1 and Ch half of the - * computation of two rounds of SHA-512 (including feeding back - * one of the outputs from the first of those half-rounds into the - * second one). - * - * vsha512h2q_u64 combines the result of that with the Sigma_0 and - * Maj steps, and outputs one 128-bit vector that replaces the gh - * piece of the input hash state, and a second that updates cd by - * addition. - * - * Similarly to vsha512su1q_u64 above, some of the input registers - * expected by these instructions are misaligned by 64 bits - * relative to the chunks we've divided the hash state into, so we - * have to start by making 'de' and 'fg' words out of our input - * cd,ef,gh, using vextq_u64. 
- * - * Also, one of the inputs to vsha512hq_u64 is expected to contain - * the results of summing gh + two round constants + two words of - * message schedule, but the two words of the message schedule - * have to be the opposite way round in the vector register from - * the way that vsha512su1q_u64 output them. Hence, there's - * another vextq_u64 in here that swaps the two halves of the - * initial_sum vector register. - * - * (This also means that I don't have to prepare a specially - * reordered version of the sha512_round_constants[] array: as - * long as I'm unavoidably doing a swap at run time _anyway_, I - * can load from the normally ordered version of that array, and - * just take care to fold in that data _before_ the swap rather - * than after.) - */ - - /* Load two round constants, with the first one in the low half */ - uint64x2_t round_constants = vld1q_u64( - sha512_round_constants + round_index); - - /* Add schedule words to round constants */ - uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants); - - /* Swap that sum around so the word used in the first of the two - * rounds is in the _high_ half of the vector, matching where h - * lives in the gh vector */ - uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1); - - /* Add gh to that, now that they're matching ways round */ - uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh); - - /* Make the misaligned de and fg words */ - uint64x2_t de = vextq_u64(*cd, *ef, 1); - uint64x2_t fg = vextq_u64(*ef, *gh, 1); - - /* Now we're ready to put all the pieces together. The output from - * vsha512h2q_u64 can be used directly as the new gh, and the - * output from vsha512hq_u64 is simultaneously the intermediate - * value passed to h2 and the thing you have to add on to cd. 
*/ - uint64x2_t intermed = vsha512hq_u64(sum, fg, de); - *gh = vsha512h2q_u64(intermed, *cd, *ab); - *cd = vaddq_u64(*cd, intermed); -} - -FUNC_ISA -static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p) -{ - uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7; - - uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh; - - s0 = sha512_neon_load_input(p + 16*0); - sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_load_input(p + 16*1); - sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_load_input(p + 16*2); - sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_load_input(p + 16*3); - sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_load_input(p + 16*4); - sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_load_input(p + 16*5); - sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_load_input(p + 16*6); - sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_load_input(p + 16*7); - sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); - sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); - sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, 
s0, s3, s4, s6); - sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab); - s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7); - sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh); - s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0); - sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef); - s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1); - sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd); - s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2); - sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab); - s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3); - sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh); - s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4); - sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef); - s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5); - sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd); - s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6); - sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab); - - core->ab = vaddq_u64(core->ab, ab); - core->cd = vaddq_u64(core->cd, cd); - core->ef = vaddq_u64(core->ef, ef); - core->gh = vaddq_u64(core->gh, gh); -} - -typedef struct sha512_neon { - sha512_neon_core core; - sha512_block blk; - BinarySink_IMPLEMENTATION; - ssh_hash hash; -} sha512_neon; - -static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len); - -static ssh_hash *sha512_neon_new(const ssh_hashalg *alg) -{ - if (!sha512_hw_available_cached()) - return NULL; - - sha512_neon *s = snew(sha512_neon); - - s->hash.vt = alg; - BinarySink_INIT(s, sha512_neon_write); - BinarySink_DELEGATE_INIT(&s->hash, s); - return &s->hash; -} - -static void sha512_neon_reset(ssh_hash *hash) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - const uint64_t *iv = (const uint64_t *)hash->vt->extra; - - s->core.ab = vld1q_u64(iv); - s->core.cd = vld1q_u64(iv+2); - s->core.ef = vld1q_u64(iv+4); - s->core.gh = vld1q_u64(iv+6); - - sha512_block_setup(&s->blk); -} - -static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig) -{ - sha512_neon *copy = container_of(hcopy, sha512_neon, hash); - sha512_neon *orig = container_of(horig, sha512_neon, hash); - - *copy = *orig; /* structure copy */ - - BinarySink_COPIED(copy); - BinarySink_DELEGATE_INIT(©->hash, copy); -} - -static void sha512_neon_free(ssh_hash *hash) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - smemclr(s, sizeof(*s)); - sfree(s); -} - -static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len) -{ - sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon); - - while (len > 0) - if (sha512_block_write(&s->blk, &vp, &len)) - sha512_neon_block(&s->core, s->blk.block); -} - -static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - - sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); - - vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); - vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); - vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); - vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh))); -} - -static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest) -{ - sha512_neon *s = container_of(hash, sha512_neon, hash); - - sha512_block_pad(&s->blk, BinarySink_UPCAST(s)); - - vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab))); - vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd))); - vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef))); -} - -const ssh_hashalg ssh_sha512_hw = { - .new = sha512_neon_new, - .reset = 
sha512_neon_reset, - .copyfrom = sha512_neon_copyfrom, - .digest = sha512_neon_digest, - .free = sha512_neon_free, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"), - .extra = sha512_initial_state, -}; - -const ssh_hashalg ssh_sha384_hw = { - .new = sha512_neon_new, - .reset = sha512_neon_reset, - .copyfrom = sha512_neon_copyfrom, - .digest = sha384_neon_digest, - .free = sha512_neon_free, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"), - .extra = sha384_initial_state, -}; - -/* ---------------------------------------------------------------------- - * Stub functions if we have no hardware-accelerated SHA-512. In this - * case, sha512_hw_new returns NULL (though it should also never be - * selected by sha512_select, so the only thing that should even be - * _able_ to call it is testcrypt). As a result, the remaining vtable - * functions should never be called at all. - */ - -#elif HW_SHA512 == HW_SHA512_NONE - -static bool sha512_hw_available(void) -{ - return false; -} - -static ssh_hash *sha512_stub_new(const ssh_hashalg *alg) -{ - return NULL; -} - -#define STUB_BODY { unreachable("Should never be called"); } - -static void sha512_stub_reset(ssh_hash *hash) STUB_BODY -static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY -static void sha512_stub_free(ssh_hash *hash) STUB_BODY -static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY - -const ssh_hashalg ssh_sha512_hw = { - .new = sha512_stub_new, - .reset = sha512_stub_reset, - .copyfrom = sha512_stub_copyfrom, - .digest = sha512_stub_digest, - .free = sha512_stub_free, - .hlen = 64, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -const ssh_hashalg ssh_sha384_hw = { - .new = sha512_stub_new, - .reset = sha512_stub_reset, - .copyfrom = sha512_stub_copyfrom, - .digest = sha512_stub_digest, - .free = sha512_stub_free, - .hlen = 48, - .blocklen = 128, - HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"), -}; - -#endif /* HW_SHA512 */ diff --git a/crypto/sha512.h b/crypto/sha512.h new file mode 100644 index 00000000..98145558 --- /dev/null +++ b/crypto/sha512.h @@ -0,0 +1,131 @@ +/* + * Definitions likely to be helpful to multiple SHA-512 implementations. + */ + +/* + * The 'extra' structure used by SHA-512 implementations is used to + * include information about how to check if a given implementation is + * available at run time, and whether we've already checked. + */ +struct sha512_extra_mutable; +struct sha512_extra { + /* Pointer to the initial state (distinguishes SHA-384 from -512) */ + const uint64_t *initial_state; + + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. */ + struct sha512_extra_mutable *mut; +}; +struct sha512_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_availability(const struct sha512_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} + +/* + * Macro to define a pair of SHA-{384,512} vtables together with their + * 'extra' structure. 
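Each implementation file defines a family of functions named after itself and then invokes this macro once at the bottom. The portable implementation earlier in this patch, for instance, supplies roughly the following before its SHA512_VTABLES(sw, "unaccelerated") line (signatures copied from sha512-sw.c, shown here only to illustrate what the macro expects):

    static bool sha512_sw_available(void);
    static ssh_hash *sha512_sw_new(const ssh_hashalg *alg);
    static void sha512_sw_reset(ssh_hash *hash);
    static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig);
    static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest);
    #define sha384_sw_digest sha512_sw_digest /* hlen comes from the vtable */
    static void sha512_sw_free(ssh_hash *hash);

The macro then emits the two extern vtables ssh_sha512_sw and ssh_sha384_sw, both pointing at a single sha512_extra_mutable so that the availability check runs at most once per implementation; those vtables are what sha512-select.c and the test harnesses refer to.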
+ */ +#define SHA512_VTABLES(impl_c, impl_display) \ + static struct sha512_extra_mutable sha512_ ## impl_c ## _extra_mut; \ + static const struct sha512_extra sha384_ ## impl_c ## _extra = { \ + .initial_state = sha384_initial_state, \ + .check_available = sha512_ ## impl_c ## _available, \ + .mut = &sha512_ ## impl_c ## _extra_mut, \ + }; \ + static const struct sha512_extra sha512_ ## impl_c ## _extra = { \ + .initial_state = sha512_initial_state, \ + .check_available = sha512_ ## impl_c ## _available, \ + .mut = &sha512_ ## impl_c ## _extra_mut, \ + }; \ + const ssh_hashalg ssh_sha384_ ## impl_c = { \ + .new = sha512_ ## impl_c ## _new, \ + .reset = sha512_ ## impl_c ## _reset, \ + .copyfrom = sha512_ ## impl_c ## _copyfrom, \ + .digest = sha384_ ## impl_c ## _digest, \ + .free = sha512_ ## impl_c ## _free, \ + .hlen = 48, \ + .blocklen = 128, \ + HASHALG_NAMES_ANNOTATED("SHA-384", impl_display), \ + .extra = &sha384_ ## impl_c ## _extra, \ + }; \ + const ssh_hashalg ssh_sha512_ ## impl_c = { \ + .new = sha512_ ## impl_c ## _new, \ + .reset = sha512_ ## impl_c ## _reset, \ + .copyfrom = sha512_ ## impl_c ## _copyfrom, \ + .digest = sha512_ ## impl_c ## _digest, \ + .free = sha512_ ## impl_c ## _free, \ + .hlen = 64, \ + .blocklen = 128, \ + HASHALG_NAMES_ANNOTATED("SHA-512", impl_display), \ + .extra = &sha512_ ## impl_c ## _extra, \ + } + +extern const uint64_t sha512_initial_state[8]; +extern const uint64_t sha384_initial_state[8]; +extern const uint64_t sha512_round_constants[80]; + +#define SHA512_ROUNDS 80 + +typedef struct sha512_block sha512_block; +struct sha512_block { + uint8_t block[128]; + size_t used; + uint64_t lenhi, lenlo; +}; + +static inline void sha512_block_setup(sha512_block *blk) +{ + blk->used = 0; + blk->lenhi = blk->lenlo = 0; +} + +static inline bool sha512_block_write( + sha512_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? 
*len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + + size_t chunkbits = chunk << 3; + + blk->lenlo += chunkbits; + blk->lenhi += (blk->lenlo < chunkbits); + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs) +{ + uint64_t final_lenhi = blk->lenhi; + uint64_t final_lenlo = blk->lenlo; + size_t pad = 127 & (111 - blk->used); + + put_byte(bs, 0x80); + put_padding(bs, pad, 0); + put_uint64(bs, final_lenhi); + put_uint64(bs, final_lenlo); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} diff --git a/ssh.h b/ssh.h index 4162fc1e..24d7f1a5 100644 --- a/ssh.h +++ b/ssh.h @@ -953,22 +953,28 @@ extern const ssh_cipheralg ssh_3des_ssh2; extern const ssh_cipheralg ssh_des; extern const ssh_cipheralg ssh_des_sshcom_ssh2; extern const ssh_cipheralg ssh_aes256_sdctr; -extern const ssh_cipheralg ssh_aes256_sdctr_hw; +extern const ssh_cipheralg ssh_aes256_sdctr_ni; +extern const ssh_cipheralg ssh_aes256_sdctr_neon; extern const ssh_cipheralg ssh_aes256_sdctr_sw; extern const ssh_cipheralg ssh_aes256_cbc; -extern const ssh_cipheralg ssh_aes256_cbc_hw; +extern const ssh_cipheralg ssh_aes256_cbc_ni; +extern const ssh_cipheralg ssh_aes256_cbc_neon; extern const ssh_cipheralg ssh_aes256_cbc_sw; extern const ssh_cipheralg ssh_aes192_sdctr; -extern const ssh_cipheralg ssh_aes192_sdctr_hw; +extern const ssh_cipheralg ssh_aes192_sdctr_ni; +extern const ssh_cipheralg ssh_aes192_sdctr_neon; extern const ssh_cipheralg ssh_aes192_sdctr_sw; extern const ssh_cipheralg ssh_aes192_cbc; -extern const ssh_cipheralg ssh_aes192_cbc_hw; +extern const ssh_cipheralg ssh_aes192_cbc_ni; +extern const ssh_cipheralg ssh_aes192_cbc_neon; extern const ssh_cipheralg ssh_aes192_cbc_sw; extern const ssh_cipheralg ssh_aes128_sdctr; -extern const ssh_cipheralg ssh_aes128_sdctr_hw; +extern const ssh_cipheralg ssh_aes128_sdctr_ni; +extern const ssh_cipheralg ssh_aes128_sdctr_neon; extern const ssh_cipheralg ssh_aes128_sdctr_sw; extern const ssh_cipheralg ssh_aes128_cbc; -extern const ssh_cipheralg ssh_aes128_cbc_hw; +extern const ssh_cipheralg ssh_aes128_cbc_ni; +extern const ssh_cipheralg ssh_aes128_cbc_neon; extern const ssh_cipheralg ssh_aes128_cbc_sw; extern const ssh_cipheralg ssh_blowfish_ssh2_ctr; extern const ssh_cipheralg ssh_blowfish_ssh2; @@ -983,16 +989,18 @@ extern const ssh2_ciphers ssh2_arcfour; extern const ssh2_ciphers ssh2_ccp; extern const ssh_hashalg ssh_md5; extern const ssh_hashalg ssh_sha1; -extern const ssh_hashalg ssh_sha1_hw; +extern const ssh_hashalg ssh_sha1_ni; +extern const ssh_hashalg ssh_sha1_neon; extern const ssh_hashalg ssh_sha1_sw; extern const ssh_hashalg ssh_sha256; -extern const ssh_hashalg ssh_sha256_hw; +extern const ssh_hashalg ssh_sha256_ni; +extern const ssh_hashalg ssh_sha256_neon; extern const ssh_hashalg ssh_sha256_sw; extern const ssh_hashalg ssh_sha384; -extern const ssh_hashalg ssh_sha384_hw; +extern const ssh_hashalg ssh_sha384_neon; extern const ssh_hashalg ssh_sha384_sw; extern const ssh_hashalg ssh_sha512; -extern const ssh_hashalg ssh_sha512_hw; +extern const ssh_hashalg ssh_sha512_neon; extern const ssh_hashalg ssh_sha512_sw; extern const ssh_hashalg ssh_sha3_224; extern const ssh_hashalg ssh_sha3_256; @@ -1039,10 +1047,10 @@ ssh_hash *blake2b_new_general(unsigned hashlen); * itself. 
If so, then this function should be implemented in each * platform subdirectory. */ -bool platform_aes_hw_available(void); -bool platform_sha256_hw_available(void); -bool platform_sha1_hw_available(void); -bool platform_sha512_hw_available(void); +bool platform_aes_neon_available(void); +bool platform_sha256_neon_available(void); +bool platform_sha1_neon_available(void); +bool platform_sha512_neon_available(void); /* * PuTTY version number formatted as an SSH version string. diff --git a/test/cryptsuite.py b/test/cryptsuite.py index 757de673..9ed0c3f5 100755 --- a/test/cryptsuite.py +++ b/test/cryptsuite.py @@ -141,6 +141,14 @@ def mac_str(alg, key, message, cipher=None): def lcm(a, b): return a * b // gcd(a, b) +def get_implementations(alg): + return get_implementations_commasep(alg).decode("ASCII").split(",") + +def get_aes_impls(): + return [impl.rsplit("_", 1)[-1] + for impl in get_implementations("aes128_cbc") + if impl.startswith("aes128_cbc_")] + class MyTestBase(unittest.TestCase): "Intermediate class that adds useful helper methods." def assertEqualBin(self, x, y): @@ -1181,9 +1189,9 @@ class crypt(MyTestBase): # reference implementation of AES in Python. ('Mostly' # independent in that it was written by me.) - def vector(cipher, key, iv, plaintext, ciphertext): - for suffix in "hw", "sw": - c = ssh_cipher_new("{}_{}".format(cipher, suffix)) + def vector(cipherbase, key, iv, plaintext, ciphertext): + for cipher in get_implementations(cipherbase): + c = ssh_cipher_new(cipher) if c is None: return # skip test if HW AES not available ssh_cipher_setkey(c, key) ssh_cipher_setiv(c, iv) @@ -1302,7 +1310,7 @@ class crypt(MyTestBase): # We also test this at all three AES key lengths, in case the # core cipher routines are written separately for each one. 
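(A word on the two new helpers driving these loops: get_implementations() just splits the comma-separated list returned by testcrypt's get_implementations_commasep(), added further down in this patch, which always puts the generic selector name first, then the _sw variant, then whichever accelerated variants the build actually compiled in. So on an x86 build with AES-NI, get_implementations("aes128_cbc") comes back as ["aes128_cbc", "aes128_cbc_sw", "aes128_cbc_ni"], and get_aes_impls() strips that down to the suffixes ["sw", "ni"]; on a build with no acceleration, the lists contain only the generic and _sw entries.)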
- for suffix in "hw", "sw": + for suffix in get_aes_impls(): for keylen in [128, 192, 256]: hexTestValues = ["00000000", "00000001", "ffffffff"] for ivHexBytes in itertools.product(*([hexTestValues] * 4)): @@ -1325,7 +1333,7 @@ class crypt(MyTestBase): for keylen in [128, 192, 256]: decryptions = [] - for suffix in "hw", "sw": + for suffix in get_aes_impls(): c = ssh_cipher_new("aes{:d}_cbc_{}".format(keylen, suffix)) if c is None: continue ssh_cipher_setkey(c, test_key[:keylen//8]) @@ -1493,23 +1501,11 @@ class crypt(MyTestBase): ("3des_ssh1", 24, 8, False, unhex('d5f1cc25b8fbc62de63590b9b92344adf6dd72753273ff0fb32d4dbc6af858529129f34242f3d557eed3a5c84204eb4f868474294964cf70df5d8f45dfccfc45')), ("des_cbc", 8, 8, True, unhex('051524e77fb40e109d9fffeceacf0f28c940e2f8415ddccc117020bdd2612af5036490b12085d0e46129919b8e499f51cb82a4b341d7a1a1ea3e65201ef248f6')), ("aes256_ctr", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')), - ("aes256_ctr_hw", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')), - ("aes256_ctr_sw", 32, 16, False, unhex('b87b35e819f60f0f398a37b05d7bcf0b04ad4ebe570bd08e8bfa8606bafb0db2cfcd82baf2ccceae5de1a3c1ae08a8b8fdd884fdc5092031ea8ce53333e62976')), ("aes256_cbc", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')), - ("aes256_cbc_hw", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')), - ("aes256_cbc_sw", 32, 16, True, unhex('381cbb2fbcc48118d0094540242bd990dd6af5b9a9890edd013d5cad2d904f34b9261c623a452f32ea60e5402919a77165df12862742f1059f8c4a862f0827c5')), ("aes192_ctr", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')), - ("aes192_ctr_hw", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')), - ("aes192_ctr_sw", 24, 16, False, unhex('06bcfa7ccf075d723e12b724695a571a0fad67c56287ea609c410ac12749c51bb96e27fa7e1c7ea3b14792bbbb8856efb0617ebec24a8e4a87340d820cf347b8')), ("aes192_cbc", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')), - ("aes192_cbc_hw", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')), - ("aes192_cbc_sw", 24, 16, True, unhex('ac97f8698170f9c05341214bd7624d5d2efef8311596163dc597d9fe6c868971bd7557389974612cbf49ea4e7cc6cc302d4cc90519478dd88a4f09b530c141f3')), ("aes128_ctr", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')), - ("aes128_ctr_hw", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')), - ("aes128_ctr_sw", 16, 16, False, unhex('0ad4ddfd2360ec59d77dcb9a981f92109437c68c5e7f02f92017d9f424f89ab7850473ac0e19274125e740f252c84ad1f6ad138b6020a03bdaba2f3a7378ce1e')), ("aes128_cbc", 16, 16, True, 
unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')), - ("aes128_cbc_hw", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')), - ("aes128_cbc_sw", 16, 16, True, unhex('36de36917fb7955a711c8b0bf149b29120a77524f393ae3490f4ce5b1d5ca2a0d7064ce3c38e267807438d12c0e40cd0d84134647f9f4a5b11804a0cc5070e62')), ("blowfish_ctr", 32, 8, False, unhex('079daf0f859363ccf72e975764d709232ec48adc74f88ccd1f342683f0bfa89ca0e8dbfccc8d4d99005d6b61e9cc4e6eaa2fd2a8163271b94bf08ef212129f01')), ("blowfish_ssh2", 16, 8, True, unhex('e986b7b01f17dfe80ee34cac81fa029b771ec0f859ae21ae3ec3df1674bc4ceb54a184c6c56c17dd2863c3e9c068e76fd9aef5673465995f0d648b0bb848017f')), ("blowfish_ssh1", 32, 8, True, unhex('d44092a9035d895acf564ba0365d19570fbb4f125d5a4fd2a1812ee6c8a1911a51bb181fbf7d1a261253cab71ee19346eb477b3e7ecf1d95dd941e635c1a4fbf')), @@ -1517,36 +1513,37 @@ class crypt(MyTestBase): ("arcfour128", 16, None, False, unhex('fd4af54c5642cb29629e50a15d22e4944e21ffba77d0543b27590eafffe3886686d1aefae0484afc9e67edc0e67eb176bbb5340af1919ea39adfe866d066dd05')), ] - for alg, keylen, ivlen, simple_cbc, c in ciphers: - cipher = ssh_cipher_new(alg) - if cipher is None: - continue # hardware-accelerated cipher not available + for algbase, keylen, ivlen, simple_cbc, c in ciphers: + for alg in get_implementations(algbase): + cipher = ssh_cipher_new(alg) + if cipher is None: + continue # hardware-accelerated cipher not available - ssh_cipher_setkey(cipher, k[:keylen]) - if ivlen is not None: - ssh_cipher_setiv(cipher, iv[:ivlen]) - self.assertEqualBin(ssh_cipher_encrypt(cipher, p), c) - - ssh_cipher_setkey(cipher, k[:keylen]) - if ivlen is not None: - ssh_cipher_setiv(cipher, iv[:ivlen]) - self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p) - - if simple_cbc: - # CBC ciphers (other than the three-layered CBC used - # by SSH-1 3DES) have more specific semantics for - # their IV than 'some kind of starting state for the - # cipher mode': the IV is specifically supposed to - # represent the previous block of ciphertext. So we - # can check that, by supplying the IV _as_ a - # ciphertext block via a call to decrypt(), and seeing - # if that causes our test ciphertext to decrypt the - # same way as when we provided the same IV via - # setiv(). ssh_cipher_setkey(cipher, k[:keylen]) - ssh_cipher_decrypt(cipher, iv[:ivlen]) + if ivlen is not None: + ssh_cipher_setiv(cipher, iv[:ivlen]) + self.assertEqualBin(ssh_cipher_encrypt(cipher, p), c) + + ssh_cipher_setkey(cipher, k[:keylen]) + if ivlen is not None: + ssh_cipher_setiv(cipher, iv[:ivlen]) self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p) + if simple_cbc: + # CBC ciphers (other than the three-layered CBC used + # by SSH-1 3DES) have more specific semantics for + # their IV than 'some kind of starting state for the + # cipher mode': the IV is specifically supposed to + # represent the previous block of ciphertext. So we + # can check that, by supplying the IV _as_ a + # ciphertext block via a call to decrypt(), and seeing + # if that causes our test ciphertext to decrypt the + # same way as when we provided the same IV via + # setiv(). 
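(Why that trick works: in CBC decryption each plaintext block is P_i = Decrypt(C_i) xor C_{i-1}, with C_0 taken to be the IV, so after decrypting any block the cipher's chaining state is simply that block's ciphertext. Feeding the IV through ssh_cipher_decrypt() as if it were a ciphertext block therefore leaves the chaining state equal to the IV, exactly as setiv() would have done, which is what the assertion following this comment verifies; the output of that throwaway decryption is ignored.)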
+ ssh_cipher_setkey(cipher, k[:keylen]) + ssh_cipher_decrypt(cipher, iv[:ivlen]) + self.assertEqualBin(ssh_cipher_decrypt(cipher, c), p) + def testRSAKex(self): # Round-trip test of the RSA key exchange functions, plus a # hardcoded plain/ciphertext pair to guard against the @@ -2324,7 +2321,7 @@ Private-MAC: 5b1f6f4cc43eb0060d2c3e181bc0129343adba2b class standard_test_vectors(MyTestBase): def testAES(self): def vector(cipher, key, plaintext, ciphertext): - for suffix in "hw", "sw": + for suffix in get_aes_impls(): c = ssh_cipher_new("{}_{}".format(cipher, suffix)) if c is None: return # skip test if HW AES not available ssh_cipher_setkey(c, key) @@ -2540,7 +2537,7 @@ class standard_test_vectors(MyTestBase): unhex('56be34521d144c88dbb8c733f0e8b3f6')) def testSHA1(self): - for hashname in ['sha1_sw', 'sha1_hw']: + for hashname in get_implementations("sha1"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation @@ -2577,7 +2574,7 @@ class standard_test_vectors(MyTestBase): "cb0082c8f197d260991ba6a460e76e202bad27b3")) def testSHA256(self): - for hashname in ['sha256_sw', 'sha256_hw']: + for hashname in get_implementations("sha256"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation @@ -2621,7 +2618,7 @@ class standard_test_vectors(MyTestBase): "8ad3361763f7e9b2d95f4f0da6e1ccbc")) def testSHA384(self): - for hashname in ['sha384_sw', 'sha384_hw']: + for hashname in get_implementations("sha384"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation @@ -2663,7 +2660,7 @@ class standard_test_vectors(MyTestBase): '38e42b5c4de660f5de8fb2a5b2fbd2a3cbffd20cff1288c0')) def testSHA512(self): - for hashname in ['sha512_sw', 'sha512_hw']: + for hashname in get_implementations("sha512"): if ssh_hash_new(hashname) is None: continue # skip testing of unavailable HW implementation diff --git a/testcrypt.c b/testcrypt.c index 752947cf..1948da08 100644 --- a/testcrypt.c +++ b/testcrypt.c @@ -207,16 +207,24 @@ static const ssh_hashalg *get_hashalg(BinarySource *in) {"md5", &ssh_md5}, {"sha1", &ssh_sha1}, {"sha1_sw", &ssh_sha1_sw}, - {"sha1_hw", &ssh_sha1_hw}, {"sha256", &ssh_sha256}, - {"sha256_sw", &ssh_sha256_sw}, - {"sha256_hw", &ssh_sha256_hw}, {"sha384", &ssh_sha384}, - {"sha384_sw", &ssh_sha384_sw}, - {"sha384_hw", &ssh_sha384_hw}, {"sha512", &ssh_sha512}, + {"sha256_sw", &ssh_sha256_sw}, + {"sha384_sw", &ssh_sha384_sw}, {"sha512_sw", &ssh_sha512_sw}, - {"sha512_hw", &ssh_sha512_hw}, +#if HAVE_SHA_NI + {"sha1_ni", &ssh_sha1_ni}, + {"sha256_ni", &ssh_sha256_ni}, +#endif +#if HAVE_NEON_CRYPTO + {"sha1_neon", &ssh_sha1_neon}, + {"sha256_neon", &ssh_sha256_neon}, +#endif +#if HAVE_NEON_SHA512 + {"sha384_neon", &ssh_sha384_neon}, + {"sha512_neon", &ssh_sha512_neon}, +#endif {"sha3_224", &ssh_sha3_224}, {"sha3_256", &ssh_sha3_256}, {"sha3_384", &ssh_sha3_384}, @@ -290,23 +298,33 @@ static const ssh_cipheralg *get_cipheralg(BinarySource *in) {"3des_ssh1", &ssh_3des_ssh1}, {"des_cbc", &ssh_des}, {"aes256_ctr", &ssh_aes256_sdctr}, - {"aes256_ctr_hw", &ssh_aes256_sdctr_hw}, - {"aes256_ctr_sw", &ssh_aes256_sdctr_sw}, {"aes256_cbc", &ssh_aes256_cbc}, - {"aes256_cbc_hw", &ssh_aes256_cbc_hw}, - {"aes256_cbc_sw", &ssh_aes256_cbc_sw}, {"aes192_ctr", &ssh_aes192_sdctr}, - {"aes192_ctr_hw", &ssh_aes192_sdctr_hw}, - {"aes192_ctr_sw", &ssh_aes192_sdctr_sw}, {"aes192_cbc", &ssh_aes192_cbc}, - {"aes192_cbc_hw", &ssh_aes192_cbc_hw}, - {"aes192_cbc_sw", &ssh_aes192_cbc_sw}, {"aes128_ctr", &ssh_aes128_sdctr}, - 
{"aes128_ctr_hw", &ssh_aes128_sdctr_hw},
-    {"aes128_ctr_sw", &ssh_aes128_sdctr_sw},
     {"aes128_cbc", &ssh_aes128_cbc},
-    {"aes128_cbc_hw", &ssh_aes128_cbc_hw},
+    {"aes256_ctr_sw", &ssh_aes256_sdctr_sw},
+    {"aes256_cbc_sw", &ssh_aes256_cbc_sw},
+    {"aes192_ctr_sw", &ssh_aes192_sdctr_sw},
+    {"aes192_cbc_sw", &ssh_aes192_cbc_sw},
+    {"aes128_ctr_sw", &ssh_aes128_sdctr_sw},
     {"aes128_cbc_sw", &ssh_aes128_cbc_sw},
+#if HAVE_AES_NI
+    {"aes256_ctr_ni", &ssh_aes256_sdctr_ni},
+    {"aes256_cbc_ni", &ssh_aes256_cbc_ni},
+    {"aes192_ctr_ni", &ssh_aes192_sdctr_ni},
+    {"aes192_cbc_ni", &ssh_aes192_cbc_ni},
+    {"aes128_ctr_ni", &ssh_aes128_sdctr_ni},
+    {"aes128_cbc_ni", &ssh_aes128_cbc_ni},
+#endif
+#if HAVE_NEON_CRYPTO
+    {"aes256_ctr_neon", &ssh_aes256_sdctr_neon},
+    {"aes256_cbc_neon", &ssh_aes256_cbc_neon},
+    {"aes192_ctr_neon", &ssh_aes192_sdctr_neon},
+    {"aes192_cbc_neon", &ssh_aes192_cbc_neon},
+    {"aes128_ctr_neon", &ssh_aes128_sdctr_neon},
+    {"aes128_cbc_neon", &ssh_aes128_cbc_neon},
+#endif
     {"blowfish_ctr", &ssh_blowfish_ssh2_ctr},
     {"blowfish_ssh2", &ssh_blowfish_ssh2},
     {"blowfish_ssh1", &ssh_blowfish_ssh1},
@@ -1285,6 +1303,38 @@ strbuf *argon2_wrapper(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
 }
 #define argon2 argon2_wrapper
 
+strbuf *get_implementations_commasep(ptrlen alg)
+{
+    strbuf *out = strbuf_new();
+    put_datapl(out, alg);
+
+    if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) {
+        strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
+#if HAVE_AES_NI
+        strbuf_catf(out, ",%.*s_ni", PTRLEN_PRINTF(alg));
+#endif
+#if HAVE_NEON_CRYPTO
+        strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
+#endif
+    } else if (ptrlen_startswith(alg, PTRLEN_LITERAL("sha256"), NULL) ||
+               ptrlen_startswith(alg, PTRLEN_LITERAL("sha1"), NULL)) {
+        strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
+#if HAVE_SHA_NI
+        strbuf_catf(out, ",%.*s_ni", PTRLEN_PRINTF(alg));
+#endif
+#if HAVE_NEON_CRYPTO
+        strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
+#endif
+    } else if (ptrlen_startswith(alg, PTRLEN_LITERAL("sha512"), NULL)) {
+        strbuf_catf(out, ",%.*s_sw", PTRLEN_PRINTF(alg));
+#if HAVE_NEON_SHA512
+        strbuf_catf(out, ",%.*s_neon", PTRLEN_PRINTF(alg));
+#endif
+    }
+
+    return out;
+}
+
 #define OPTIONAL_PTR_FUNC(type) \
     typedef TD_val_##type TD_opt_val_##type; \
     static TD_opt_val_##type get_opt_val_##type(BinarySource *in) { \
diff --git a/testcrypt.h b/testcrypt.h
index 298abc0f..2e6e993b 100644
--- a/testcrypt.h
+++ b/testcrypt.h
@@ -315,6 +315,7 @@ FUNC1(uint, crc32_rfc1662, val_string_ptrlen)
 FUNC1(uint, crc32_ssh1, val_string_ptrlen)
 FUNC2(uint, crc32_update, uint, val_string_ptrlen)
 FUNC2(boolean, crcda_detect, val_string_ptrlen, val_string_ptrlen)
+FUNC1(val_string, get_implementations_commasep, val_string_ptrlen)
 
 /*
  * These functions aren't part of PuTTY's own API, but are additions
diff --git a/testsc.c b/testsc.c
index 93bf263a..b182d382 100644
--- a/testsc.c
+++ b/testsc.c
@@ -216,6 +216,31 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
     return x;
 }
 
+#if HAVE_AES_NI
+#define CIPHERS_AES_NI(X, Y) \
+    X(Y, ssh_aes256_sdctr_ni) \
+    X(Y, ssh_aes256_cbc_ni) \
+    X(Y, ssh_aes192_sdctr_ni) \
+    X(Y, ssh_aes192_cbc_ni) \
+    X(Y, ssh_aes128_sdctr_ni) \
+    X(Y, ssh_aes128_cbc_ni) \
+    /* end of list */
+#else
+#define CIPHERS_AES_NI(X, Y)
+#endif
+#if HAVE_NEON_CRYPTO
+#define CIPHERS_AES_NEON(X, Y) \
+    X(Y, ssh_aes256_sdctr_neon) \
+    X(Y, ssh_aes256_cbc_neon) \
+    X(Y, ssh_aes192_sdctr_neon) \
+    X(Y, ssh_aes192_cbc_neon) \
+    X(Y, ssh_aes128_sdctr_neon) \
+    X(Y, ssh_aes128_cbc_neon) \
+    /* end of list */
+#else
+#define CIPHERS_AES_NEON(X, Y)
+#endif
+
 /* Ciphers that we expect to pass this test. Blowfish and Arcfour are
  * intentionally omitted, because we already know they don't. */
 #define CIPHERS(X, Y) \
@@ -225,23 +250,19 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
     X(Y, ssh_des) \
     X(Y, ssh_des_sshcom_ssh2) \
     X(Y, ssh_aes256_sdctr) \
-    X(Y, ssh_aes256_sdctr_hw) \
-    X(Y, ssh_aes256_sdctr_sw) \
     X(Y, ssh_aes256_cbc) \
-    X(Y, ssh_aes256_cbc_hw) \
-    X(Y, ssh_aes256_cbc_sw) \
     X(Y, ssh_aes192_sdctr) \
-    X(Y, ssh_aes192_sdctr_hw) \
-    X(Y, ssh_aes192_sdctr_sw) \
     X(Y, ssh_aes192_cbc) \
-    X(Y, ssh_aes192_cbc_hw) \
-    X(Y, ssh_aes192_cbc_sw) \
     X(Y, ssh_aes128_sdctr) \
-    X(Y, ssh_aes128_sdctr_hw) \
-    X(Y, ssh_aes128_sdctr_sw) \
     X(Y, ssh_aes128_cbc) \
-    X(Y, ssh_aes128_cbc_hw) \
+    X(Y, ssh_aes256_sdctr_sw) \
+    X(Y, ssh_aes256_cbc_sw) \
+    X(Y, ssh_aes192_sdctr_sw) \
+    X(Y, ssh_aes192_cbc_sw) \
+    X(Y, ssh_aes128_sdctr_sw) \
     X(Y, ssh_aes128_cbc_sw) \
+    CIPHERS_AES_NI(X, Y) \
+    CIPHERS_AES_NEON(X, Y) \
     X(Y, ssh2_chacha20_poly1305) \
     /* end of list */
 
@@ -258,16 +279,35 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x))
 
 #define MAC_TESTLIST(X, name) X(mac_ ## name)
 
+#if HAVE_SHA_NI
+#define HASH_SHA_NI(X, Y) X(Y, ssh_sha256_ni) X(Y, ssh_sha1_ni)
+#else
+#define HASH_SHA_NI(X, Y)
+#endif
+#if HAVE_NEON_CRYPTO
+#define HASH_SHA_NEON(X, Y) X(Y, ssh_sha256_neon) X(Y, ssh_sha1_neon)
+#else
+#define HASH_SHA_NEON(X, Y)
+#endif
+#if HAVE_NEON_SHA512
+#define HASH_SHA512_NEON(X, Y) X(Y, ssh_sha384_neon) X(Y, ssh_sha512_neon)
+#else
+#define HASH_SHA512_NEON(X, Y)
+#endif
+
 #define HASHES(X, Y) \
     X(Y, ssh_md5) \
     X(Y, ssh_sha1) \
-    X(Y, ssh_sha1_hw) \
     X(Y, ssh_sha1_sw) \
     X(Y, ssh_sha256) \
-    X(Y, ssh_sha256_hw) \
     X(Y, ssh_sha256_sw) \
     X(Y, ssh_sha384) \
     X(Y, ssh_sha512) \
+    X(Y, ssh_sha384_sw) \
+    X(Y, ssh_sha512_sw) \
+    HASH_SHA_NI(X, Y) \
+    HASH_SHA_NEON(X, Y) \
+    HASH_SHA512_NEON(X, Y) \
     X(Y, ssh_sha3_224) \
     X(Y, ssh_sha3_256) \
     X(Y, ssh_sha3_384) \
diff --git a/unix/utils/arm_arch_queries.c b/unix/utils/arm_arch_queries.c
index 7c0957fa..cc3e4125 100644
--- a/unix/utils/arm_arch_queries.c
+++ b/unix/utils/arm_arch_queries.c
@@ -10,7 +10,7 @@
 
 #if defined __arm__ || defined __aarch64__
 
-bool platform_aes_hw_available(void)
+bool platform_aes_neon_available(void)
 {
 #if defined HWCAP_AES
     return getauxval(AT_HWCAP) & HWCAP_AES;
@@ -26,7 +26,7 @@ bool platform_aes_hw_available(void)
 #endif
 }
 
-bool platform_sha256_hw_available(void)
+bool platform_sha256_neon_available(void)
 {
 #if defined HWCAP_SHA2
     return getauxval(AT_HWCAP) & HWCAP_SHA2;
@@ -40,7 +40,7 @@ bool platform_sha256_hw_available(void)
 #endif
 }
 
-bool platform_sha1_hw_available(void)
+bool platform_sha1_neon_available(void)
 {
 #if defined HWCAP_SHA1
     return getauxval(AT_HWCAP) & HWCAP_SHA1;
@@ -54,7 +54,7 @@ bool platform_sha1_hw_available(void)
 #endif
 }
 
-bool platform_sha512_hw_available(void)
+bool platform_sha512_neon_available(void)
 {
 #if defined HWCAP_SHA512
     return getauxval(AT_HWCAP) & HWCAP_SHA512;
diff --git a/windows/utils/arm_arch_queries.c b/windows/utils/arm_arch_queries.c
index 05132b14..439a59fb 100644
--- a/windows/utils/arm_arch_queries.c
+++ b/windows/utils/arm_arch_queries.c
@@ -15,22 +15,22 @@
 #define IsProcessorFeaturePresent(...) false
 #endif
 
-bool platform_aes_hw_available(void)
+bool platform_aes_neon_available(void)
 {
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
-bool platform_sha256_hw_available(void)
+bool platform_sha256_neon_available(void)
 {
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
-bool platform_sha1_hw_available(void)
+bool platform_sha1_neon_available(void)
 {
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
-bool platform_sha512_hw_available(void)
+bool platform_sha512_neon_available(void)
 {
     /* As of 2020-12-24, as far as I can tell from docs.microsoft.com,
     * Windows on Arm does not yet provide a PF_ARM_V8_* flag for the