diff --git a/ssh.h b/ssh.h
index 1e732d78..d0eff519 100644
--- a/ssh.h
+++ b/ssh.h
@@ -856,6 +856,8 @@ extern const ssh_compression_alg ssh_zlib;
  * platform subdirectory.
  */
 bool platform_aes_hw_available(void);
+bool platform_sha256_hw_available(void);
+bool platform_sha1_hw_available(void);
 
 /*
  * PuTTY version number formatted as an SSH version string.
diff --git a/sshsh256.c b/sshsh256.c
index 11facbf8..9a54ea99 100644
--- a/sshsh256.c
+++ b/sshsh256.c
@@ -12,6 +12,7 @@
  */
 #define HW_SHA256_NONE 0
 #define HW_SHA256_NI 1
+#define HW_SHA256_NEON 2
 
 #ifdef _FORCE_SHA_NI
 # define HW_SHA256 HW_SHA256_NI
@@ -31,6 +32,37 @@
 # endif
 #endif
 
+#ifdef _FORCE_SHA_NEON
+# define HW_SHA256 HW_SHA256_NEON
+#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  /* Arm can potentially support both endiannesses, but this code
+   * hasn't been tested on anything but little. If anyone wants to
+   * run big-endian, they'll need to fix it first. */
+#elif defined __ARM_FEATURE_CRYPTO
+  /* If the Arm crypto extension is available already, we can
+   * support NEON SHA without having to enable anything by hand */
+# define HW_SHA256 HW_SHA256_NEON
+#elif defined(__clang__)
+# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
+    (defined(__aarch64__))
+  /* clang can enable the crypto extension in AArch64 using
+   * __attribute__((target)) */
+#  define HW_SHA256 HW_SHA256_NEON
+#  define USE_CLANG_ATTR_TARGET_AARCH64
+# endif
+#elif defined _MSC_VER
+  /* Visual Studio supports the crypto extension when targeting
+   * AArch64, but as of VS2017, the AArch32 header doesn't quite
+   * manage it (declaring the shae/shad intrinsics without a round
+   * key operand). */
+# if defined _M_ARM64
+#  define HW_SHA256 HW_SHA256_NEON
+#  if defined _M_ARM64
+#   define USE_ARM64_NEON_H /* unusual header name in this case */
+#  endif
+# endif
+#endif
+
 #if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
 # undef HW_SHA256
 # define HW_SHA256 HW_SHA256_NONE
@@ -655,6 +687,193 @@ const ssh_hashalg ssh_sha256_hw = {
     32, 64, "SHA-256",
 };
 
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-256 using Arm NEON.
+ */
+
+#elif HW_SHA256 == HW_SHA256_NEON
+
+/*
+ * Manually set the target architecture, if we decided above that we
+ * need to.
+ */
+#ifdef USE_CLANG_ATTR_TARGET_AARCH64
+/*
+ * A spot of cheating: redefine some ACLE feature macros before
+ * including arm_neon.h. Otherwise we won't get the SHA intrinsics
+ * defined by that header, because it will be looking at the settings
+ * for the whole translation unit rather than the ones we're going to
+ * put on some particular functions using __attribute__((target)).
+ */
+#define __ARM_NEON 1
+#define __ARM_FEATURE_CRYPTO 1
+#define FUNC_ISA __attribute__ ((target("neon,crypto")))
+#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
+
+#ifndef FUNC_ISA
+#define FUNC_ISA
+#endif
+
+#ifdef USE_ARM64_NEON_H
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+static bool sha256_hw_available(void)
+{
+    /*
+     * For Arm, we delegate to a per-platform detection function (see
+     * explanation in sshaes.c).
+     */
+    return platform_sha256_hw_available();
+}
+
+typedef struct sha256_neon_core sha256_neon_core;
+struct sha256_neon_core {
+    uint32x4_t abcd, efgh;
+};
+
+FUNC_ISA
+static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
+{
+    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
+}
+
+FUNC_ISA
+static inline uint32x4_t sha256_neon_schedule_update(
+    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
+{
+    return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
+}
+
+FUNC_ISA
+static inline sha256_neon_core sha256_neon_round4(
+    sha256_neon_core old, uint32x4_t sched, unsigned round)
+{
+    sha256_neon_core new;
+
+    uint32x4_t round_input = vaddq_u32(
+        sched, vld1q_u32(sha256_round_constants + round));
+    new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
+    new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
+    return new;
+}
+
+FUNC_ISA
+static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
+{
+    uint32x4_t s0, s1, s2, s3;
+    sha256_neon_core cr = *core;
+
+    s0 = sha256_neon_load_input(p);
+    cr = sha256_neon_round4(cr, s0, 0);
+    s1 = sha256_neon_load_input(p+16);
+    cr = sha256_neon_round4(cr, s1, 4);
+    s2 = sha256_neon_load_input(p+32);
+    cr = sha256_neon_round4(cr, s2, 8);
+    s3 = sha256_neon_load_input(p+48);
+    cr = sha256_neon_round4(cr, s3, 12);
+    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha256_neon_round4(cr, s0, 16);
+    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha256_neon_round4(cr, s1, 20);
+    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha256_neon_round4(cr, s2, 24);
+    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha256_neon_round4(cr, s3, 28);
+    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha256_neon_round4(cr, s0, 32);
+    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha256_neon_round4(cr, s1, 36);
+    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha256_neon_round4(cr, s2, 40);
+    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha256_neon_round4(cr, s3, 44);
+    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha256_neon_round4(cr, s0, 48);
+    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha256_neon_round4(cr, s1, 52);
+    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha256_neon_round4(cr, s2, 56);
+    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha256_neon_round4(cr, s3, 60);
+
+    core->abcd = vaddq_u32(core->abcd, cr.abcd);
+    core->efgh = vaddq_u32(core->efgh, cr.efgh);
+}
+
+typedef struct sha256_neon {
+    sha256_neon_core core;
+    sha256_block blk;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha256_neon;
+
+static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);
+
+static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
+{
+    if (!sha256_hw_available_cached())
+        return NULL;
+
+    sha256_neon *s = snew(sha256_neon);
+
+    s->core.abcd = vld1q_u32(sha256_initial_state);
+    s->core.efgh = vld1q_u32(sha256_initial_state + 4);
+
+    sha256_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_neon_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha256_neon_copy(ssh_hash *hash)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+    sha256_neon *copy = snew(sha256_neon);
+
+    *copy = *s; /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_neon_free(ssh_hash *hash)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_neon_block(&s->core, s->blk.block);
+}
+
+static void sha256_neon_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
+    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
+    sha256_neon_free(hash);
+}
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_neon_new, sha256_neon_copy, sha256_neon_final, sha256_neon_free,
+    32, 64, "SHA-256",
+};
+
 /* ----------------------------------------------------------------------
  * Stub functions if we have no hardware-accelerated SHA-256. In this
  * case, sha256_hw_new returns NULL (though it should also never be
diff --git a/sshsha.c b/sshsha.c
index c791760c..52626b74 100644
--- a/sshsha.c
+++ b/sshsha.c
@@ -12,6 +12,7 @@
  */
 #define HW_SHA1_NONE 0
 #define HW_SHA1_NI 1
+#define HW_SHA1_NEON 2
 
 #ifdef _FORCE_SHA_NI
 # define HW_SHA1 HW_SHA1_NI
@@ -31,6 +32,37 @@
 # endif
 #endif
 
+#ifdef _FORCE_SHA_NEON
+# define HW_SHA1 HW_SHA1_NEON
+#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  /* Arm can potentially support both endiannesses, but this code
+   * hasn't been tested on anything but little. If anyone wants to
+   * run big-endian, they'll need to fix it first. */
+#elif defined __ARM_FEATURE_CRYPTO
+  /* If the Arm crypto extension is available already, we can
+   * support NEON SHA without having to enable anything by hand */
+# define HW_SHA1 HW_SHA1_NEON
+#elif defined(__clang__)
+# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
+    (defined(__aarch64__))
+  /* clang can enable the crypto extension in AArch64 using
+   * __attribute__((target)) */
+#  define HW_SHA1 HW_SHA1_NEON
+#  define USE_CLANG_ATTR_TARGET_AARCH64
+# endif
+#elif defined _MSC_VER
+  /* Visual Studio supports the crypto extension when targeting
+   * AArch64, but as of VS2017, the AArch32 header doesn't quite
+   * manage it (declaring the shae/shad intrinsics without a round
+   * key operand). */
+# if defined _M_ARM64
+#  define HW_SHA1 HW_SHA1_NEON
+#  if defined _M_ARM64
+#   define USE_ARM64_NEON_H /* unusual header name in this case */
+#  endif
+# endif
+#endif
+
 #if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
 # undef HW_SHA1
 # define HW_SHA1 HW_SHA1_NONE
@@ -622,6 +654,222 @@ const ssh_hashalg ssh_sha1_hw = {
     20, 64, "SHA-1",
 };
 
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-1 using Arm NEON.
+ */
+
+#elif HW_SHA1 == HW_SHA1_NEON
+
+/*
+ * Manually set the target architecture, if we decided above that we
+ * need to.
+ */
+#ifdef USE_CLANG_ATTR_TARGET_AARCH64
+/*
+ * A spot of cheating: redefine some ACLE feature macros before
+ * including arm_neon.h. Otherwise we won't get the SHA intrinsics
+ * defined by that header, because it will be looking at the settings
+ * for the whole translation unit rather than the ones we're going to
+ * put on some particular functions using __attribute__((target)).
+ */
+#define __ARM_NEON 1
+#define __ARM_FEATURE_CRYPTO 1
+#define FUNC_ISA __attribute__ ((target("neon,crypto")))
+#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
+
+#ifndef FUNC_ISA
+#define FUNC_ISA
+#endif
+
+#ifdef USE_ARM64_NEON_H
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+static bool sha1_hw_available(void)
+{
+    /*
+     * For Arm, we delegate to a per-platform detection function (see
+     * explanation in sshaes.c).
+     */
+    return platform_sha1_hw_available();
+}
+
+typedef struct sha1_neon_core sha1_neon_core;
+struct sha1_neon_core {
+    uint32x4_t abcd;
+    uint32_t e;
+};
+
+/* ------------- got up to here ----------------------------------------- */
+
+FUNC_ISA
+static inline uint32x4_t sha1_neon_load_input(const uint8_t *p)
+{
+    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
+}
+
+FUNC_ISA
+static inline uint32x4_t sha1_neon_schedule_update(
+    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
+{
+    return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1);
+}
+
+/*
+ * SHA-1 has three different kinds of round, differing in whether they
+ * use the Ch, Maj or Par functions defined above. Each one uses a
+ * separate NEON instruction, so we define three inline functions for
+ * the different round types using this macro.
+ *
+ * The two batches of Par-type rounds also use a different constant,
+ * but that's passed in as an operand, so we don't need a fourth
+ * inline function just for that.
+ */
+#define SHA1_NEON_ROUND_FN(type)                                        \
+    FUNC_ISA static inline sha1_neon_core sha1_neon_round4_##type(      \
+        sha1_neon_core old, uint32x4_t sched, uint32x4_t constant)      \
+    {                                                                   \
+        sha1_neon_core new;                                             \
+        uint32x4_t round_input = vaddq_u32(sched, constant);            \
+        new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input);    \
+        new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0));   \
+        return new;                                                     \
+    }
+SHA1_NEON_ROUND_FN(c)
+SHA1_NEON_ROUND_FN(p)
+SHA1_NEON_ROUND_FN(m)
+
+FUNC_ISA
+static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p)
+{
+    uint32x4_t constant, s0, s1, s2, s3;
+    sha1_neon_core cr = *core;
+
+    constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT);
+    s0 = sha1_neon_load_input(p);
+    cr = sha1_neon_round4_c(cr, s0, constant);
+    s1 = sha1_neon_load_input(p + 16);
+    cr = sha1_neon_round4_c(cr, s1, constant);
+    s2 = sha1_neon_load_input(p + 32);
+    cr = sha1_neon_round4_c(cr, s2, constant);
+    s3 = sha1_neon_load_input(p + 48);
+    cr = sha1_neon_round4_c(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_c(cr, s0, constant);
+
+    constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_p(cr, s1, constant);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_p(cr, s2, constant);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_p(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_p(cr, s0, constant);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_p(cr, s1, constant);
+
+    constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_m(cr, s2, constant);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_m(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_m(cr, s0, constant);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_m(cr, s1, constant);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_m(cr, s2, constant);
+
+    constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_p(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_p(cr, s0, constant);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_p(cr, s1, constant);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_p(cr, s2, constant);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_p(cr, s3, constant);
+
+    core->abcd = vaddq_u32(core->abcd, cr.abcd);
+    core->e += cr.e;
+}
+
+typedef struct sha1_neon {
+    sha1_neon_core core;
+    sha1_block blk;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha1_neon;
+
+static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len);
+
+static ssh_hash *sha1_neon_new(const ssh_hashalg *alg)
+{
+    if (!sha1_hw_available_cached())
+        return NULL;
+
+    sha1_neon *s = snew(sha1_neon);
+
+    s->core.abcd = vld1q_u32(sha1_initial_state);
+    s->core.e = sha1_initial_state[4];
+
+    sha1_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_neon_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha1_neon_copy(ssh_hash *hash)
+{
+    sha1_neon *s = container_of(hash, sha1_neon, hash);
+    sha1_neon *copy = snew(sha1_neon);
+
+    *copy = *s; /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha1_neon_free(ssh_hash *hash)
+{
+    sha1_neon *s = container_of(hash, sha1_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_neon_block(&s->core, s->blk.block);
+}
+
+static void sha1_neon_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_neon *s = container_of(hash, sha1_neon, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+    vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
+    PUT_32BIT_MSB_FIRST(digest + 16, s->core.e);
+    sha1_neon_free(hash);
+}
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_neon_new, sha1_neon_copy, sha1_neon_final, sha1_neon_free,
+    20, 64, "SHA-1",
+};
+
 /* ----------------------------------------------------------------------
  * Stub functions if we have no hardware-accelerated SHA-1. In this
  * case, sha1_hw_new returns NULL (though it should also never be
diff --git a/unix/uxutils.c b/unix/uxutils.c
index 8622a724..fcbcc4d4 100644
--- a/unix/uxutils.c
+++ b/unix/uxutils.c
@@ -16,6 +16,28 @@ bool platform_aes_hw_available(void)
 #endif
 }
 
+bool platform_sha256_hw_available(void)
+{
+#if defined HWCAP_SHA2
+    return getauxval(AT_HWCAP) & HWCAP_SHA2;
+#elif defined HWCAP2_SHA2
+    return getauxval(AT_HWCAP2) & HWCAP2_SHA2;
+#else
+    return false;
+#endif
+}
+
+bool platform_sha1_hw_available(void)
+{
+#if defined HWCAP_SHA1
+    return getauxval(AT_HWCAP) & HWCAP_SHA1;
+#elif defined HWCAP2_SHA1
+    return getauxval(AT_HWCAP2) & HWCAP2_SHA1;
+#else
+    return false;
+#endif
+}
+
 #else
 
 bool platform_aes_hw_available(void)
@@ -23,4 +45,14 @@ bool platform_aes_hw_available(void)
     return false;
 }
 
+bool platform_sha256_hw_available(void)
+{
+    return false;
+}
+
+bool platform_sha1_hw_available(void)
+{
+    return false;
+}
+
 #endif
diff --git a/windows/winmiscs.c b/windows/winmiscs.c
index 738606dd..1ccd36aa 100644
--- a/windows/winmiscs.c
+++ b/windows/winmiscs.c
@@ -283,4 +283,14 @@ bool platform_aes_hw_available(void)
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
+bool platform_sha256_hw_available(void)
+{
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+}
+
+bool platform_sha1_hw_available(void)
+{
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+}
+
 #endif