mirror of https://git.tartarus.org/simon/putty.git
Support hardware AES on Arm platforms.
The refactored sshaes.c gives me a convenient slot to drop in a second hardware-accelerated AES implementation, similar to the existing one but using Arm NEON intrinsics in place of the x86 AES-NI ones.

This needed a minor structural change, because Arm systems are often heterogeneous, containing more than one type of CPU which won't necessarily all support the same set of architecture features. So you can't test at run time for the presence of AES acceleration by querying the CPU you're running on - even if you found a way to do it, the answer wouldn't be reliable once the OS started migrating your process between CPUs. Instead, you have to ask the OS itself, because only that knows about _all_ the CPUs on the system. So that means the aes_hw_available() mechanism has to extend a tentacle into each platform subdirectory.

The trickiest part was the nest of ifdefs that tries to detect whether the compiler can support the necessary parts. I had successful test-compiles on several compilers, and was able to run the code directly on an AArch64 tablet (so I know it passes cryptsuite), but it's likely that at least some Arm platforms won't be able to build it because of some path through the ifdefs that I haven't been able to test yet.
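To make the shape of that concrete, here is a minimal sketch of run-time dispatch on the OS's answer, with hypothetical names (aes_impl, aes_impl_sw, aes_impl_hw) standing in for PuTTY's actual cipher-vtable plumbing:

#include <stdbool.h>

/* Hypothetical sketch, not PuTTY's real selector code: the point is
 * only that the hardware/software choice is made once at run time by
 * asking the OS, not baked in at compile time. */
typedef struct aes_impl aes_impl;
extern const aes_impl aes_impl_sw, aes_impl_hw;
bool aes_hw_available(void);    /* on Arm, delegates to the platform query */

static const aes_impl *aes_choose_impl(void)
{
    return aes_hw_available() ? &aes_impl_hw : &aes_impl_sw;
}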
parent 1ce95c7ad8
commit 53747ad3ab
Recipe

@@ -283,7 +283,7 @@ WINMISC = MISCNET winstore winnet winhandl cmdline windefs winmisc winproxy
 + wintime winhsock errsock winsecur winucs miscucs winmiscs
 UXMISCCOMMON = MISCNETCOMMON uxstore uxsel uxnet uxpeer uxmisc time
 + uxfdsock errsock
-UXMISC = MISCNET UXMISCCOMMON uxproxy
+UXMISC = MISCNET UXMISCCOMMON uxproxy uxutils
 
 # SSH server.
 SSHSERVER = SSHCOMMON sshserver settings be_none logging ssh2kex-server

@@ -379,7 +379,7 @@ osxlaunch : [UT] osxlaunch
 
 fuzzterm : [UT] UXTERM CHARSET MISC version uxmisc uxucs fuzzterm time settings
 + uxstore be_none uxnogtk memory
-testcrypt : [UT] testcrypt SSHCRYPTO marshal utils memory tree234
+testcrypt : [UT] testcrypt SSHCRYPTO marshal utils memory tree234 uxutils
 testcrypt : [C] testcrypt SSHCRYPTO marshal utils memory tree234 winmiscs
 testzlib : [UT] testzlib sshzlib memory
ssh.h
@@ -920,6 +920,14 @@ extern const ssh2_macalg ssh_hmac_sha256;
extern const ssh2_macalg ssh2_poly1305;
extern const ssh_compression_alg ssh_zlib;

/*
 * On some systems, you have to detect hardware crypto acceleration by
 * asking the local OS API rather than OS-agnostically asking the CPU
 * itself. If so, then this function should be implemented in each
 * platform subdirectory.
 */
bool platform_aes_hw_available(void);

/*
 * PuTTY version number formatted as an SSH version string.
 */
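For comparison, a platform outside this commit could satisfy the same contract; this is a hypothetical sketch for macOS (not part of the commit), using the sysctlbyname(3) key Apple documents for the AES instructions on Apple silicon:

/* Hypothetical macOS implementation of the new hook; not in this
 * commit. "hw.optional.arm.FEAT_AES" is Apple's documented sysctl. */
#include <stdbool.h>
#include <stddef.h>
#include <sys/sysctl.h>

bool platform_aes_hw_available(void)
{
    int present = 0;
    size_t size = sizeof(present);
    if (sysctlbyname("hw.optional.arm.FEAT_AES",
                     &present, &size, NULL, 0) < 0)
        return false;
    return present != 0;
}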
sshaes.c
@@ -13,6 +13,7 @@
 */
#define HW_AES_NONE 0
#define HW_AES_NI 1
#define HW_AES_NEON 2

#ifdef _FORCE_AES_NI
# define HW_AES HW_AES_NI

@@ -32,6 +33,37 @@
# endif
#endif

#ifdef _FORCE_AES_NEON
# define HW_AES HW_AES_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* Arm can potentially support both endiannesses, but this code
   * hasn't been tested on anything but little. If anyone wants to
   * run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
  /* If the Arm crypto extension is available already, we can
   * support NEON AES without having to enable anything by hand */
# define HW_AES HW_AES_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
    (defined(__aarch64__))
  /* clang can enable the crypto extension in AArch64 using
   * __attribute__((target)) */
#  define HW_AES HW_AES_NEON
#  define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
  /* Visual Studio supports the crypto extension when targeting
   * AArch64, but as of VS2017, the AArch32 header doesn't quite
   * manage it (declaring the aese/aesd intrinsics without a round
   * key operand). */
# if defined _M_ARM64
#  define HW_AES HW_AES_NEON
#  define USE_ARM64_NEON_H /* unusual header name in this case */
# endif
#endif

#if defined _FORCE_SOFTWARE_AES || !defined HW_AES
# undef HW_AES
# define HW_AES HW_AES_NONE

@@ -39,6 +71,8 @@

#if HW_AES == HW_AES_NI
#define HW_NAME_SUFFIX " (AES-NI accelerated)"
#elif HW_AES == HW_AES_NEON
#define HW_NAME_SUFFIX " (NEON accelerated)"
#else
#define HW_NAME_SUFFIX " (!NONEXISTENT ACCELERATED VERSION!)"
#endif
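Worth noting for anyone reproducing the test-compiles mentioned in the commit message: the _FORCE_ macros pin the outcome of this nest, so each path can be exercised deliberately (compiler invocation illustrative only):

/* Pinning a path through the ifdef nest for testing:
 *
 *   cc -D_FORCE_AES_NEON ...       insist on the NEON implementation
 *   cc -D_FORCE_SOFTWARE_AES ...   insist on the software fallback
 *
 * With nothing forced and no compiler support detected, HW_AES falls
 * through to HW_AES_NONE just above. */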
@@ -1452,6 +1486,317 @@ NI_ENC_DEC(128)
NI_ENC_DEC(192)
NI_ENC_DEC(256)

/* ----------------------------------------------------------------------
 * Hardware-accelerated implementation of AES using Arm NEON.
 */

#elif HW_AES == HW_AES_NEON

/*
 * Manually set the target architecture, if we decided above that we
 * need to.
 */
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
 * A spot of cheating: redefine some ACLE feature macros before
 * including arm_neon.h. Otherwise we won't get the AES intrinsics
 * defined by that header, because it will be looking at the settings
 * for the whole translation unit rather than the ones we're going to
 * put on some particular functions using __attribute__((target)).
 */
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */

#ifndef FUNC_ISA
#define FUNC_ISA
#endif

#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

static bool aes_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform AES detection function,
     * because it has to be implemented by asking the operating system
     * rather than directly querying the CPU.
     *
     * That's because Arm systems commonly have multiple cores that
     * are not all alike, so any method of querying whether NEON
     * crypto instructions work on the _current_ CPU - even one as
     * crude as just trying one and catching the SIGILL - wouldn't
     * give an answer that you could still rely on the first time the
     * OS migrated your process to another CPU.
     */
    return platform_aes_hw_available();
}

/*
 * Core NEON encrypt/decrypt functions, one per length and direction.
 */

#define NEON_CIPHER(len, repmacro)                              \
    static FUNC_ISA inline uint8x16_t aes_neon_##len##_e(       \
        uint8x16_t v, const uint8x16_t *keysched)               \
    {                                                           \
        repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++)););   \
        v = vaeseq_u8(v, *keysched++);                          \
        return veorq_u8(v, *keysched);                          \
    }                                                           \
    static FUNC_ISA inline uint8x16_t aes_neon_##len##_d(       \
        uint8x16_t v, const uint8x16_t *keysched)               \
    {                                                           \
        repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++)););  \
        v = vaesdq_u8(v, *keysched++);                          \
        return veorq_u8(v, *keysched);                          \
    }

NEON_CIPHER(128, REP9)
NEON_CIPHER(192, REP11)
NEON_CIPHER(256, REP13)
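For concreteness, here is approximately what NEON_CIPHER(128, REP9) expands to, rewritten with a loop in place of the repetition macro (assuming REP9 repeats its argument nine times; the REP* macros aren't shown in this hunk):

/* Illustrative near-equivalent of the NEON_CIPHER(128, REP9)
 * expansion, under the assumption that REP9(x) emits x nine times. */
static FUNC_ISA inline uint8x16_t aes_neon_128_e_sketch(
    uint8x16_t v, const uint8x16_t *keysched)
{
    /* nine full rounds: vaeseq_u8 = AddRoundKey + SubBytes +
     * ShiftRows, then vaesmcq_u8 = MixColumns */
    for (int i = 0; i < 9; i++)
        v = vaesmcq_u8(vaeseq_u8(v, *keysched++));
    v = vaeseq_u8(v, *keysched++);  /* final round omits MixColumns */
    return veorq_u8(v, *keysched);  /* final AddRoundKey */
}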
/*
 * The main key expansion.
 */
static FUNC_ISA void aes_neon_key_expand(
    const unsigned char *key, size_t key_words,
    uint8x16_t *keysched_e, uint8x16_t *keysched_d)
{
    size_t rounds = key_words + 6;
    size_t sched_words = (rounds + 1) * 4;

    /*
     * Store the key schedule as 32-bit integers during expansion, so
     * that it's easy to refer back to individual previous words. We
     * collect them into the final uint8x16_t form at the end.
     */
    uint32_t sched[MAXROUNDKEYS * 4];

    unsigned rconpos = 0;

    for (size_t i = 0; i < sched_words; i++) {
        if (i < key_words) {
            sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i);
        } else {
            uint32_t temp = sched[i - 1];

            bool rotate_and_round_constant = (i % key_words == 0);
            bool sub = rotate_and_round_constant ||
                (key_words == 8 && i % 8 == 4);

            if (rotate_and_round_constant)
                temp = (temp << 24) | (temp >> 8);

            if (sub) {
                uint32x4_t v32 = vdupq_n_u32(temp);
                uint8x16_t v8 = vreinterpretq_u8_u32(v32);
                v8 = vaeseq_u8(v8, vdupq_n_u8(0));
                v32 = vreinterpretq_u32_u8(v8);
                temp = vget_lane_u32(vget_low_u32(v32), 0);
            }

            if (rotate_and_round_constant) {
                assert(rconpos < lenof(key_setup_round_constants));
                temp ^= key_setup_round_constants[rconpos++];
            }

            sched[i] = sched[i - key_words] ^ temp;
        }
    }

    /*
     * Combine the key schedule words into uint8x16_t vectors and
     * store them in the output context.
     */
    for (size_t round = 0; round <= rounds; round++)
        keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round));

    smemclr(sched, sizeof(sched));

    /*
     * Now prepare the modified keys for the inverse cipher.
     */
    for (size_t eround = 0; eround <= rounds; eround++) {
        size_t dround = rounds - eround;
        uint8x16_t rkey = keysched_e[eround];
        if (eround && dround)          /* neither first nor last */
            rkey = vaesimcq_u8(rkey);
        keysched_d[dround] = rkey;
    }
}
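A quick sanity check on the arithmetic above:

/* rounds = key_words + 6 and sched_words = (rounds + 1) * 4, so:
 *   AES-128: key_words=4 -> 10 rounds -> 44 schedule words
 *   AES-192: key_words=6 -> 12 rounds -> 52 schedule words
 *   AES-256: key_words=8 -> 14 rounds -> 60 schedule words
 * i.e. up to 15 round keys, which MAXROUNDKEYS must accommodate. */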
/*
 * Auxiliary routine to reverse the byte order of a vector, so that
 * the SDCTR IV can be made big-endian for feeding to the cipher.
 *
 * In fact we don't need to reverse the vector _all_ the way; we leave
 * the two lanes in MSW,LSW order, because that makes no difference to
 * the efficiency of the increment. That way we only have to reverse
 * bytes within each lane in this function.
 */
static FUNC_ISA inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v)
{
    return vrev64q_u8(v);
}
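Concretely, vrev64q_u8 reverses bytes within each 64-bit lane while leaving the lanes where they are:

/* Byte-level picture of aes_neon_sdctr_reverse:
 *   in : b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 b10 b11 b12 b13 b14 b15
 *   out: b7 b6 b5 b4 b3 b2 b1 b0 | b15 b14 b13 b12 b11 b10 b9 b8 */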
/*
 * Auxiliary routine to increment the 128-bit counter used in SDCTR
 * mode. There's no instruction to treat a 128-bit vector as a single
 * long integer, so instead we have to increment the bottom half
 * unconditionally, and the top half if the bottom half started off as
 * all 1s (in which case there was about to be a carry).
 */
static FUNC_ISA inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in)
{
#ifdef __aarch64__
    /* There will be a carry if the low 64 bits are all 1s. */
    uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF);
    uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1);

    /* Make a word whose bottom half is unconditionally all 1s, and
     * the top half is 'carry', i.e. all 0s most of the time but all
     * 1s if we need to increment the top half. Then that word is what
     * we need to _subtract_ from the input counter. */
    uint64x2_t subtrahend = vcombine_u64(carry, all1);
#else
    /* AArch32 doesn't have comparisons that operate on a 64-bit lane,
     * so we start by comparing each 32-bit half of the low 64 bits
     * _separately_ to all-1s. */
    uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF);
    uint32x2_t carry = vceq_u32(
        vget_high_u32(vreinterpretq_u32_u8(in)), all1);

    /* Swap the 32-bit words of the compare output, and AND with the
     * unswapped version. Now carry is all 1s iff the bottom half of
     * the input counter was all 1s, and all 0s otherwise. */
    carry = vand_u32(carry, vrev64_u32(carry));

    /* Now make the vector to subtract in the same way as above. */
    uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1));
#endif

    return vreinterpretq_u8_u64(
        vsubq_u64(vreinterpretq_u64_u8(in), subtrahend));
}
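A worked example of the subtract trick, for the AArch64 path (recall that after aes_neon_sdctr_reverse, lane 0 holds the most significant word and lane 1 the least):

/* Common case, LSW != all-1s, so carry = 0:
 *   subtrahend = { 0, 0xFFFFFFFFFFFFFFFF }
 *   in - subtrahend adds 1 to the LSW mod 2^64; MSW untouched.
 *
 * Carry case, in = { 0, 0xFFFFFFFFFFFFFFFF }, so carry = all 1s:
 *   subtrahend = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF }
 *   in - subtrahend = { 1, 0 }: the increment carries into the MSW. */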
/*
 * The SSH interface and the cipher modes.
 */

typedef struct aes_neon_context aes_neon_context;
struct aes_neon_context {
    uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv;

    ssh2_cipher ciph;
};

static ssh2_cipher *aes_hw_new(const ssh2_cipheralg *alg)
{
    if (!aes_hw_available_cached())
        return NULL;

    aes_neon_context *ctx = snew(aes_neon_context);
    ctx->ciph.vt = alg;
    return &ctx->ciph;
}

static void aes_hw_free(ssh2_cipher *ciph)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}

static void aes_hw_setkey(ssh2_cipher *ciph, const void *vkey)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
    const unsigned char *key = (const unsigned char *)vkey;

    aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32,
                        ctx->keysched_e, ctx->keysched_d);
}

static FUNC_ISA void aes_hw_setiv_cbc(ssh2_cipher *ciph, const void *iv)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
    ctx->iv = vld1q_u8(iv);
}

static FUNC_ISA void aes_hw_setiv_sdctr(ssh2_cipher *ciph, const void *iv)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
    uint8x16_t counter = vld1q_u8(iv);
    ctx->iv = aes_neon_sdctr_reverse(counter);
}

typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched);

static FUNC_ISA inline void aes_cbc_neon_encrypt(
    ssh2_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);

    for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
         blk < finish; blk += 16) {
        uint8x16_t plaintext = vld1q_u8(blk);
        uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv);
        uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e);
        vst1q_u8(blk, ciphertext);
        ctx->iv = ciphertext;
    }
}

static FUNC_ISA inline void aes_cbc_neon_decrypt(
    ssh2_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);

    for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
         blk < finish; blk += 16) {
        uint8x16_t ciphertext = vld1q_u8(blk);
        uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d);
        uint8x16_t plaintext = veorq_u8(decrypted, ctx->iv);
        vst1q_u8(blk, plaintext);
        ctx->iv = ciphertext;
    }
}

static FUNC_ISA inline void aes_sdctr_neon(
    ssh2_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
    aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);

    for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
         blk < finish; blk += 16) {
        uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv);
        uint8x16_t keystream = encrypt(counter, ctx->keysched_e);
        uint8x16_t input = vld1q_u8(blk);
        uint8x16_t output = veorq_u8(input, keystream);
        vst1q_u8(blk, output);
        ctx->iv = aes_neon_sdctr_increment(ctx->iv);
    }
}

#define NEON_ENC_DEC(len)                                               \
    static FUNC_ISA void aes##len##_cbc_hw_encrypt(                     \
        ssh2_cipher *ciph, void *vblk, int blklen)                      \
    { aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); }   \
    static FUNC_ISA void aes##len##_cbc_hw_decrypt(                     \
        ssh2_cipher *ciph, void *vblk, int blklen)                      \
    { aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); }   \
    static FUNC_ISA void aes##len##_sdctr_hw(                           \
        ssh2_cipher *ciph, void *vblk, int blklen)                      \
    { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); }

NEON_ENC_DEC(128)
NEON_ENC_DEC(192)
NEON_ENC_DEC(256)

/* ----------------------------------------------------------------------
 * Stub functions if we have no hardware-accelerated AES. In this
 * case, aes_hw_new returns NULL (though it should also never be
unix/uxutils.c (new file)

@@ -0,0 +1,26 @@
#include "ssh.h"

#if defined __linux__ && (defined __arm__ || defined __aarch64__)

#include <sys/auxv.h>
#include <asm/hwcap.h>

bool platform_aes_hw_available(void)
{
#if defined HWCAP_AES
    return getauxval(AT_HWCAP) & HWCAP_AES;
#elif defined HWCAP2_AES
    return getauxval(AT_HWCAP2) & HWCAP2_AES;
#else
    return false;
#endif
}

#else

bool platform_aes_hw_available(void)
{
    return false;
}

#endif
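A hypothetical standalone harness (not part of the commit) for checking what this reports on a given machine:

/* Hypothetical test harness: link against unix/uxutils.c and run on
 * the target machine. */
#include <stdio.h>
#include <stdbool.h>

bool platform_aes_hw_available(void);   /* from unix/uxutils.c */

int main(void)
{
    printf("AES acceleration: %s\n",
           platform_aes_hw_available() ? "available" : "not available");
    return 0;
}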
windows/winmiscs.c

@@ -275,3 +275,12 @@ uintmax_t strtoumax(const char *nptr, char **endptr, int base)
}

#endif

#if defined _M_ARM || defined _M_ARM64

bool platform_aes_hw_available(void)
{
    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
}

#endif