1
0
mirror of https://git.tartarus.org/simon/putty.git synced 2025-01-25 01:02:24 +00:00
putty-source/crypto/aes-neon.c

360 lines
13 KiB
C
Raw Permalink Normal View History

Break up crypto modules containing HW acceleration. This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those source files previously contained multiple implementations of the algorithm, enabled or disabled by ifdefs detecting whether they would work on a given compiler. And in order to get advanced machine instructions like AES-NI or NEON crypto into the output file when the compile flags hadn't enabled them, we had to do nasty stuff with compiler-specific pragmas or attributes. Now we can do the detection at cmake time, and enable advanced instructions in the more sensible way, by compile-time flags. So I've broken up each of these modules into lots of sub-pieces: a file called (e.g.) 'foo-common.c' containing common definitions across all implementations (such as round constants), one called 'foo-select.c' containing the top-level vtable(s), and a separate file for each implementation exporting just the vtable(s) for that implementation. One advantage of this is that it depends a lot less on compiler- specific bodgery. My particular least favourite part of the previous setup was the part where I had to _manually_ define some Arm ACLE feature macros before including <arm_neon.h>, so that it would define the intrinsics I wanted. Now I'm enabling interesting architecture features in the normal way, on the compiler command line, there's no need for that kind of trick: the right feature macros are already defined and <arm_neon.h> does the right thing. Another change in this reorganisation is that I've stopped assuming there's just one hardware implementation per platform. Previously, the accelerated vtables were called things like sha256_hw, and varied between FOO-NI and NEON depending on platform; and the selection code would simply ask 'is hw available? if so, use hw, else sw'. Now, each HW acceleration strategy names its vtable its own way, and the selection vtable has a whole list of possibilities to iterate over looking for a supported one. So if someone feels like writing a second accelerated implementation of something for a given platform - for example, I've heard you can use plain NEON to speed up AES somewhat even without the crypto extension - then it will now have somewhere to drop in alongside the existing ones.
2021-04-19 05:42:12 +00:00
/* ----------------------------------------------------------------------
* Hardware-accelerated implementation of AES using Arm NEON.
*/
#include "ssh.h"
#include "aes.h"
#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
static bool aes_neon_available(void)
{
/*
* For Arm, we delegate to a per-platform AES detection function,
* because it has to be implemented by asking the operating system
* rather than directly querying the CPU.
*
* That's because Arm systems commonly have multiple cores that
* are not all alike, so any method of querying whether NEON
* crypto instructions work on the _current_ CPU - even one as
* crude as just trying one and catching the SIGILL - wouldn't
* give an answer that you could still rely on the first time the
* OS migrated your process to another CPU.
*/
return platform_aes_neon_available();
}
/*
* Core NEON encrypt/decrypt functions, one per length and direction.
*/
#define NEON_CIPHER(len, repmacro) \
static inline uint8x16_t aes_neon_##len##_e( \
uint8x16_t v, const uint8x16_t *keysched) \
{ \
repmacro(v = vaesmcq_u8(vaeseq_u8(v, *keysched++));); \
v = vaeseq_u8(v, *keysched++); \
return veorq_u8(v, *keysched); \
} \
static inline uint8x16_t aes_neon_##len##_d( \
uint8x16_t v, const uint8x16_t *keysched) \
{ \
repmacro(v = vaesimcq_u8(vaesdq_u8(v, *keysched++));); \
v = vaesdq_u8(v, *keysched++); \
return veorq_u8(v, *keysched); \
}
NEON_CIPHER(128, REP9)
NEON_CIPHER(192, REP11)
NEON_CIPHER(256, REP13)
/*
* The main key expansion.
*/
static void aes_neon_key_expand(
const unsigned char *key, size_t key_words,
uint8x16_t *keysched_e, uint8x16_t *keysched_d)
{
size_t rounds = key_words + 6;
size_t sched_words = (rounds + 1) * 4;
/*
* Store the key schedule as 32-bit integers during expansion, so
* that it's easy to refer back to individual previous words. We
* collect them into the final uint8x16_t form at the end.
*/
uint32_t sched[MAXROUNDKEYS * 4];
unsigned rconpos = 0;
for (size_t i = 0; i < sched_words; i++) {
if (i < key_words) {
sched[i] = GET_32BIT_LSB_FIRST(key + 4 * i);
} else {
uint32_t temp = sched[i - 1];
bool rotate_and_round_constant = (i % key_words == 0);
bool sub = rotate_and_round_constant ||
(key_words == 8 && i % 8 == 4);
if (rotate_and_round_constant)
temp = (temp << 24) | (temp >> 8);
if (sub) {
uint32x4_t v32 = vdupq_n_u32(temp);
uint8x16_t v8 = vreinterpretq_u8_u32(v32);
v8 = vaeseq_u8(v8, vdupq_n_u8(0));
v32 = vreinterpretq_u32_u8(v8);
temp = vget_lane_u32(vget_low_u32(v32), 0);
}
if (rotate_and_round_constant) {
assert(rconpos < lenof(aes_key_setup_round_constants));
temp ^= aes_key_setup_round_constants[rconpos++];
}
sched[i] = sched[i - key_words] ^ temp;
}
}
/*
* Combine the key schedule words into uint8x16_t vectors and
* store them in the output context.
*/
for (size_t round = 0; round <= rounds; round++)
keysched_e[round] = vreinterpretq_u8_u32(vld1q_u32(sched + 4*round));
smemclr(sched, sizeof(sched));
/*
* Now prepare the modified keys for the inverse cipher.
*/
for (size_t eround = 0; eround <= rounds; eround++) {
size_t dround = rounds - eround;
uint8x16_t rkey = keysched_e[eround];
if (eround && dround) /* neither first nor last */
rkey = vaesimcq_u8(rkey);
keysched_d[dround] = rkey;
}
}
/*
* Auxiliary routine to reverse the byte order of a vector, so that
* the SDCTR IV can be made big-endian for feeding to the cipher.
*
* In fact we don't need to reverse the vector _all_ the way; we leave
* the two lanes in MSW,LSW order, because that makes no difference to
* the efficiency of the increment. That way we only have to reverse
* bytes within each lane in this function.
*/
static inline uint8x16_t aes_neon_sdctr_reverse(uint8x16_t v)
{
return vrev64q_u8(v);
}
/*
* Auxiliary routine to increment the 128-bit counter used in SDCTR
* mode. There's no instruction to treat a 128-bit vector as a single
* long integer, so instead we have to increment the bottom half
* unconditionally, and the top half if the bottom half started off as
* all 1s (in which case there was about to be a carry).
*/
static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in)
{
#ifdef __aarch64__
/* There will be a carry if the low 64 bits are all 1s. */
uint64x1_t all1 = vcreate_u64(0xFFFFFFFFFFFFFFFF);
uint64x1_t carry = vceq_u64(vget_high_u64(vreinterpretq_u64_u8(in)), all1);
/* Make a word whose bottom half is unconditionally all 1s, and
* the top half is 'carry', i.e. all 0s most of the time but all
* 1s if we need to increment the top half. Then that word is what
* we need to _subtract_ from the input counter. */
uint64x2_t subtrahend = vcombine_u64(carry, all1);
#else
/* AArch32 doesn't have comparisons that operate on a 64-bit lane,
* so we start by comparing each 32-bit half of the low 64 bits
* _separately_ to all-1s. */
uint32x2_t all1 = vdup_n_u32(0xFFFFFFFF);
uint32x2_t carry = vceq_u32(
vget_high_u32(vreinterpretq_u32_u8(in)), all1);
/* Swap the 32-bit words of the compare output, and AND with the
* unswapped version. Now carry is all 1s iff the bottom half of
* the input counter was all 1s, and all 0s otherwise. */
carry = vand_u32(carry, vrev64_u32(carry));
/* Now make the vector to subtract in the same way as above. */
uint64x2_t subtrahend = vreinterpretq_u64_u32(vcombine_u32(carry, all1));
#endif
return vreinterpretq_u8_u64(
vsubq_u64(vreinterpretq_u64_u8(in), subtrahend));
}
Implement AES-GCM using the @openssh.com protocol IDs. I only recently found out that OpenSSH defined their own protocol IDs for AES-GCM, defined to work the same as the standard ones except that they fixed the semantics for how you select the linked cipher+MAC pair during key exchange. (RFC 5647 defines protocol ids for AES-GCM in both the cipher and MAC namespaces, and requires that you MUST select both or neither - but this contradicts the selection policy set out in the base SSH RFCs, and there's no discussion of how you resolve a conflict between them! OpenSSH's answer is to do it the same way ChaCha20-Poly1305 works, because that will ensure the two suites don't fight.) People do occasionally ask us for this linked cipher/MAC pair, and now I know it's actually feasible, I've implemented it, including a pair of vector implementations for x86 and Arm using their respective architecture extensions for multiplying polynomials over GF(2). Unlike ChaCha20-Poly1305, I've kept the cipher and MAC implementations in separate objects, with an arm's-length link between them that the MAC uses when it needs to encrypt single cipher blocks to use as the inputs to the MAC algorithm. That enables the cipher and the MAC to be independently selected from their hardware-accelerated versions, just in case someone runs on a system that has polynomial multiplication instructions but not AES acceleration, or vice versa. There's a fourth implementation of the GCM MAC, which is a pure software implementation of the same algorithm used in the vectorised versions. It's too slow to use live, but I've kept it in the code for future testing needs, and because it's a convenient place to dump my design comments. The vectorised implementations are fairly crude as far as optimisation goes. I'm sure serious x86 _or_ Arm optimisation engineers would look at them and laugh. But GCM is a fast MAC compared to HMAC-SHA-256 (indeed compared to HMAC-anything-at-all), so it should at least be good enough to use. And we've got a working version with some tests now, so if someone else wants to improve them, they can.
2022-08-16 17:36:58 +00:00
/*
* Much simpler auxiliary routine to increment the counter for GCM
* mode. This only has to increment the low word.
*/
static inline uint8x16_t aes_neon_gcm_increment(uint8x16_t in)
{
uint32x4_t inw = vreinterpretq_u32_u8(in);
uint32x4_t ONE = vcombine_u32(vcreate_u32(0), vcreate_u32(1));
inw = vaddq_u32(inw, ONE);
return vreinterpretq_u8_u32(inw);
}
Break up crypto modules containing HW acceleration. This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those source files previously contained multiple implementations of the algorithm, enabled or disabled by ifdefs detecting whether they would work on a given compiler. And in order to get advanced machine instructions like AES-NI or NEON crypto into the output file when the compile flags hadn't enabled them, we had to do nasty stuff with compiler-specific pragmas or attributes. Now we can do the detection at cmake time, and enable advanced instructions in the more sensible way, by compile-time flags. So I've broken up each of these modules into lots of sub-pieces: a file called (e.g.) 'foo-common.c' containing common definitions across all implementations (such as round constants), one called 'foo-select.c' containing the top-level vtable(s), and a separate file for each implementation exporting just the vtable(s) for that implementation. One advantage of this is that it depends a lot less on compiler- specific bodgery. My particular least favourite part of the previous setup was the part where I had to _manually_ define some Arm ACLE feature macros before including <arm_neon.h>, so that it would define the intrinsics I wanted. Now I'm enabling interesting architecture features in the normal way, on the compiler command line, there's no need for that kind of trick: the right feature macros are already defined and <arm_neon.h> does the right thing. Another change in this reorganisation is that I've stopped assuming there's just one hardware implementation per platform. Previously, the accelerated vtables were called things like sha256_hw, and varied between FOO-NI and NEON depending on platform; and the selection code would simply ask 'is hw available? if so, use hw, else sw'. Now, each HW acceleration strategy names its vtable its own way, and the selection vtable has a whole list of possibilities to iterate over looking for a supported one. So if someone feels like writing a second accelerated implementation of something for a given platform - for example, I've heard you can use plain NEON to speed up AES somewhat even without the crypto extension - then it will now have somewhere to drop in alongside the existing ones.
2021-04-19 05:42:12 +00:00
/*
* The SSH interface and the cipher modes.
*/
typedef struct aes_neon_context aes_neon_context;
struct aes_neon_context {
uint8x16_t keysched_e[MAXROUNDKEYS], keysched_d[MAXROUNDKEYS], iv;
ssh_cipher ciph;
};
static ssh_cipher *aes_neon_new(const ssh_cipheralg *alg)
{
const struct aes_extra *extra = (const struct aes_extra *)alg->extra;
if (!check_availability(extra))
return NULL;
aes_neon_context *ctx = snew(aes_neon_context);
ctx->ciph.vt = alg;
return &ctx->ciph;
}
static void aes_neon_free(ssh_cipher *ciph)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
smemclr(ctx, sizeof(*ctx));
sfree(ctx);
}
static void aes_neon_setkey(ssh_cipher *ciph, const void *vkey)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
const unsigned char *key = (const unsigned char *)vkey;
aes_neon_key_expand(key, ctx->ciph.vt->real_keybits / 32,
ctx->keysched_e, ctx->keysched_d);
Break up crypto modules containing HW acceleration. This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those source files previously contained multiple implementations of the algorithm, enabled or disabled by ifdefs detecting whether they would work on a given compiler. And in order to get advanced machine instructions like AES-NI or NEON crypto into the output file when the compile flags hadn't enabled them, we had to do nasty stuff with compiler-specific pragmas or attributes. Now we can do the detection at cmake time, and enable advanced instructions in the more sensible way, by compile-time flags. So I've broken up each of these modules into lots of sub-pieces: a file called (e.g.) 'foo-common.c' containing common definitions across all implementations (such as round constants), one called 'foo-select.c' containing the top-level vtable(s), and a separate file for each implementation exporting just the vtable(s) for that implementation. One advantage of this is that it depends a lot less on compiler- specific bodgery. My particular least favourite part of the previous setup was the part where I had to _manually_ define some Arm ACLE feature macros before including <arm_neon.h>, so that it would define the intrinsics I wanted. Now I'm enabling interesting architecture features in the normal way, on the compiler command line, there's no need for that kind of trick: the right feature macros are already defined and <arm_neon.h> does the right thing. Another change in this reorganisation is that I've stopped assuming there's just one hardware implementation per platform. Previously, the accelerated vtables were called things like sha256_hw, and varied between FOO-NI and NEON depending on platform; and the selection code would simply ask 'is hw available? if so, use hw, else sw'. Now, each HW acceleration strategy names its vtable its own way, and the selection vtable has a whole list of possibilities to iterate over looking for a supported one. So if someone feels like writing a second accelerated implementation of something for a given platform - for example, I've heard you can use plain NEON to speed up AES somewhat even without the crypto extension - then it will now have somewhere to drop in alongside the existing ones.
2021-04-19 05:42:12 +00:00
}
static void aes_neon_setiv_cbc(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
ctx->iv = vld1q_u8(iv);
}
static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t counter = vld1q_u8(iv);
ctx->iv = aes_neon_sdctr_reverse(counter);
}
Implement AES-GCM using the @openssh.com protocol IDs. I only recently found out that OpenSSH defined their own protocol IDs for AES-GCM, defined to work the same as the standard ones except that they fixed the semantics for how you select the linked cipher+MAC pair during key exchange. (RFC 5647 defines protocol ids for AES-GCM in both the cipher and MAC namespaces, and requires that you MUST select both or neither - but this contradicts the selection policy set out in the base SSH RFCs, and there's no discussion of how you resolve a conflict between them! OpenSSH's answer is to do it the same way ChaCha20-Poly1305 works, because that will ensure the two suites don't fight.) People do occasionally ask us for this linked cipher/MAC pair, and now I know it's actually feasible, I've implemented it, including a pair of vector implementations for x86 and Arm using their respective architecture extensions for multiplying polynomials over GF(2). Unlike ChaCha20-Poly1305, I've kept the cipher and MAC implementations in separate objects, with an arm's-length link between them that the MAC uses when it needs to encrypt single cipher blocks to use as the inputs to the MAC algorithm. That enables the cipher and the MAC to be independently selected from their hardware-accelerated versions, just in case someone runs on a system that has polynomial multiplication instructions but not AES acceleration, or vice versa. There's a fourth implementation of the GCM MAC, which is a pure software implementation of the same algorithm used in the vectorised versions. It's too slow to use live, but I've kept it in the code for future testing needs, and because it's a convenient place to dump my design comments. The vectorised implementations are fairly crude as far as optimisation goes. I'm sure serious x86 _or_ Arm optimisation engineers would look at them and laugh. But GCM is a fast MAC compared to HMAC-SHA-256 (indeed compared to HMAC-anything-at-all), so it should at least be good enough to use. And we've got a working version with some tests now, so if someone else wants to improve them, they can.
2022-08-16 17:36:58 +00:00
static void aes_neon_setiv_gcm(ssh_cipher *ciph, const void *iv)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t counter = vld1q_u8(iv);
ctx->iv = aes_neon_sdctr_reverse(counter);
ctx->iv = vreinterpretq_u8_u32(vsetq_lane_u32(
1, vreinterpretq_u32_u8(ctx->iv), 2));
}
static void aes_neon_next_message_gcm(ssh_cipher *ciph)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint32x4_t iv = vreinterpretq_u32_u8(ctx->iv);
uint64_t msg_counter = vgetq_lane_u32(iv, 0);
msg_counter = (msg_counter << 32) | vgetq_lane_u32(iv, 3);
msg_counter++;
iv = vsetq_lane_u32(msg_counter >> 32, iv, 0);
iv = vsetq_lane_u32(msg_counter, iv, 3);
iv = vsetq_lane_u32(1, iv, 2);
ctx->iv = vreinterpretq_u8_u32(iv);
}
Break up crypto modules containing HW acceleration. This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those source files previously contained multiple implementations of the algorithm, enabled or disabled by ifdefs detecting whether they would work on a given compiler. And in order to get advanced machine instructions like AES-NI or NEON crypto into the output file when the compile flags hadn't enabled them, we had to do nasty stuff with compiler-specific pragmas or attributes. Now we can do the detection at cmake time, and enable advanced instructions in the more sensible way, by compile-time flags. So I've broken up each of these modules into lots of sub-pieces: a file called (e.g.) 'foo-common.c' containing common definitions across all implementations (such as round constants), one called 'foo-select.c' containing the top-level vtable(s), and a separate file for each implementation exporting just the vtable(s) for that implementation. One advantage of this is that it depends a lot less on compiler- specific bodgery. My particular least favourite part of the previous setup was the part where I had to _manually_ define some Arm ACLE feature macros before including <arm_neon.h>, so that it would define the intrinsics I wanted. Now I'm enabling interesting architecture features in the normal way, on the compiler command line, there's no need for that kind of trick: the right feature macros are already defined and <arm_neon.h> does the right thing. Another change in this reorganisation is that I've stopped assuming there's just one hardware implementation per platform. Previously, the accelerated vtables were called things like sha256_hw, and varied between FOO-NI and NEON depending on platform; and the selection code would simply ask 'is hw available? if so, use hw, else sw'. Now, each HW acceleration strategy names its vtable its own way, and the selection vtable has a whole list of possibilities to iterate over looking for a supported one. So if someone feels like writing a second accelerated implementation of something for a given platform - for example, I've heard you can use plain NEON to speed up AES somewhat even without the crypto extension - then it will now have somewhere to drop in alongside the existing ones.
2021-04-19 05:42:12 +00:00
typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched);
static inline void aes_cbc_neon_encrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t plaintext = vld1q_u8(blk);
uint8x16_t cipher_input = veorq_u8(plaintext, ctx->iv);
uint8x16_t ciphertext = encrypt(cipher_input, ctx->keysched_e);
vst1q_u8(blk, ciphertext);
ctx->iv = ciphertext;
}
}
static inline void aes_cbc_neon_decrypt(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn decrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t ciphertext = vld1q_u8(blk);
uint8x16_t decrypted = decrypt(ciphertext, ctx->keysched_d);
uint8x16_t plaintext = veorq_u8(decrypted, ctx->iv);
vst1q_u8(blk, plaintext);
ctx->iv = ciphertext;
}
}
static inline void aes_sdctr_neon(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv);
uint8x16_t keystream = encrypt(counter, ctx->keysched_e);
uint8x16_t input = vld1q_u8(blk);
uint8x16_t output = veorq_u8(input, keystream);
vst1q_u8(blk, output);
ctx->iv = aes_neon_sdctr_increment(ctx->iv);
}
}
Implement AES-GCM using the @openssh.com protocol IDs. I only recently found out that OpenSSH defined their own protocol IDs for AES-GCM, defined to work the same as the standard ones except that they fixed the semantics for how you select the linked cipher+MAC pair during key exchange. (RFC 5647 defines protocol ids for AES-GCM in both the cipher and MAC namespaces, and requires that you MUST select both or neither - but this contradicts the selection policy set out in the base SSH RFCs, and there's no discussion of how you resolve a conflict between them! OpenSSH's answer is to do it the same way ChaCha20-Poly1305 works, because that will ensure the two suites don't fight.) People do occasionally ask us for this linked cipher/MAC pair, and now I know it's actually feasible, I've implemented it, including a pair of vector implementations for x86 and Arm using their respective architecture extensions for multiplying polynomials over GF(2). Unlike ChaCha20-Poly1305, I've kept the cipher and MAC implementations in separate objects, with an arm's-length link between them that the MAC uses when it needs to encrypt single cipher blocks to use as the inputs to the MAC algorithm. That enables the cipher and the MAC to be independently selected from their hardware-accelerated versions, just in case someone runs on a system that has polynomial multiplication instructions but not AES acceleration, or vice versa. There's a fourth implementation of the GCM MAC, which is a pure software implementation of the same algorithm used in the vectorised versions. It's too slow to use live, but I've kept it in the code for future testing needs, and because it's a convenient place to dump my design comments. The vectorised implementations are fairly crude as far as optimisation goes. I'm sure serious x86 _or_ Arm optimisation engineers would look at them and laugh. But GCM is a fast MAC compared to HMAC-SHA-256 (indeed compared to HMAC-anything-at-all), so it should at least be good enough to use. And we've got a working version with some tests now, so if someone else wants to improve them, they can.
2022-08-16 17:36:58 +00:00
static inline void aes_encrypt_ecb_block_neon(
ssh_cipher *ciph, void *blk, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
uint8x16_t plaintext = vld1q_u8(blk);
uint8x16_t ciphertext = encrypt(plaintext, ctx->keysched_e);
vst1q_u8(blk, ciphertext);
}
static inline void aes_gcm_neon(
ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt)
{
aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph);
for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
blk < finish; blk += 16) {
uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv);
uint8x16_t keystream = encrypt(counter, ctx->keysched_e);
uint8x16_t input = vld1q_u8(blk);
uint8x16_t output = veorq_u8(input, keystream);
vst1q_u8(blk, output);
ctx->iv = aes_neon_gcm_increment(ctx->iv);
}
}
Break up crypto modules containing HW acceleration. This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those source files previously contained multiple implementations of the algorithm, enabled or disabled by ifdefs detecting whether they would work on a given compiler. And in order to get advanced machine instructions like AES-NI or NEON crypto into the output file when the compile flags hadn't enabled them, we had to do nasty stuff with compiler-specific pragmas or attributes. Now we can do the detection at cmake time, and enable advanced instructions in the more sensible way, by compile-time flags. So I've broken up each of these modules into lots of sub-pieces: a file called (e.g.) 'foo-common.c' containing common definitions across all implementations (such as round constants), one called 'foo-select.c' containing the top-level vtable(s), and a separate file for each implementation exporting just the vtable(s) for that implementation. One advantage of this is that it depends a lot less on compiler- specific bodgery. My particular least favourite part of the previous setup was the part where I had to _manually_ define some Arm ACLE feature macros before including <arm_neon.h>, so that it would define the intrinsics I wanted. Now I'm enabling interesting architecture features in the normal way, on the compiler command line, there's no need for that kind of trick: the right feature macros are already defined and <arm_neon.h> does the right thing. Another change in this reorganisation is that I've stopped assuming there's just one hardware implementation per platform. Previously, the accelerated vtables were called things like sha256_hw, and varied between FOO-NI and NEON depending on platform; and the selection code would simply ask 'is hw available? if so, use hw, else sw'. Now, each HW acceleration strategy names its vtable its own way, and the selection vtable has a whole list of possibilities to iterate over looking for a supported one. So if someone feels like writing a second accelerated implementation of something for a given platform - for example, I've heard you can use plain NEON to speed up AES somewhat even without the crypto extension - then it will now have somewhere to drop in alongside the existing ones.
2021-04-19 05:42:12 +00:00
#define NEON_ENC_DEC(len) \
static void aes##len##_neon_cbc_encrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_neon_encrypt(ciph, vblk, blklen, aes_neon_##len##_e); } \
static void aes##len##_neon_cbc_decrypt( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_cbc_neon_decrypt(ciph, vblk, blklen, aes_neon_##len##_d); } \
static void aes##len##_neon_sdctr( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \
Implement AES-GCM using the @openssh.com protocol IDs. I only recently found out that OpenSSH defined their own protocol IDs for AES-GCM, defined to work the same as the standard ones except that they fixed the semantics for how you select the linked cipher+MAC pair during key exchange. (RFC 5647 defines protocol ids for AES-GCM in both the cipher and MAC namespaces, and requires that you MUST select both or neither - but this contradicts the selection policy set out in the base SSH RFCs, and there's no discussion of how you resolve a conflict between them! OpenSSH's answer is to do it the same way ChaCha20-Poly1305 works, because that will ensure the two suites don't fight.) People do occasionally ask us for this linked cipher/MAC pair, and now I know it's actually feasible, I've implemented it, including a pair of vector implementations for x86 and Arm using their respective architecture extensions for multiplying polynomials over GF(2). Unlike ChaCha20-Poly1305, I've kept the cipher and MAC implementations in separate objects, with an arm's-length link between them that the MAC uses when it needs to encrypt single cipher blocks to use as the inputs to the MAC algorithm. That enables the cipher and the MAC to be independently selected from their hardware-accelerated versions, just in case someone runs on a system that has polynomial multiplication instructions but not AES acceleration, or vice versa. There's a fourth implementation of the GCM MAC, which is a pure software implementation of the same algorithm used in the vectorised versions. It's too slow to use live, but I've kept it in the code for future testing needs, and because it's a convenient place to dump my design comments. The vectorised implementations are fairly crude as far as optimisation goes. I'm sure serious x86 _or_ Arm optimisation engineers would look at them and laugh. But GCM is a fast MAC compared to HMAC-SHA-256 (indeed compared to HMAC-anything-at-all), so it should at least be good enough to use. And we've got a working version with some tests now, so if someone else wants to improve them, they can.
2022-08-16 17:36:58 +00:00
static void aes##len##_neon_gcm( \
ssh_cipher *ciph, void *vblk, int blklen) \
{ aes_gcm_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \
static void aes##len##_neon_encrypt_ecb_block( \
ssh_cipher *ciph, void *vblk) \
{ aes_encrypt_ecb_block_neon(ciph, vblk, aes_neon_##len##_e); }
Break up crypto modules containing HW acceleration. This applies to all of AES, SHA-1, SHA-256 and SHA-512. All those source files previously contained multiple implementations of the algorithm, enabled or disabled by ifdefs detecting whether they would work on a given compiler. And in order to get advanced machine instructions like AES-NI or NEON crypto into the output file when the compile flags hadn't enabled them, we had to do nasty stuff with compiler-specific pragmas or attributes. Now we can do the detection at cmake time, and enable advanced instructions in the more sensible way, by compile-time flags. So I've broken up each of these modules into lots of sub-pieces: a file called (e.g.) 'foo-common.c' containing common definitions across all implementations (such as round constants), one called 'foo-select.c' containing the top-level vtable(s), and a separate file for each implementation exporting just the vtable(s) for that implementation. One advantage of this is that it depends a lot less on compiler- specific bodgery. My particular least favourite part of the previous setup was the part where I had to _manually_ define some Arm ACLE feature macros before including <arm_neon.h>, so that it would define the intrinsics I wanted. Now I'm enabling interesting architecture features in the normal way, on the compiler command line, there's no need for that kind of trick: the right feature macros are already defined and <arm_neon.h> does the right thing. Another change in this reorganisation is that I've stopped assuming there's just one hardware implementation per platform. Previously, the accelerated vtables were called things like sha256_hw, and varied between FOO-NI and NEON depending on platform; and the selection code would simply ask 'is hw available? if so, use hw, else sw'. Now, each HW acceleration strategy names its vtable its own way, and the selection vtable has a whole list of possibilities to iterate over looking for a supported one. So if someone feels like writing a second accelerated implementation of something for a given platform - for example, I've heard you can use plain NEON to speed up AES somewhat even without the crypto extension - then it will now have somewhere to drop in alongside the existing ones.
2021-04-19 05:42:12 +00:00
NEON_ENC_DEC(128)
NEON_ENC_DEC(192)
NEON_ENC_DEC(256)
AES_EXTRA(_neon);
AES_ALL_VTABLES(_neon, "NEON accelerated");