diff --git a/cmake/cmake.h.in b/cmake/cmake.h.in index 6ed24b51..4ce869f4 100644 --- a/cmake/cmake.h.in +++ b/cmake/cmake.h.in @@ -49,7 +49,9 @@ #cmakedefine01 HAVE_AES_NI #cmakedefine01 HAVE_SHA_NI #cmakedefine01 HAVE_SHAINTRIN_H +#cmakedefine01 HAVE_CLMUL #cmakedefine01 HAVE_NEON_CRYPTO +#cmakedefine01 HAVE_NEON_PMULL #cmakedefine01 HAVE_NEON_SHA512 #cmakedefine01 HAVE_NEON_SHA512_INTRINSICS #cmakedefine01 USE_ARM64_NEON_H diff --git a/config.c b/config.c index 59b6976a..747af814 100644 --- a/config.c +++ b/config.c @@ -491,6 +491,7 @@ static void cipherlist_handler(dlgcontrol *ctrl, dlgparam *dlg, static const struct { const char *s; int c; } ciphers[] = { { "ChaCha20 (SSH-2 only)", CIPHER_CHACHA20 }, + { "AES-GCM (SSH-2 only)", CIPHER_AESGCM }, { "3DES", CIPHER_3DES }, { "Blowfish", CIPHER_BLOWFISH }, { "DES", CIPHER_DES }, diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt index 08de141e..ff04efb5 100644 --- a/crypto/CMakeLists.txt +++ b/crypto/CMakeLists.txt @@ -2,6 +2,10 @@ add_sources_from_current_dir(crypto aes-common.c aes-select.c aes-sw.c + aesgcm-common.c + aesgcm-select.c + aesgcm-sw.c + aesgcm-ref-poly.c arcfour.c argon2.c bcrypt.c @@ -123,6 +127,16 @@ if(HAVE_WMMINTRIN_H) volatile __m128i r, a, b, c; int main(void) { r = _mm_sha256rnds2_epu32(a, b, c); }" ADD_SOURCES_IF_SUCCESSFUL sha256-ni.c sha1-ni.c) + + test_compile_with_flags(HAVE_CLMUL + GNU_FLAGS -msse4.1 -mpclmul + TEST_SOURCE " + #include <wmmintrin.h> + #include <smmintrin.h> + volatile __m128i r, a, b; + int main(void) { r = _mm_clmulepi64_si128(a, b, 5); + r = _mm_shuffle_epi8(r, a); }" + ADD_SOURCES_IF_SUCCESSFUL aesgcm-clmul.c) endif() # ---------------------------------------------------------------------- @@ -170,6 +184,17 @@ if(neon) int main(void) { r = vaeseq_u8(a, b); s = vsha256hq_u32(x, y, z); }" ADD_SOURCES_IF_SUCCESSFUL aes-neon.c sha256-neon.c sha1-neon.c) + test_compile_with_flags(HAVE_NEON_PMULL + GNU_FLAGS -march=armv8-a+crypto + MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS + TEST_SOURCE " + #include <${neon_header}> + volatile poly128_t r; + volatile poly64_t a, b; + volatile poly64x2_t u, v; + int main(void) { r = vmull_p64(a, b); r = vmull_high_p64(u, v); }" + ADD_SOURCES_IF_SUCCESSFUL aesgcm-neon.c) + # The 'sha3' architecture extension, despite the name, includes # support for SHA-512 (from the SHA-2 standard) as well as SHA-3 # proper. diff --git a/crypto/aes-common.c b/crypto/aes-common.c index e1c41ddf..3bed2af1 100644 --- a/crypto/aes-common.c +++ b/crypto/aes-common.c @@ -12,3 +12,9 @@ const uint8_t aes_key_setup_round_constants[10] = { * regardless of the key. */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, }; + +void aesgcm_cipher_crypt_length( + ssh_cipher *cipher, void *blk, int len, unsigned long seq) +{ + /* Do nothing: lengths are sent in clear for this cipher. */ +} diff --git a/crypto/aes-neon.c b/crypto/aes-neon.c index f3b92832..5cd9f2d1 100644 --- a/crypto/aes-neon.c +++ b/crypto/aes-neon.c @@ -176,6 +176,18 @@ static inline uint8x16_t aes_neon_sdctr_increment(uint8x16_t in) vsubq_u64(vreinterpretq_u64_u8(in), subtrahend)); } +/* + * Much simpler auxiliary routine to increment the counter for GCM + * mode. This only has to increment the low word. + */ +static inline uint8x16_t aes_neon_gcm_increment(uint8x16_t in) +{ + uint32x4_t inw = vreinterpretq_u32_u8(in); + uint32x4_t ONE = vcombine_u32(vcreate_u32(0), vcreate_u32(1)); + inw = vaddq_u32(inw, ONE); + return vreinterpretq_u8_u32(inw); +} + /* * The SSH interface and the cipher modes.
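* (The GCM additions that follow parallel the SDCTR ones: an IV setter, a per-message counter bump, and a keystream loop built on aes_neon_gcm_increment above.)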
*/ @@ -227,6 +239,28 @@ static void aes_neon_setiv_sdctr(ssh_cipher *ciph, const void *iv) ctx->iv = aes_neon_sdctr_reverse(counter); } +static void aes_neon_setiv_gcm(ssh_cipher *ciph, const void *iv) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint8x16_t counter = vld1q_u8(iv); + ctx->iv = aes_neon_sdctr_reverse(counter); + ctx->iv = vreinterpretq_u8_u32(vsetq_lane_u32( + 1, vreinterpretq_u32_u8(ctx->iv), 2)); +} + +static void aes_neon_next_message_gcm(ssh_cipher *ciph) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint32x4_t iv = vreinterpretq_u32_u8(ctx->iv); + uint64_t msg_counter = vgetq_lane_u32(iv, 0); + msg_counter = (msg_counter << 32) | vgetq_lane_u32(iv, 3); + msg_counter++; + iv = vsetq_lane_u32(msg_counter >> 32, iv, 0); + iv = vsetq_lane_u32(msg_counter, iv, 3); + iv = vsetq_lane_u32(1, iv, 2); + ctx->iv = vreinterpretq_u8_u32(iv); +} + typedef uint8x16_t (*aes_neon_fn)(uint8x16_t v, const uint8x16_t *keysched); static inline void aes_cbc_neon_encrypt( @@ -275,6 +309,31 @@ static inline void aes_sdctr_neon( } } +static inline void aes_encrypt_ecb_block_neon( + ssh_cipher *ciph, void *blk, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + uint8x16_t plaintext = vld1q_u8(blk); + uint8x16_t ciphertext = encrypt(plaintext, ctx->keysched_e); + vst1q_u8(blk, ciphertext); +} + +static inline void aes_gcm_neon( + ssh_cipher *ciph, void *vblk, int blklen, aes_neon_fn encrypt) +{ + aes_neon_context *ctx = container_of(ciph, aes_neon_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + uint8x16_t counter = aes_neon_sdctr_reverse(ctx->iv); + uint8x16_t keystream = encrypt(counter, ctx->keysched_e); + uint8x16_t input = vld1q_u8(blk); + uint8x16_t output = veorq_u8(input, keystream); + vst1q_u8(blk, output); + ctx->iv = aes_neon_gcm_increment(ctx->iv); + } +} + #define NEON_ENC_DEC(len) \ static void aes##len##_neon_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ @@ -285,6 +344,12 @@ static inline void aes_sdctr_neon( static void aes##len##_neon_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_sdctr_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ + static void aes##len##_neon_gcm( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_gcm_neon(ciph, vblk, blklen, aes_neon_##len##_e); } \ + static void aes##len##_neon_encrypt_ecb_block( \ + ssh_cipher *ciph, void *vblk) \ + { aes_encrypt_ecb_block_neon(ciph, vblk, aes_neon_##len##_e); } NEON_ENC_DEC(128) NEON_ENC_DEC(192) diff --git a/crypto/aes-ni.c b/crypto/aes-ni.c index 22348de4..67d82b86 100644 --- a/crypto/aes-ni.c +++ b/crypto/aes-ni.c @@ -137,6 +137,16 @@ static inline __m128i aes_ni_sdctr_increment(__m128i v) return v; } +/* + * Much simpler auxiliary routine to increment the counter for GCM + * mode. This only has to increment the low word. + */ +static inline __m128i aes_ni_gcm_increment(__m128i v) +{ + const __m128i ONE = _mm_setr_epi32(1,0,0,0); + return _mm_add_epi32(v, ONE); +} + /* * Auxiliary routine to reverse the byte order of a vector, so that * the SDCTR IV can be made big-endian for feeding to the cipher. 
@@ -214,6 +224,25 @@ static void aes_ni_setiv_sdctr(ssh_cipher *ciph, const void *iv) ctx->iv = aes_ni_sdctr_reverse(counter); } +static void aes_ni_setiv_gcm(ssh_cipher *ciph, const void *iv) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + __m128i counter = _mm_loadu_si128(iv); + ctx->iv = aes_ni_sdctr_reverse(counter); + ctx->iv = _mm_insert_epi32(ctx->iv, 1, 0); +} + +static void aes_ni_next_message_gcm(ssh_cipher *ciph) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + uint32_t fixed = _mm_extract_epi32(ctx->iv, 3); + uint64_t msg_counter = _mm_extract_epi32(ctx->iv, 2); + msg_counter <<= 32; + msg_counter |= (uint32_t)_mm_extract_epi32(ctx->iv, 1); + msg_counter++; + ctx->iv = _mm_set_epi32(fixed, msg_counter >> 32, msg_counter, 1); +} + typedef __m128i (*aes_ni_fn)(__m128i v, const __m128i *keysched); static inline void aes_cbc_ni_encrypt( @@ -262,6 +291,31 @@ static inline void aes_sdctr_ni( } } +static inline void aes_encrypt_ecb_block_ni( + ssh_cipher *ciph, void *blk, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + __m128i plaintext = _mm_loadu_si128(blk); + __m128i ciphertext = encrypt(plaintext, ctx->keysched_e); + _mm_storeu_si128(blk, ciphertext); +} + +static inline void aes_gcm_ni( + ssh_cipher *ciph, void *vblk, int blklen, aes_ni_fn encrypt) +{ + aes_ni_context *ctx = container_of(ciph, aes_ni_context, ciph); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + __m128i counter = aes_ni_sdctr_reverse(ctx->iv); + __m128i keystream = encrypt(counter, ctx->keysched_e); + __m128i input = _mm_loadu_si128((const __m128i *)blk); + __m128i output = _mm_xor_si128(input, keystream); + _mm_storeu_si128((__m128i *)blk, output); + ctx->iv = aes_ni_gcm_increment(ctx->iv); + } +} + #define NI_ENC_DEC(len) \ static void aes##len##_ni_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ @@ -272,6 +326,12 @@ static inline void aes_sdctr_ni( static void aes##len##_ni_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ { aes_sdctr_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ + static void aes##len##_ni_gcm( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_gcm_ni(ciph, vblk, blklen, aes_ni_##len##_e); } \ + static void aes##len##_ni_encrypt_ecb_block( \ + ssh_cipher *ciph, void *vblk) \ + { aes_encrypt_ecb_block_ni(ciph, vblk, aes_ni_##len##_e); } NI_ENC_DEC(128) NI_ENC_DEC(192) diff --git a/crypto/aes-select.c b/crypto/aes-select.c index f0c5031f..892e7b58 100644 --- a/crypto/aes-select.c +++ b/crypto/aes-select.c @@ -39,7 +39,7 @@ static ssh_cipher *aes_select(const ssh_cipheralg *alg) #define IF_NEON(...) #endif -#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits) \ +#define AES_SELECTOR_VTABLE(mode_c, mode_protocol, mode_display, bits, ...) 
\ static const ssh_cipheralg * \ ssh_aes ## bits ## _ ## mode_c ## _impls[] = { \ IF_NI(&ssh_aes ## bits ## _ ## mode_c ## _ni,) \ IF_NEON(&ssh_aes ## bits ## _ ## mode_c ## _neon,) \ &ssh_aes ## bits ## _ ## mode_c ## _sw, \ NULL, \ }; \ @@ -56,6 +56,7 @@ static ssh_cipher *aes_select(const ssh_cipheralg *alg) .text_name = "AES-" #bits " " mode_display \ " (dummy selector vtable)", \ .extra = ssh_aes ## bits ## _ ## mode_c ## _impls, \ + __VA_ARGS__ \ } AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 128); @@ -64,6 +65,17 @@ AES_SELECTOR_VTABLE(cbc, "cbc", "CBC", 256); AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 128); AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 192); AES_SELECTOR_VTABLE(sdctr, "ctr", "SDCTR", 256); +AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com", "GCM", 128, + .required_mac = &ssh2_aesgcm_mac); +AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com", "GCM", 256, + .required_mac = &ssh2_aesgcm_mac); + +/* 192-bit AES-GCM is included only so that testcrypt can run standard + * test vectors against it. OpenSSH doesn't define a protocol id for + * it. Hence the silly macro trick here to set its ssh2_id to 0, and + * more importantly, leaving it out of aesgcm_list[] below. */ +AES_SELECTOR_VTABLE(gcm, "gcm@openssh.com" ? NULL : NULL, "GCM", 192, + .required_mac = &ssh2_aesgcm_mac); static const ssh_cipheralg ssh_rijndael_lysator = { /* Same as aes256_cbc, but with a different protocol ID */ @@ -87,3 +99,12 @@ static const ssh_cipheralg *const aes_list[] = { }; const ssh2_ciphers ssh2_aes = { lenof(aes_list), aes_list }; + +static const ssh_cipheralg *const aesgcm_list[] = { + /* OpenSSH only defines protocol ids for 128- and 256-bit AES-GCM, + * not 192-bit. */ + &ssh_aes128_gcm, + &ssh_aes256_gcm, +}; + +const ssh2_ciphers ssh2_aesgcm = { lenof(aesgcm_list), aesgcm_list }; diff --git a/crypto/aes-sw.c b/crypto/aes-sw.c index f8512388..aaa3c475 100644 --- a/crypto/aes-sw.c +++ b/crypto/aes-sw.c @@ -827,6 +827,18 @@ struct aes_sw_context { uint8_t keystream[SLICE_PARALLELISM * 16]; uint8_t *keystream_pos; } sdctr; + struct { + /* In GCM mode, the cipher preimage consists of three + * sections: one fixed, one that increments per message + * sent and MACed, and one that increments per cipher + * block. */ + uint64_t msg_counter; + uint32_t fixed_iv, block_counter; + /* But we keep the precomputed keystream chunks just like + * SDCTR mode.
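+ * (Preimage layout, as read by aes_sw_setiv_gcm below: fixed_iv + * (4 bytes), then msg_counter (8 bytes), then block_counter + * (4 bytes), with the block counter starting at 1 for each message.)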
*/ + uint8_t keystream[SLICE_PARALLELISM * 16]; + uint8_t *keystream_pos; + } gcm; } iv; ssh_cipher ciph; }; @@ -874,6 +886,31 @@ static void aes_sw_setiv_sdctr(ssh_cipher *ciph, const void *viv) ctx->iv.sdctr.keystream + sizeof(ctx->iv.sdctr.keystream); } +static void aes_sw_setiv_gcm(ssh_cipher *ciph, const void *viv) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + const uint8_t *iv = (const uint8_t *)viv; + + ctx->iv.gcm.fixed_iv = GET_32BIT_MSB_FIRST(iv); + ctx->iv.gcm.msg_counter = GET_64BIT_MSB_FIRST(iv + 4); + ctx->iv.gcm.block_counter = 1; + + /* Set keystream_pos to indicate that the keystream cache is + * currently empty */ + ctx->iv.gcm.keystream_pos = + ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream); +} + +static void aes_sw_next_message_gcm(ssh_cipher *ciph) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + + ctx->iv.gcm.msg_counter++; + ctx->iv.gcm.block_counter = 1; + ctx->iv.gcm.keystream_pos = + ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream); +} + typedef void (*aes_sw_fn)(uint32_t v[4], const uint32_t *keysched); static inline void memxor16(void *vout, const void *vlhs, const void *vrhs) @@ -1021,6 +1058,56 @@ static inline void aes_sdctr_sw( } } +static inline void aes_encrypt_ecb_block_sw(ssh_cipher *ciph, void *blk) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + aes_sliced_e_serial(blk, blk, &ctx->sk); +} + +static inline void aes_gcm_sw( + ssh_cipher *ciph, void *vblk, int blklen) +{ + aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph); + + /* + * GCM encrypt/decrypt looks just like SDCTR, except that the + * method of generating more keystream varies slightly. + */ + + uint8_t *keystream_end = + ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream); + + for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen; + blk < finish; blk += 16) { + + if (ctx->iv.gcm.keystream_pos == keystream_end) { + /* + * Generate some keystream. + */ + for (uint8_t *block = ctx->iv.gcm.keystream; + block < keystream_end; block += 16) { + /* Format the counter value into the buffer. */ + PUT_32BIT_MSB_FIRST(block, ctx->iv.gcm.fixed_iv); + PUT_64BIT_MSB_FIRST(block + 4, ctx->iv.gcm.msg_counter); + PUT_32BIT_MSB_FIRST(block + 12, ctx->iv.gcm.block_counter); + + /* Increment the counter. */ + ctx->iv.gcm.block_counter++; + } + + /* Encrypt all those counter blocks. */ + aes_sliced_e_parallel(ctx->iv.gcm.keystream, + ctx->iv.gcm.keystream, &ctx->sk); + + /* Reset keystream_pos to the start of the buffer. */ + ctx->iv.gcm.keystream_pos = ctx->iv.gcm.keystream; + } + + memxor16(blk, blk, ctx->iv.gcm.keystream_pos); + ctx->iv.gcm.keystream_pos += 16; + } +} + #define SW_ENC_DEC(len) \ static void aes##len##_sw_cbc_encrypt( \ ssh_cipher *ciph, void *vblk, int blklen) \ @@ -1030,7 +1117,13 @@ static inline void aes_sdctr_sw( { aes_cbc_sw_decrypt(ciph, vblk, blklen); } \ static void aes##len##_sw_sdctr( \ ssh_cipher *ciph, void *vblk, int blklen) \ - { aes_sdctr_sw(ciph, vblk, blklen); } + { aes_sdctr_sw(ciph, vblk, blklen); } \ + static void aes##len##_sw_gcm( \ + ssh_cipher *ciph, void *vblk, int blklen) \ + { aes_gcm_sw(ciph, vblk, blklen); } \ + static void aes##len##_sw_encrypt_ecb_block( \ + ssh_cipher *ciph, void *vblk) \ + { aes_encrypt_ecb_block_sw(ciph, vblk); } SW_ENC_DEC(128) SW_ENC_DEC(192) diff --git a/crypto/aes.h b/crypto/aes.h index 433306ab..cab5b989 100644 --- a/crypto/aes.h +++ b/crypto/aes.h @@ -15,6 +15,11 @@ struct aes_extra { /* Point to a writable substructure. 
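(It has to be a separate substructure because this one is declared const.)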
*/ struct aes_extra_mutable *mut; + + /* Extra API function specific to AES, to encrypt a single block + * in ECB mode without touching the IV. Used by AES-GCM MAC + * setup. */ + void (*encrypt_ecb_block)(ssh_cipher *, void *); }; struct aes_extra_mutable { bool checked_availability; @@ -30,6 +35,17 @@ static inline bool check_availability(const struct aes_extra *extra) return extra->mut->is_available; } +/* Shared stub function for all the AES-GCM vtables. */ +void aesgcm_cipher_crypt_length( + ssh_cipher *cipher, void *blk, int len, unsigned long seq); + +/* External entry point for the encrypt_ecb_block function. */ +static inline void aes_encrypt_ecb_block(ssh_cipher *ciph, void *blk) +{ + const struct aes_extra *extra = ciph->vt->extra; + extra->encrypt_ecb_block(ciph, blk); +} + /* * Macros to define vtables for AES variants. There are a lot of * these, because of the cross product between cipher modes, key @@ -37,13 +53,19 @@ static inline bool check_availability(const struct aes_extra *extra) * some effort here to reduce the boilerplate in the sub-files. */ -#define AES_EXTRA(impl_c) \ +#define AES_EXTRA_BITS(impl_c, bits) \ static struct aes_extra_mutable aes ## impl_c ## _extra_mut; \ - static const struct aes_extra aes ## impl_c ## _extra = { \ + static const struct aes_extra aes ## bits ## impl_c ## _extra = { \ .check_available = aes ## impl_c ## _available, \ .mut = &aes ## impl_c ## _extra_mut, \ + .encrypt_ecb_block = &aes ## bits ## impl_c ## _encrypt_ecb_block, \ } +#define AES_EXTRA(impl_c) \ + AES_EXTRA_BITS(impl_c, 128); \ + AES_EXTRA_BITS(impl_c, 192); \ + AES_EXTRA_BITS(impl_c, 256) + #define AES_CBC_VTABLE(impl_c, impl_display, bits) \ const ssh_cipheralg ssh_aes ## bits ## _cbc ## impl_c = { \ .new = aes ## impl_c ## _new, \ @@ -59,7 +81,7 @@ static inline bool check_availability(const struct aes_extra *extra) .padded_keybytes = bits/8, \ .flags = SSH_CIPHER_IS_CBC, \ .text_name = "AES-" #bits " CBC (" impl_display ")", \ - .extra = &aes ## impl_c ## _extra, \ + .extra = &aes ## bits ## impl_c ## _extra, \ } #define AES_SDCTR_VTABLE(impl_c, impl_display, bits) \ @@ -77,7 +99,31 @@ static inline bool check_availability(const struct aes_extra *extra) .padded_keybytes = bits/8, \ .flags = 0, \ .text_name = "AES-" #bits " SDCTR (" impl_display ")", \ - .extra = &aes ## impl_c ## _extra, \ + .extra = &aes ## bits ## impl_c ## _extra, \ + } + +#define AES_GCM_VTABLE(impl_c, impl_display, bits) \ + const ssh_cipheralg ssh_aes ## bits ## _gcm ## impl_c = { \ + .new = aes ## impl_c ## _new, \ + .free = aes ## impl_c ## _free, \ + .setiv = aes ## impl_c ## _setiv_gcm, \ + .setkey = aes ## impl_c ## _setkey, \ + .encrypt = aes ## bits ## impl_c ## _gcm, \ + .decrypt = aes ## bits ## impl_c ## _gcm, \ + .encrypt_length = aesgcm_cipher_crypt_length, \ + .decrypt_length = aesgcm_cipher_crypt_length, \ + .next_message = aes ## impl_c ## _next_message_gcm, \ + /* 192-bit AES-GCM is included only so that testcrypt can run \ + * standard test vectors against it. OpenSSH doesn't define a \ + * protocol id for it. So we set its ssh2_id to NULL. */ \ + .ssh2_id = bits==192 ? 
NULL : "aes" #bits "-gcm@openssh.com", \ + .blksize = 16, \ + .real_keybits = bits, \ + .padded_keybytes = bits/8, \ + .flags = SSH_CIPHER_SEPARATE_LENGTH, \ + .text_name = "AES-" #bits " GCM (" impl_display ")", \ + .required_mac = &ssh2_aesgcm_mac, \ + .extra = &aes ## bits ## impl_c ## _extra, \ } #define AES_ALL_VTABLES(impl_c, impl_display) \ @@ -86,7 +132,10 @@ static inline bool check_availability(const struct aes_extra *extra) AES_CBC_VTABLE(impl_c, impl_display, 256); \ AES_SDCTR_VTABLE(impl_c, impl_display, 128); \ AES_SDCTR_VTABLE(impl_c, impl_display, 192); \ - AES_SDCTR_VTABLE(impl_c, impl_display, 256) + AES_SDCTR_VTABLE(impl_c, impl_display, 256); \ + AES_GCM_VTABLE(impl_c, impl_display, 128); \ + AES_GCM_VTABLE(impl_c, impl_display, 192); \ + AES_GCM_VTABLE(impl_c, impl_display, 256) /* * Macros to repeat a piece of code particular numbers of times that diff --git a/crypto/aesgcm-clmul.c b/crypto/aesgcm-clmul.c new file mode 100644 index 00000000..cfb72e26 --- /dev/null +++ b/crypto/aesgcm-clmul.c @@ -0,0 +1,180 @@ +/* + * Implementation of the GCM polynomial hash using the x86 CLMUL + * extension, which provides 64x64->128 polynomial multiplication (or + * 'carry-less', which is what the CL stands for). + * + * Follows the reference implementation in aesgcm-ref-poly.c; see + * there for comments on the underlying technique. Here the comments + * just discuss the x86-specific details. + */ + +#include +#include + +#if defined(__clang__) || defined(__GNUC__) +#include +#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3]) +#else +#define GET_CPU_ID(out) __cpuid(out, 1) +#endif + +#include "ssh.h" +#include "aesgcm.h" + +typedef struct aesgcm_clmul { + AESGCM_COMMON_FIELDS; + __m128i var, acc, mask; + void *ptr_to_free; +} aesgcm_clmul; + +static bool aesgcm_clmul_available(void) +{ + /* + * Determine if CLMUL is available on this CPU. + */ + unsigned int CPUInfo[4]; + GET_CPU_ID(CPUInfo); + return (CPUInfo[2] & (1 << 1)); +} + +/* + * __m128i has to be aligned to 16 bytes, and x86 mallocs may not + * guarantee that, so we must over-allocate to make sure a large + * enough 16-byte region can be found, and ensure the aesgcm_clmul + * struct pointer is at least that well aligned. + */ +#define SPECIAL_ALLOC +static aesgcm_clmul *aesgcm_clmul_alloc(void) +{ + char *p = smalloc(sizeof(aesgcm_clmul) + 15); + uintptr_t ip = (uintptr_t)p; + ip = (ip + 15) & ~15; + aesgcm_clmul *ctx = (aesgcm_clmul *)ip; + memset(ctx, 0, sizeof(aesgcm_clmul)); + ctx->ptr_to_free = p; + return ctx; +} + +#define SPECIAL_FREE +static void aesgcm_clmul_free(aesgcm_clmul *ctx) +{ + void *ptf = ctx->ptr_to_free; + smemclr(ctx, sizeof(*ctx)); + sfree(ptf); +} + +/* Helper function to reverse the 16 bytes in a 128-bit vector */ +static inline __m128i mm_byteswap(__m128i vec) +{ + const __m128i reverse = _mm_set_epi64x( + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + return _mm_shuffle_epi8(vec, reverse); +} + +/* Helper function to swap the two 64-bit words in a 128-bit vector */ +static inline __m128i mm_wordswap(__m128i vec) +{ + return _mm_shuffle_epi32(vec, 0x4E); +} + +/* Load and store a 128-bit vector in big-endian fashion */ +static inline __m128i mm_load_be(const void *p) +{ + return mm_byteswap(_mm_loadu_si128(p)); +} +static inline void mm_store_be(void *p, __m128i vec) +{ + _mm_storeu_si128(p, mm_byteswap(vec)); +} + +/* + * Key setup is just like in aesgcm-ref-poly.c. There's no point using + * vector registers to accelerate this, because it happens rarely. 
+ */ +static void aesgcm_clmul_setkey_impl(aesgcm_clmul *ctx, + const unsigned char *var) +{ + uint64_t hi = GET_64BIT_MSB_FIRST(var); + uint64_t lo = GET_64BIT_MSB_FIRST(var + 8); + + uint64_t bit = 1 & (hi >> 63); + hi = (hi << 1) ^ (lo >> 63); + lo = (lo << 1) ^ bit; + hi ^= 0xC200000000000000 & -bit; + + ctx->var = _mm_set_epi64x(hi, lo); +} + +static inline void aesgcm_clmul_setup(aesgcm_clmul *ctx, + const unsigned char *mask) +{ + ctx->mask = mm_load_be(mask); + ctx->acc = _mm_set_epi64x(0, 0); +} + +/* + * Folding a coefficient into the accumulator is done by essentially + * the algorithm in aesgcm-ref-poly.c. I don't speak these intrinsics + * all that well, so in the parts where I needed to XOR half of one + * vector into half of another, I did a lot of faffing about with + * masks like 0xFFFFFFFFFFFFFFFF0000000000000000. Very likely this can + * be streamlined by a better x86-speaker than me. Patches welcome. + */ +static inline void aesgcm_clmul_coeff(aesgcm_clmul *ctx, + const unsigned char *coeff) +{ + ctx->acc = _mm_xor_si128(ctx->acc, mm_load_be(coeff)); + + /* Compute ah^al and bh^bl by word-swapping each of a and b and + * XORing with the original. That does more work than necessary - + * you end up with each of the desired values repeated twice - + * but I don't know of a neater way. */ + __m128i aswap = mm_wordswap(ctx->acc); + __m128i vswap = mm_wordswap(ctx->var); + aswap = _mm_xor_si128(ctx->acc, aswap); + vswap = _mm_xor_si128(ctx->var, vswap); + + /* Do the three multiplications required by Karatsuba */ + __m128i md = _mm_clmulepi64_si128(aswap, vswap, 0x00); + __m128i lo = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x00); + __m128i hi = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x11); + /* Combine lo and hi into md */ + md = _mm_xor_si128(md, lo); + md = _mm_xor_si128(md, hi); + + /* Now we must XOR the high half of md into the low half of hi, + * and the low half of md into the high half of lo. Simplest thing + * is to swap the words of md (so that each one lines up with the + * register it's going to end up in), and then mask one off in + * each case. */ + md = mm_wordswap(md); + lo = _mm_xor_si128(lo, _mm_and_si128(md, _mm_set_epi64x(~0ULL, 0ULL))); + hi = _mm_xor_si128(hi, _mm_and_si128(md, _mm_set_epi64x(0ULL, ~0ULL))); + + /* The reduction stage is transformed similarly from the version + * in aesgcm-ref-poly.c.
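+ * (A reminder of the _mm_clmulepi64_si128 immediate: bit 0 selects + * which 64-bit half of the first operand to use, and bit 4 which + * half of the second. So 0x00 multiplies the two low halves, and + * the 0x10 below multiplies the constant's low half by the high + * half of r1.)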
*/ __m128i r1 = _mm_clmulepi64_si128(_mm_set_epi64x(0, 0xC200000000000000), + lo, 0x00); + r1 = mm_wordswap(r1); + r1 = _mm_xor_si128(r1, lo); + hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(~0ULL, 0ULL))); + + __m128i r2 = _mm_clmulepi64_si128(_mm_set_epi64x(0, 0xC200000000000000), + r1, 0x10); + hi = _mm_xor_si128(hi, r2); + hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(0ULL, ~0ULL))); + + ctx->acc = hi; +} + +static inline void aesgcm_clmul_output(aesgcm_clmul *ctx, + unsigned char *output) +{ + mm_store_be(output, _mm_xor_si128(ctx->acc, ctx->mask)); + smemclr(&ctx->acc, 16); + smemclr(&ctx->mask, 16); +} + +#define AESGCM_FLAVOUR clmul +#define AESGCM_NAME "CLMUL accelerated" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm-common.c b/crypto/aesgcm-common.c new file mode 100644 index 00000000..1e20c87b --- /dev/null +++ b/crypto/aesgcm-common.c @@ -0,0 +1,8 @@ +#include "ssh.h" +#include "aesgcm.h" + +void aesgcm_set_prefix_lengths(ssh2_mac *mac, size_t skip, size_t aad) +{ + const struct aesgcm_extra *extra = mac->vt->extra; + extra->set_prefix_lengths(mac, skip, aad); +} diff --git a/crypto/aesgcm-footer.h b/crypto/aesgcm-footer.h new file mode 100644 index 00000000..981905da --- /dev/null +++ b/crypto/aesgcm-footer.h @@ -0,0 +1,368 @@ +/* + * Common footer included by every implementation of the AES-GCM MAC. + * + * The difficult part of AES-GCM, which is done differently depending + * on what hardware acceleration is available, is the actual + * evaluation of a polynomial over GF(2^128) whose coefficients are + * 128-bit chunks of data. But preparing those chunks in the first + * place (out of the ciphertext, associated data, and an + * administrative block containing the lengths of both) is done in the + * same way no matter what technique is used for the evaluation, so + * that's centralised into this file, along with as much of the other + * functionality as possible. + * + * This footer file is #included by each implementation, but each one + * will define its own struct type for the state, so that each alloc + * function will test sizeof() a different structure, and similarly + * for free when it zeroes out the state on cleanup. + * + * The functions in the source file may be defined as 'inline' so that + * the functions in here can inline them. The 'coeff' function in + * particular probably should be, because that's called once per + * 16-byte block, so eliminating function call overheads is especially + * useful there. + * + * This footer has the following expectations from the source file + * that #includes it: + * + * - define AESGCM_FLAVOUR to be a fragment of a C identifier that + * will be included in all the function names (both the ones + * defined in the implementation source file and those in here). + * For example purposes below I'll suppose that this is 'foo'. + * + * - define AESGCM_NAME to be a string literal that will be included + * in the display name of the implementation. + * + * - define a typedef 'aesgcm_foo' to be the state structure for the + * implementation, and inside that structure, expand the macro + * AESGCM_COMMON_FIELDS defined in aesgcm.h + * + * - define the following functions: + * + * // Determine whether this implementation is available at run time + * static bool aesgcm_foo_available(void); + * + * // Set up the 'key' of the polynomial part of the MAC, that is, + * // the value at which the polynomial will be evaluated. 'var' is + * // a 16-byte data block in the byte order it comes out of AES.
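+ * // (In the GCM spec's notation this is H, the hash key, i.e. the + * // encryption of the all-zeroes block; mac_setkey below arranges + * // exactly that.)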
+ * static void aesgcm_foo_setkey_impl(aesgcm_foo *ctx, + * const unsigned char *var); + * + * // Set up at the start of evaluating an individual polynomial. + * // 'mask' is the 16-byte data block that will be XORed into the + * // output value of the polynomial, also in AES byte order. This + * // function should store 'mask' in whatever form is most + * // convenient, and initialise an accumulator to zero. + * static void aesgcm_foo_setup(aesgcm_foo *ctx, + * const unsigned char *mask); + * + * // Fold in a coefficient of the polynomial, by means of XORing + * // it into the accumulator and then multiplying the accumulator + * // by the variable passed to setkey_impl() above. + * // + * // 'coeff' points to the 16-byte block of data that the + * // polynomial coefficient will be made out of. + * // + * // You probably want to mark this function 'inline'. + * static void aesgcm_foo_coeff(aesgcm_foo *ctx, + * const unsigned char *coeff); + * + * // Generate the output MAC, by XORing the accumulator's final + * // value with the mask passed to setup() above. + * // + * // 'output' points to a 16-byte region of memory to write the + * // result to. + * static void aesgcm_foo_output(aesgcm_foo *ctx, + * unsigned char *output); + * + * - if allocation of the state structure must be done in a + * non-standard way (e.g. x86 needs this to force greater alignment + * than standard malloc provides), then #define SPECIAL_ALLOC and + * define this additional function: + * + * // Allocate a state structure, zero out its contents, and return it. + * static aesgcm_foo *aesgcm_foo_alloc(void); + * + * - if freeing must also be done in an unusual way, #define + * SPECIAL_FREE and define this function: + * + * // Zero out the state structure to avoid information leaks if the + * // memory is reused, and then free it. + * static void aesgcm_foo_free(aesgcm_foo *ctx); + */ + +#ifndef AESGCM_FLAVOUR +#error AESGCM_FLAVOUR must be defined by any module including this footer +#endif +#ifndef AESGCM_NAME +#error AESGCM_NAME must be defined by any module including this footer +#endif + +#define CONTEXT CAT(aesgcm_, AESGCM_FLAVOUR) +#define PREFIX(name) CAT(CAT(aesgcm_, AESGCM_FLAVOUR), CAT(_, name)) + +#include "aes.h" // for aes_encrypt_ecb_block + +static const char *PREFIX(mac_text_name)(ssh2_mac *mac) +{ + return "AES-GCM (" AESGCM_NAME ")"; +} + +static void PREFIX(mac_next_message)(ssh2_mac *mac) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + /* + * Make the mask value for a single MAC instance, by encrypting + * the all-zeroes word using the associated AES instance in its + * ordinary GCM fashion. This consumes the first block of + * keystream (with per-block counter equal to 1), leaving the + * second block of keystream ready to be used on the first block + * of plaintext. + */ + unsigned char buf[16]; + memset(buf, 0, 16); + ssh_cipher_encrypt(ctx->cipher, buf, 16); + PREFIX(setup)(ctx, buf); /* give it to the implementation to store */ + smemclr(buf, sizeof(buf)); +} + +static void PREFIX(mac_setkey)(ssh2_mac *mac, ptrlen key) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + /* + * Make the value of the polynomial variable, by encrypting the + * all-zeroes word using the associated AES instance in the + * special ECB mode. This is done via the special AES-specific API + * function encrypt_ecb_block, which doesn't touch the counter + * state at all. 
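+ * (So in the GCM spec's terms, 'var' is H, i.e. E_K applied to the + * all-zero block, while the mask set up by mac_next_message above is + * the first keystream block, E_K of the nonce with per-block + * counter 1.)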
+ */ + unsigned char var[16]; + memset(var, 0, 16); + aes_encrypt_ecb_block(ctx->cipher, var); + PREFIX(setkey_impl)(ctx, var); + smemclr(var, sizeof(var)); + + PREFIX(mac_next_message)(mac); /* set up mask */ +} + +static void PREFIX(mac_start)(ssh2_mac *mac) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + ctx->skipgot = ctx->aadgot = ctx->ciphertextlen = ctx->partlen = 0; +} + +/* + * Handle receiving data via the BinarySink API and turning it into a + * collection of 16-byte blocks to use as polynomial coefficients. + * + * This code is written in a fully general way, which is able to + * handle an arbitrary number of bytes at the start of the data to + * ignore completely (necessary for PuTTY integration), and an + * arbitrary number to treat as associated data, and the rest will be + * regarded as ciphertext. The stream can be interrupted at any byte + * position and resumed later; a partial block will be stored as + * necessary. + * + * At the time of writing this comment, in live use most of that + * generality isn't required: the full data is passed to this function + * in just one call. But there's no guarantee of that staying true in + * future, so we do the full deal here just in case, and the test + * vectors in cryptsuite.py will test it. (And they'll use + * set_prefix_lengths to set up different configurations from the SSH + * usage.) + */ +static void PREFIX(mac_BinarySink_write)( + BinarySink *bs, const void *blkv, size_t len) +{ + CONTEXT *ctx = BinarySink_DOWNCAST(bs, CONTEXT); + const unsigned char *blk = (const unsigned char *)blkv; + + /* + * Skip the prefix sequence number used as implicit extra data in + * SSH MACs. This is not included in the associated data field for + * GCM, because the IV incrementation policy provides its own + * sequence numbering. + */ + if (ctx->skipgot < ctx->skiplen) { + size_t n = ctx->skiplen - ctx->skipgot; + if (n > len) + n = len; + blk += n; + len -= n; + ctx->skipgot += n; + + if (len == 0) + return; + } + + /* + * Read additional authenticated data and fold it in to the MAC. + */ + while (ctx->aadgot < ctx->aadlen) { + size_t n = ctx->aadlen - ctx->aadgot; + if (n > len) + n = len; + + if (ctx->partlen || n < 16) { + /* + * Fold data into the partial block. + */ + if (n > 16 - ctx->partlen) + n = 16 - ctx->partlen; + memcpy(ctx->partblk + ctx->partlen, blk, n); + ctx->partlen += n; + } else if (n >= 16) { + /* + * Consume a whole block of AAD. + */ + PREFIX(coeff)(ctx, blk); + n = 16; + } + blk += n; + len -= n; + ctx->aadgot += n; + + if (ctx->partlen == 16) { + PREFIX(coeff)(ctx, ctx->partblk); + ctx->partlen = 0; + } + + if (ctx->aadgot == ctx->aadlen && ctx->partlen) { + memset(ctx->partblk + ctx->partlen, 0, 16 - ctx->partlen); + PREFIX(coeff)(ctx, ctx->partblk); + ctx->partlen = 0; + } + + if (len == 0) + return; + } + + /* + * Read the main ciphertext and fold it in to the MAC. + */ + while (len > 0) { + size_t n = len; + + if (ctx->partlen || n < 16) { + /* + * Fold data into the partial block. + */ + if (n > 16 - ctx->partlen) + n = 16 - ctx->partlen; + memcpy(ctx->partblk + ctx->partlen, blk, n); + ctx->partlen += n; + } else if (n >= 16) { + /* + * Consume a whole block of ciphertext. 
+ */ + PREFIX(coeff)(ctx, blk); + n = 16; + } + blk += n; + len -= n; + ctx->ciphertextlen += n; + + if (ctx->partlen == 16) { + PREFIX(coeff)(ctx, ctx->partblk); + ctx->partlen = 0; + } + } +} + +static void PREFIX(mac_genresult)(ssh2_mac *mac, unsigned char *output) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + + /* + * Consume any partial block of ciphertext remaining. + */ + if (ctx->partlen) { + memset(ctx->partblk + ctx->partlen, 0, 16 - ctx->partlen); + PREFIX(coeff)(ctx, ctx->partblk); + } + + /* + * Consume the final block giving the lengths of the AAD and ciphertext. + */ + unsigned char blk[16]; + memset(blk, 0, 16); + PUT_64BIT_MSB_FIRST(blk, ctx->aadlen * 8); + PUT_64BIT_MSB_FIRST(blk + 8, ctx->ciphertextlen * 8); + PREFIX(coeff)(ctx, blk); + + /* + * And call the implementation's output function. + */ + PREFIX(output)(ctx, output); + + smemclr(blk, sizeof(blk)); + smemclr(ctx->partblk, 16); +} + +static ssh2_mac *PREFIX(mac_new)(const ssh2_macalg *alg, ssh_cipher *cipher) +{ + const struct aesgcm_extra *extra = alg->extra; + if (!check_aesgcm_availability(extra)) + return NULL; + +#ifdef SPECIAL_ALLOC + CONTEXT *ctx = PREFIX(alloc)(); +#else + CONTEXT *ctx = snew(CONTEXT); + memset(ctx, 0, sizeof(CONTEXT)); +#endif + + ctx->mac.vt = alg; + ctx->cipher = cipher; + /* Default values for SSH-2, overridable by set_prefix_lengths for + * testcrypt purposes */ + ctx->skiplen = 4; + ctx->aadlen = 4; + BinarySink_INIT(ctx, PREFIX(mac_BinarySink_write)); + BinarySink_DELEGATE_INIT(&ctx->mac, ctx); + return &ctx->mac; +} + +static void PREFIX(set_prefix_lengths)(ssh2_mac *mac, size_t skip, size_t aad) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); + ctx->skiplen = skip; + ctx->aadlen = aad; +} + +static void PREFIX(mac_free)(ssh2_mac *mac) +{ + CONTEXT *ctx = container_of(mac, CONTEXT, mac); +#ifdef SPECIAL_FREE + PREFIX(free)(ctx); +#else + smemclr(ctx, sizeof(*ctx)); + sfree(ctx); +#endif +} + +static struct aesgcm_extra_mutable PREFIX(extra_mut); + +static const struct aesgcm_extra PREFIX(extra) = { + .check_available = PREFIX(available), + .mut = &PREFIX(extra_mut), + .set_prefix_lengths = PREFIX(set_prefix_lengths), +}; + +const ssh2_macalg CAT(ssh2_aesgcm_mac_, AESGCM_FLAVOUR) = { + .new = PREFIX(mac_new), + .free = PREFIX(mac_free), + .setkey = PREFIX(mac_setkey), + .start = PREFIX(mac_start), + .genresult = PREFIX(mac_genresult), + .next_message = PREFIX(mac_next_message), + .text_name = PREFIX(mac_text_name), + .name = "", + .etm_name = "", /* Not selectable independently */ + .len = 16, + .keylen = 0, + .extra = &PREFIX(extra), +}; diff --git a/crypto/aesgcm-neon.c b/crypto/aesgcm-neon.c new file mode 100644 index 00000000..dd7b83cc --- /dev/null +++ b/crypto/aesgcm-neon.c @@ -0,0 +1,156 @@ +/* + * Implementation of the GCM polynomial hash using Arm NEON vector + * intrinsics, in particular the multiplication operation for + * polynomials over GF(2). + * + * Follows the reference implementation in aesgcm-ref-poly.c; see + * there for comments on the underlying technique. Here the comments + * just discuss the NEON-specific details. 
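+ * (The vmull_p64/vmull_high_p64 instructions used here are the + * 'PMULL' part of the Armv8 crypto extension; aesgcm_neon_available + * below tests for them at run time via platform_pmull_neon_available.)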
+ */ + +#include "ssh.h" +#include "aesgcm.h" + +#if USE_ARM64_NEON_H +#include +#else +#include +#endif + +typedef struct aesgcm_neon { + AESGCM_COMMON_FIELDS; + poly128_t var, acc, mask; +} aesgcm_neon; + +static bool aesgcm_neon_available(void) +{ + return platform_pmull_neon_available(); +} + +/* + * The NEON types involved are: + * + * 'poly128_t' is a type that lives in a 128-bit vector register and + * represents a 128-bit polynomial over GF(2) + * + * 'poly64x2_t' is a type that lives in a 128-bit vector register and + * represents a vector of two 64-bit polynomials. These appear as + * intermediate results in some of the helper functions below, but we + * never need to actually have a variable of that type. + * + * 'poly64x1_t' is a type that lives in a 128-bit vector register and + * represents a vector of one 64-bit polynomial. + * + * That is distinct from 'poly64_t', which is a type that lives in + * ordinary scalar registers and is a typedef for an integer type. + * + * Generally here we try to work in terms of poly128_t and 64-bit + * integer types, and let everything else be handled as internal + * details of these helper functions. + */ + +/* Make a poly128_t from two halves */ +static inline poly128_t create_p128(poly64_t hi, poly64_t lo) +{ + return vreinterpretq_p128_p64( + vcombine_p64(vcreate_p64(lo), vcreate_p64(hi))); +} + +/* Retrieve the high and low halves of a poly128_t */ +static inline poly64_t hi_half(poly128_t v) +{ + return vgetq_lane_p64(vreinterpretq_p64_p128(v), 1); +} +static inline poly64_t lo_half(poly128_t v) +{ + return vgetq_lane_p64(vreinterpretq_p64_p128(v), 0); +} + +/* 64x64 -> 128 bit polynomial multiplication, the largest we can do + * in one CPU operation */ +static inline poly128_t pmul(poly64_t v, poly64_t w) +{ + return vmull_p64(v, w); +} + +/* Load and store a poly128_t in the form of big-endian bytes. This + * involves separately swapping the halves of the register and + * reversing the bytes within each half. */ +static inline poly128_t load_p128_be(const void *p) +{ + poly128_t swapped = vreinterpretq_p128_u8(vrev64q_u8(vld1q_u8(p))); + return create_p128(lo_half(swapped), hi_half(swapped)); +} +static inline void store_p128_be(void *p, poly128_t v) +{ + poly128_t swapped = create_p128(lo_half(v), hi_half(v)); + vst1q_u8(p, vrev64q_u8(vreinterpretq_u8_p128(swapped))); +} + +/* + * Key setup is just like in aesgcm-ref-poly.c. There's no point using + * vector registers to accelerate this, because it happens rarely. + */ +static void aesgcm_neon_setkey_impl(aesgcm_neon *ctx, const unsigned char *var) +{ + uint64_t hi = GET_64BIT_MSB_FIRST(var); + uint64_t lo = GET_64BIT_MSB_FIRST(var + 8); + + uint64_t bit = 1 & (hi >> 63); + hi = (hi << 1) ^ (lo >> 63); + lo = (lo << 1) ^ bit; + hi ^= 0xC200000000000000 & -bit; + + ctx->var = create_p128(hi, lo); +} + +static inline void aesgcm_neon_setup(aesgcm_neon *ctx, + const unsigned char *mask) +{ + ctx->mask = load_p128_be(mask); + ctx->acc = create_p128(0, 0); +} + +/* + * Folding a coefficient into the accumulator is done by exactly the + * algorithm in aesgcm-ref-poly.c, translated line by line. + * + * It's possible that this could be improved by some clever manoeuvres + * that avoid having to break vectors in half and put them together + * again. Patches welcome if anyone has better ideas. 
+ */ +static inline void aesgcm_neon_coeff(aesgcm_neon *ctx, + const unsigned char *coeff) +{ + ctx->acc = vaddq_p128(ctx->acc, load_p128_be(coeff)); + + poly64_t ah = hi_half(ctx->acc), al = lo_half(ctx->acc); + poly64_t bh = hi_half(ctx->var), bl = lo_half(ctx->var); + poly128_t md = pmul(ah ^ al, bh ^ bl); + poly128_t lo = pmul(al, bl); + poly128_t hi = pmul(ah, bh); + md = vaddq_p128(md, vaddq_p128(hi, lo)); + hi = create_p128(hi_half(hi), lo_half(hi) ^ hi_half(md)); + lo = create_p128(hi_half(lo) ^ lo_half(md), lo_half(lo)); + + poly128_t r1 = pmul((poly64_t)0xC200000000000000, lo_half(lo)); + hi = create_p128(hi_half(hi), lo_half(hi) ^ lo_half(lo) ^ hi_half(r1)); + lo = create_p128(hi_half(lo) ^ lo_half(r1), lo_half(lo)); + + poly128_t r2 = pmul((poly64_t)0xC200000000000000, hi_half(lo)); + hi = vaddq_p128(hi, r2); + hi = create_p128(hi_half(hi) ^ hi_half(lo), lo_half(hi)); + + ctx->acc = hi; +} + +static inline void aesgcm_neon_output(aesgcm_neon *ctx, unsigned char *output) +{ + store_p128_be(output, vaddq_p128(ctx->acc, ctx->mask)); + ctx->acc = create_p128(0, 0); + ctx->mask = create_p128(0, 0); +} + +#define AESGCM_FLAVOUR neon +#define AESGCM_NAME "NEON accelerated" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm-ref-poly.c b/crypto/aesgcm-ref-poly.c new file mode 100644 index 00000000..f6ca0fa5 --- /dev/null +++ b/crypto/aesgcm-ref-poly.c @@ -0,0 +1,364 @@ +/* + * Implementation of the GCM polynomial hash in pure software, but + * based on a primitive that performs 64x64->128 bit polynomial + * multiplication over GF(2). + * + * This implementation is technically correct (should even be + * side-channel safe as far as I can see), but it's hopelessly slow, + * so no live SSH connection should ever use it. Therefore, it's + * deliberately not included in the lists in aesgcm-select.c. For pure + * software GCM in live use, you want aesgcm-sw.c, and that's what the + * selection system will choose. + * + * However, this implementation _is_ made available to testcrypt, so + * all the GCM tests in cryptsuite.py are run over this as well as the + * other implementations. + * + * The reason why this code exists at all is to act as a reference for + * GCM implementations that use a CPU-specific polynomial multiply + * intrinsic or asm statement. This version will run on whatever + * platform you're trying to port to, and will generate all the same + * intermediate results you expect the CPU-specific one to go through. + * So you can insert parallel diagnostics in this version and in your + * new version, to see where the two diverge. + * + * Also, this version is a good place to put long comments explaining + * the entire strategy of this implementation and its rationale. That + * avoids those comments having to be duplicated in the multiple + * platform-specific implementations, which can focus on commenting + * the way the local platform's idioms match up to this version, and + * refer to this file for the explanation of the underlying technique. + */ + +#include "ssh.h" +#include "aesgcm.h" + +/* + * Store a 128-bit value in the most convenient form standard C will + * let us, namely two uint64_t giving its most and least significant + * halves. + */ +typedef struct { + uint64_t hi, lo; +} value128_t; + +typedef struct aesgcm_ref_poly { + AESGCM_COMMON_FIELDS; + + /* + * The state of our GCM implementation is represented entirely by + * three 128-bit values: + */ + + /* + * The value at which we're evaluating the polynomial. The GCM + * spec calls this 'H'. 
It's defined once at GCM key setup time, + * by encrypting the all-zeroes value with our block cipher. + */ + value128_t var; + + /* + * Accumulator containing the result of evaluating the polynomial + * so far. + */ + value128_t acc; + + /* + * The mask value that is XORed into the final value of 'acc' to + * produce the output MAC. This is different for every MAC + * generated, because its purpose is to ensure that no information + * gathered from a legal MAC can be used to help the forgery of + * another one, and that comparing two legal MACs gives you no + * useful information about the text they cover, because in each + * case, the masks are different and pseudorandom. + */ + value128_t mask; +} aesgcm_ref_poly; + +static bool aesgcm_ref_poly_available(void) +{ + return true; /* pure software implementation, always available */ +} + +/* + * Primitive function that takes two uint64_t values representing + * polynomials, multiplies them, and returns a value128_t struct + * containing the full product. + * + * Because the input polynomials have maximum degree 63, the output + * has max degree 63+63 = 127, not 128. As a result, the topmost bit + * of the output is always zero. + * + * The inside of this function is implemented in the simplest way, + * with no attention paid to performance. The important feature of + * this implementation is not what's _inside_ this function, but + * what's _outside_ it: aesgcm_ref_poly_coeff() tries to minimise the + * number of these operations. + */ +static value128_t pmul(uint64_t x, uint64_t y) +{ + value128_t r; + r.hi = r.lo = 0; + + uint64_t bit = 1 & y; + r.lo ^= x & -bit; + + for (unsigned i = 1; i < 64; i++) { + bit = 1 & (y >> i); + uint64_t z = x & -bit; + r.lo ^= z << i; + r.hi ^= z >> (64-i); + } + + return r; +} + +/* + * OK, I promised a long comment explaining what's going on in this + * implementation, and now it's time. + * + * The way AES-GCM _itself_ is defined by its own spec, its finite + * field consists of polynomials over GF(2), constrained to be 128 + * bits long by reducing them modulo P = x^128 + x^7 + x^2 + x + 1. + * Using the usual binary representation in which bit i is the + * coefficient of x^i, that's 0x100000000000000000000000000000087. + * + * That is, whenever you multiply two polynomials and find a term + * x^128, you can replace it with x^7+x^2+x+1. More generally, + * x^(128+n) can be replaced with x^(7+n)+x^(2+n)+x^(1+n)+x^n. In + * binary terms, a 1 bit at the 128th position or above is replaced by + * 0x87 exactly 128 bits further down. + * + * So you'd think that multiplying two 128-bit polynomials mod P would + * be a matter of generating their full 256-bit product in the form of + * four words HI:HU:LU:LO, and then reducing it mod P by a two-stage + * process of computing HI * 0x87 and XORing it into HU:LU, then + * computing HU * 0x87 and XORing it into LU:LO. + * + * But it's not! + * + * The reason why not is because when AES-GCM is applied to SSH, + * somehow the _bit_ order got reversed. A 16-byte block of data in + * memory is converted into a polynomial by regarding bit 7 of the + * first byte as the constant term, bit 0 of the first byte as the x^7 + * coefficient, ..., bit 0 of the last byte as the x^127 coefficient. + * So if we load that 16-byte block as a big-endian 128-bit integer, + * we end up with it representing a polynomial back to front, with the + * constant term at the top and the x^127 bit at the bottom. + * + * Well, that _shouldn't_ be a problem, right? 
The nice thing about + * polynomial multiplication is that it's essentially reversible. If + * you reverse the order of the coefficients of two polynomials, then + * the product of the reversed polys is exactly the reversal of the + * product of the original ones. So we bit-reverse our modulo + * polynomial to get 0x1c2000000000000000000000000000001, and we just + * pretend we're working mod that instead. + * + * And that is basically what we're about to do. But there's one + * complication, that arises from the semantics of the polynomial + * multiplication function we're using as our primitive operation. + * + * That function multiplies two polynomials of degree at most 63, to + * give one with degree at most 127. So it returns a 128-bit integer + * whose low bit is the constant term, and its very highest bit is 0, + * and its _next_ highest bit is the product of the high bits of the + * two inputs. + * + * That operation is _not_ symmetric in bit-reversal. If you give it + * the 64-bit-wise reversals of two polynomials P,Q, then its output + * is not the 128-bit-wise reversal of their product PQ, because that + * would require the constant term of PQ to appear in bit 127 of the + * output, and in fact it appears in bit 126. So in fact, what we get + * is offset by one bit from where we'd like it: it's the bit-reversal + * of PQx, not of PQ. + * + * There's more than one way we could fix this. One approach would be + * to work around this off-by-one error by taking the 128-bit output + * of pmul() and shifting it left by a bit. Then it _is_ the bitwise + * reversal of the 128-bit value we'd have wanted to get, and we could + * implement the exact algorithm described above, in fully + * bit-reversed form. + * + * But a 128-bit left shift is not a trivial operation in the vector + * architectures that this code is acting as a reference for. So we'd + * prefer to find a fix that doesn't need compensation during the + * actual per-block multiplication step. + * + * If we did the obvious thing anyway - compute the unshifted 128-bit + * product representing the bit-reversal of PQx, and reduce it mod + * 0x1c2000000000000000000000000000001 - then we'd get a result which + * is exactly what we want, except that it's got a factor of x in it + * that we need to get rid of. The obvious answer is to divide by x + * (which is legal and safe, since mod P, x is invertible). + * + * Dividing a 128-bit polynomial by x is easy in principle. Shift left + * (because we're still bit-reversed), and if that shifted a 1 bit off + * the top, XOR 0xc2000000000000000000000000000001 into the remaining + * 128 bits. + * + * But we're back to having that expensive left shift. What can we do + * about that? + * + * Happily, one of the two input values to our per-block multiply + * operation is fixed! It's given to us at key setup stage, and never + * changed until the next rekey. So if at key setup time we do this + * left shift business _once_, replacing the logical value Q with Q/x, + * then that exactly cancels out the unwanted factor of x that shows + * up in our multiply operation. And now it doesn't matter that it's + * expensive (in the sense of 'a few more instructions than you'd + * like'), because it only happens once per SSH key exchange, not once + * per 16 bytes of data transferred. + */ + +static void aesgcm_ref_poly_setkey_impl(aesgcm_ref_poly *ctx, + const unsigned char *var) +{ + /* + * Key setup function. We copy the provided 16-byte 'var' + * value into our polynomial. 
But, as discussed above, we also + * need to divide it by x. + */ + + ctx->var.hi = GET_64BIT_MSB_FIRST(var); + ctx->var.lo = GET_64BIT_MSB_FIRST(var + 8); + + uint64_t bit = 1 & (ctx->var.hi >> 63); + ctx->var.hi = (ctx->var.hi << 1) ^ (ctx->var.lo >> 63); + ctx->var.lo = (ctx->var.lo << 1) ^ bit; + ctx->var.hi ^= 0xC200000000000000 & -bit; +} + +static inline void aesgcm_ref_poly_setup(aesgcm_ref_poly *ctx, + const unsigned char *mask) +{ + /* + * Set up to start evaluating a particular MAC. Copy in the mask + * value for this packet, and initialise acc to zero. + */ + + ctx->mask.hi = GET_64BIT_MSB_FIRST(mask); + ctx->mask.lo = GET_64BIT_MSB_FIRST(mask + 8); + ctx->acc.hi = ctx->acc.lo = 0; +} + +static inline void aesgcm_ref_poly_coeff(aesgcm_ref_poly *ctx, + const unsigned char *coeff) +{ + /* + * One step of Horner's-rule polynomial evaluation (with each + * coefficient of the polynomial being an element of GF(2^128), + * itself composed of polynomials over GF(2) mod P). + * + * We take our accumulator value, add the incoming coefficient + * (which means XOR, by GF(2) rules), and multiply by x (that is, + * 'var'). + */ + + /* + * The addition first, which is easy. + */ + ctx->acc.hi ^= GET_64BIT_MSB_FIRST(coeff); + ctx->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8); + + /* + * First, create the 256-bit product of the two 128-bit + * polynomials over GF(2) stored in ctx->acc and ctx->var. + * + * The obvious way to do this is by four smaller multiplications + * of 64x64 -> 128 bits. But we can do better using a single + * iteration of the Karatsuba technique, which is actually more + * convenient in polynomials over GF(2) than it is in integers, + * because there aren't all those awkward carries complicating + * things. + * + * Letting B denote x^64, and imagining our two inputs are split + * up into 64-bit chunks ah,al,bh,bl, the product we want is + * + * (ah B + al) (bh B + bl) + * = (ah bh) B^2 + (al bh + ah bl) B + (al bl) + * + * which looks like four smaller multiplications of each of ah,al + * with each of bh,bl. But Karatsuba's trick is to first compute + * + * (ah + al) (bh + bl) + * = ah bh + al bh + ah bl + al bl + * + * and then subtract the terms (ah bh) and (al bl), which we had + * to compute anyway, to get the middle two terms (al bh + ah bl) + * which are our coefficient of B. + * + * This involves more bookkeeping instructions like XORs, but with + * any luck those are faster than the main multiplication. + */ + uint64_t ah = ctx->acc.hi, al = ctx->acc.lo; + uint64_t bh = ctx->var.hi, bl = ctx->var.lo; + /* Compute the outer two terms */ + value128_t lo = pmul(al, bl); + value128_t hi = pmul(ah, bh); + /* Compute the trick product (ah+al)(bh+bl) */ + value128_t md = pmul(ah ^ al, bh ^ bl); + /* Subtract off the outer two terms to get md = al bh + ah bl */ + md.hi ^= lo.hi ^ hi.hi; + md.lo ^= lo.lo ^ hi.lo; + /* And add that into the 256-bit value given by hi * x^128 + lo */ + lo.hi ^= md.lo; + hi.lo ^= md.hi; + + /* + * OK. Now hi and lo together make up the 256-bit full product. + * Now reduce it mod the reversal of the GCM modulus polynomial. + * As discussed above, that's 0x1c2000000000000000000000000000001. + * + * We want the _topmost_ 128 bits of this, because we're working + * in a bit-reversed world. So what we fundamentally want to do is + * to take our 256-bit product, and add to it the product of its + * low 128 bits with 0x1c2000000000000000000000000000001. Then the + * top 128 bits will be the output we want. 
+ * + * Since there's no carrying in this arithmetic, it's enough to + * discard the 1 bit at the bottom of that, because it won't + * affect anything in the half we're keeping. So it's enough to + * add 0x1c2000000000000000000000000000000 * lo to (hi:lo). + * + * We can only work with 64 bits at a time, so the first thing we + * do is to break that up: + * + * - add 0x1c200000000000000 * lo.lo to (hi.lo : lo.hi) + * - add 0x1c200000000000000 * lo.hi to (hi.hi : hi.lo) + * + * But there's still a problem: 0x1c200000000000000 is just too + * _big_ to fit in 64 bits. So we have to break it up into the low + * 64 bits 0xc200000000000000, and its leading 1. So each of those + * steps of the form 'add 0x1c200000000000000 * x to y:z' becomes + * 'add 0xc200000000000000 * x to y:z' followed by 'add x to y', + * the latter step dealing with the leading 1. + */ + + /* First step, adding to the middle two words of our number. After + * this the lowest word (in lo.lo) is ignored. */ + value128_t r1 = pmul(0xC200000000000000, lo.lo); + hi.lo ^= r1.hi ^ lo.lo; + lo.hi ^= r1.lo; + + /* Second of those steps, adding to the top two words, and + * discarding lo.hi. */ + value128_t r2 = pmul(0xC200000000000000, lo.hi); + hi.hi ^= r2.hi ^ lo.hi; + hi.lo ^= r2.lo; + + /* Now 'hi' is precisely what we have left. */ + ctx->acc = hi; +} + +static inline void aesgcm_ref_poly_output(aesgcm_ref_poly *ctx, + unsigned char *output) +{ + PUT_64BIT_MSB_FIRST(output, ctx->acc.hi ^ ctx->mask.hi); + PUT_64BIT_MSB_FIRST(output + 8, ctx->acc.lo ^ ctx->mask.lo); + smemclr(&ctx->acc, 16); + smemclr(&ctx->mask, 16); +} + +#define AESGCM_FLAVOUR ref_poly +#define AESGCM_NAME "reference polynomial-based implementation" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm-select.c b/crypto/aesgcm-select.c new file mode 100644 index 00000000..eefe7148 --- /dev/null +++ b/crypto/aesgcm-select.c @@ -0,0 +1,38 @@ +#include "ssh.h" +#include "aesgcm.h" + +static ssh2_mac *aesgcm_mac_selector_new(const ssh2_macalg *alg, + ssh_cipher *cipher) +{ + static const ssh2_macalg *const real_algs[] = { +#if HAVE_CLMUL + &ssh2_aesgcm_mac_clmul, +#endif +#if HAVE_NEON_PMULL + &ssh2_aesgcm_mac_neon, +#endif + &ssh2_aesgcm_mac_sw, + NULL, + }; + + for (size_t i = 0; real_algs[i]; i++) { + const ssh2_macalg *alg = real_algs[i]; + const struct aesgcm_extra *alg_extra = + (const struct aesgcm_extra *)alg->extra; + if (check_aesgcm_availability(alg_extra)) + return ssh2_mac_new(alg, cipher); + } + + /* We should never reach the NULL at the end of the list, because + * the last non-NULL entry should be software-only GCM, which is + * always available. */ + unreachable("aesgcm_select ran off the end of its list"); +} + +const ssh2_macalg ssh2_aesgcm_mac = { + .new = aesgcm_mac_selector_new, + .name = "", + .etm_name = "", /* Not selectable independently */ + .len = 16, + .keylen = 0, +}; diff --git a/crypto/aesgcm-sw.c b/crypto/aesgcm-sw.c new file mode 100644 index 00000000..f322ae30 --- /dev/null +++ b/crypto/aesgcm-sw.c @@ -0,0 +1,145 @@ +/* + * Implementation of the GCM polynomial hash in pure software. + * + * I don't know of a faster way to do this in a side-channel safe + * manner than by precomputing a giant table and iterating over the + * whole thing. 
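+ * (Concretely, the cost is the value128_t table[128] below: 2kB of + * precomputed data per key, all of which is read on every block.)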
+ *
+ * The original GCM reference suggests that you precompute the effects
+ * of multiplying a 128-bit value by the fixed key, in the form of a
+ * table indexed by some number of bits of the input value, so that
+ * you end up computing something of the form
+ *
+ *   table1[x & 0xFF] ^ table2[(x>>8) & 0xFF] ^ ... ^ table16[(x>>120) & 0xFF]
+ *
+ * But that was obviously written before cache and timing leaks were
+ * known about. What's a time-safe approach?
+ *
+ * Well, the above technique isn't fixed to 8 bits of input per table.
+ * You could trade off the number of tables against the size of each
+ * table. At one extreme of this tradeoff, you have 128 tables each
+ * indexed by a single input bit - which is to say, you have 128
+ * values, each 128 bits wide, and you XOR together the subset of
+ * those values corresponding to the input bits, which you can do by
+ * making a bitmask out of each input bit using standard constant-
+ * time-coding bit twiddling techniques.
+ *
+ * That's pretty unpleasant when GCM is supposed to be a fast
+ * algorithm, but I don't know of a better approach that meets current
+ * security standards! Suggestions welcome, if they can get through
+ * testsc.
+ */
+
+#include "ssh.h"
+#include "aesgcm.h"
+
+/*
+ * Store a 128-bit value in the most convenient form standard C will
+ * let us, namely two uint64_t giving its most and least significant
+ * halves.
+ */
+typedef struct {
+    uint64_t hi, lo;
+} value128_t;
+
+typedef struct aesgcm_sw {
+    AESGCM_COMMON_FIELDS;
+
+    /* Accumulator for the current evaluation, and mask that will be
+     * XORed in at the end. */
+    value128_t acc, mask;
+
+    /*
+     * Table of values to XOR in for each bit, representing the effect
+     * of multiplying by the fixed key. The key itself doesn't need to
+     * be stored separately, because it's never used. (However, it is
+     * also the last entry in the table, so if you _do_ need it, there
+     * it is: table[127] is the key itself, table[126] is the key
+     * times x, and so on down to table[0].)
+     *
+     * Table is indexed from the low bit of the input upwards.
+     */
+    value128_t table[128];
+} aesgcm_sw;
+
+static bool aesgcm_sw_available(void)
+{
+    return true; /* pure software implementation, always available */
+}
+
+static void aesgcm_sw_setkey_impl(aesgcm_sw *gcm, const unsigned char *var)
+{
+    value128_t v;
+    v.hi = GET_64BIT_MSB_FIRST(var);
+    v.lo = GET_64BIT_MSB_FIRST(var + 8);
+
+    /*
+     * Prepare the table. This has to be done in reverse order, so
+     * that the original value of the variable corresponds to
+     * table[127], because AES-GCM works in the bit-reversal of its
+     * logical specification so that's where the logical constant term
+     * lives. (See more detailed comment in aesgcm-ref-poly.c.)
+     */
+    for (size_t i = 0; i < 128; i++) {
+        gcm->table[127 - i] = v;
+
+        /* Multiply v by x, which means shifting right (bit reversal
+         * again) and then adding 0xE1 at the top if we shifted a 1 out.
*/ + uint64_t lobit = v.lo & 1; + v.lo = (v.lo >> 1) ^ (v.hi << 63); + v.hi = (v.hi >> 1) ^ (0xE100000000000000ULL & -lobit); + } +} + +static inline void aesgcm_sw_setup(aesgcm_sw *gcm, const unsigned char *mask) +{ + gcm->mask.hi = GET_64BIT_MSB_FIRST(mask); + gcm->mask.lo = GET_64BIT_MSB_FIRST(mask + 8); + gcm->acc.hi = gcm->acc.lo = 0; +} + +static inline void aesgcm_sw_coeff(aesgcm_sw *gcm, const unsigned char *coeff) +{ + /* XOR in the new coefficient */ + gcm->acc.hi ^= GET_64BIT_MSB_FIRST(coeff); + gcm->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8); + + /* And now just loop over the bits of acc, making up a new value + * by XORing together the entries of 'table' corresponding to set + * bits. */ + + value128_t out; + out.lo = out.hi = 0; + + const value128_t *tableptr = gcm->table; + + for (size_t i = 0; i < 64; i++) { + uint64_t bit = 1 & gcm->acc.lo; + gcm->acc.lo >>= 1; + uint64_t mask = -bit; + out.hi ^= mask & tableptr->hi; + out.lo ^= mask & tableptr->lo; + tableptr++; + } + for (size_t i = 0; i < 64; i++) { + uint64_t bit = 1 & gcm->acc.hi; + gcm->acc.hi >>= 1; + uint64_t mask = -bit; + out.hi ^= mask & tableptr->hi; + out.lo ^= mask & tableptr->lo; + tableptr++; + } + + gcm->acc = out; +} + +static inline void aesgcm_sw_output(aesgcm_sw *gcm, unsigned char *output) +{ + PUT_64BIT_MSB_FIRST(output, gcm->acc.hi ^ gcm->mask.hi); + PUT_64BIT_MSB_FIRST(output + 8, gcm->acc.lo ^ gcm->mask.lo); + smemclr(&gcm->acc, 16); + smemclr(&gcm->mask, 16); +} + +#define AESGCM_FLAVOUR sw +#define AESGCM_NAME "unaccelerated" +#include "aesgcm-footer.h" diff --git a/crypto/aesgcm.h b/crypto/aesgcm.h new file mode 100644 index 00000000..48077004 --- /dev/null +++ b/crypto/aesgcm.h @@ -0,0 +1,44 @@ +/* + * Common parts of the state structure for AESGCM MAC implementations. + */ +#define AESGCM_COMMON_FIELDS \ + ssh_cipher *cipher; \ + unsigned char partblk[16]; \ + size_t skiplen, aadlen, ciphertextlen; \ + size_t skipgot, aadgot, partlen; \ + BinarySink_IMPLEMENTATION; \ + ssh2_mac mac + +/* + * The 'extra' structure is used to include information about how to + * check if a given implementation is available at run time, and + * whether we've already checked. + */ +struct aesgcm_extra_mutable; +struct aesgcm_extra { + /* Function to check availability. Might be expensive, so we don't + * want to call it more than once. */ + bool (*check_available)(void); + + /* Point to a writable substructure. */ + struct aesgcm_extra_mutable *mut; + + /* + * Extra API function specific to this MAC type that allows + * testcrypt to set more general lengths for skiplen and aadlen. + */ + void (*set_prefix_lengths)(ssh2_mac *mac, size_t skip, size_t aad); +}; +struct aesgcm_extra_mutable { + bool checked_availability; + bool is_available; +}; +static inline bool check_aesgcm_availability(const struct aesgcm_extra *extra) +{ + if (!extra->mut->checked_availability) { + extra->mut->is_available = extra->check_available(); + extra->mut->checked_availability = true; + } + + return extra->mut->is_available; +} diff --git a/putty.h b/putty.h index 727a07f7..175f5969 100644 --- a/putty.h +++ b/putty.h @@ -453,6 +453,7 @@ enum { CIPHER_DES, CIPHER_ARCFOUR, CIPHER_CHACHA20, + CIPHER_AESGCM, CIPHER_MAX /* no. 
ciphers (inc warn) */ }; diff --git a/settings.c b/settings.c index 40e26b8d..cc2176ce 100644 --- a/settings.c +++ b/settings.c @@ -17,6 +17,7 @@ static const struct keyvalwhere ciphernames[] = { { "aes", CIPHER_AES, -1, -1 }, { "chacha20", CIPHER_CHACHA20, CIPHER_AES, +1 }, + { "aesgcm", CIPHER_AESGCM, CIPHER_CHACHA20, +1 }, { "3des", CIPHER_3DES, -1, -1 }, { "WARN", CIPHER_WARN, -1, -1 }, { "des", CIPHER_DES, -1, -1 }, diff --git a/ssh.h b/ssh.h index e14b44c6..d3ee5065 100644 --- a/ssh.h +++ b/ssh.h @@ -1072,6 +1072,10 @@ extern const ssh_cipheralg ssh_aes256_sdctr; extern const ssh_cipheralg ssh_aes256_sdctr_ni; extern const ssh_cipheralg ssh_aes256_sdctr_neon; extern const ssh_cipheralg ssh_aes256_sdctr_sw; +extern const ssh_cipheralg ssh_aes256_gcm; +extern const ssh_cipheralg ssh_aes256_gcm_ni; +extern const ssh_cipheralg ssh_aes256_gcm_neon; +extern const ssh_cipheralg ssh_aes256_gcm_sw; extern const ssh_cipheralg ssh_aes256_cbc; extern const ssh_cipheralg ssh_aes256_cbc_ni; extern const ssh_cipheralg ssh_aes256_cbc_neon; @@ -1080,6 +1084,10 @@ extern const ssh_cipheralg ssh_aes192_sdctr; extern const ssh_cipheralg ssh_aes192_sdctr_ni; extern const ssh_cipheralg ssh_aes192_sdctr_neon; extern const ssh_cipheralg ssh_aes192_sdctr_sw; +extern const ssh_cipheralg ssh_aes192_gcm; +extern const ssh_cipheralg ssh_aes192_gcm_ni; +extern const ssh_cipheralg ssh_aes192_gcm_neon; +extern const ssh_cipheralg ssh_aes192_gcm_sw; extern const ssh_cipheralg ssh_aes192_cbc; extern const ssh_cipheralg ssh_aes192_cbc_ni; extern const ssh_cipheralg ssh_aes192_cbc_neon; @@ -1088,6 +1096,10 @@ extern const ssh_cipheralg ssh_aes128_sdctr; extern const ssh_cipheralg ssh_aes128_sdctr_ni; extern const ssh_cipheralg ssh_aes128_sdctr_neon; extern const ssh_cipheralg ssh_aes128_sdctr_sw; +extern const ssh_cipheralg ssh_aes128_gcm; +extern const ssh_cipheralg ssh_aes128_gcm_ni; +extern const ssh_cipheralg ssh_aes128_gcm_neon; +extern const ssh_cipheralg ssh_aes128_gcm_sw; extern const ssh_cipheralg ssh_aes128_cbc; extern const ssh_cipheralg ssh_aes128_cbc_ni; extern const ssh_cipheralg ssh_aes128_cbc_neon; @@ -1103,6 +1115,7 @@ extern const ssh2_ciphers ssh2_aes; extern const ssh2_ciphers ssh2_blowfish; extern const ssh2_ciphers ssh2_arcfour; extern const ssh2_ciphers ssh2_ccp; +extern const ssh2_ciphers ssh2_aesgcm; extern const ssh_hashalg ssh_md5; extern const ssh_hashalg ssh_sha1; extern const ssh_hashalg ssh_sha1_ni; @@ -1163,12 +1176,20 @@ extern const ssh2_macalg ssh_hmac_sha1_96; extern const ssh2_macalg ssh_hmac_sha1_96_buggy; extern const ssh2_macalg ssh_hmac_sha256; extern const ssh2_macalg ssh2_poly1305; +extern const ssh2_macalg ssh2_aesgcm_mac; +extern const ssh2_macalg ssh2_aesgcm_mac_sw; +extern const ssh2_macalg ssh2_aesgcm_mac_ref_poly; +extern const ssh2_macalg ssh2_aesgcm_mac_clmul; +extern const ssh2_macalg ssh2_aesgcm_mac_neon; extern const ssh_compression_alg ssh_zlib; /* Special constructor: BLAKE2b can be instantiated with any hash * length up to 128 bytes */ ssh_hash *blake2b_new_general(unsigned hashlen); +/* Special test function for AES-GCM */ +void aesgcm_set_prefix_lengths(ssh2_mac *mac, size_t skip, size_t aad); + /* * On some systems, you have to detect hardware crypto acceleration by * asking the local OS API rather than OS-agnostically asking the CPU @@ -1176,6 +1197,7 @@ ssh_hash *blake2b_new_general(unsigned hashlen); * platform subdirectory. 
 */
 bool platform_aes_neon_available(void);
+bool platform_pmull_neon_available(void);
 bool platform_sha256_neon_available(void);
 bool platform_sha1_neon_available(void);
 bool platform_sha512_neon_available(void);
diff --git a/ssh/bpp2.c b/ssh/bpp2.c
index a3ab99f9..e019dd2e 100644
--- a/ssh/bpp2.c
+++ b/ssh/bpp2.c
@@ -132,6 +132,12 @@ void ssh2_bpp_new_outgoing_crypto(
         s->out.etm_mode = etm_mode;
         if (mac) {
             s->out.mac = ssh2_mac_new(mac, s->out.cipher);
+            /*
+             * Important that mac_setkey comes after cipher_setkey,
+             * because in the case where the MAC makes use of the cipher
+             * (e.g. AES-GCM), it will need the cipher to be keyed
+             * already.
+             */
             ssh2_mac_setkey(s->out.mac, make_ptrlen(mac_key, mac->keylen));

             bpp_logevent("Initialised %s outbound MAC algorithm%s%s",
@@ -189,6 +195,7 @@ void ssh2_bpp_new_incoming_crypto(
         s->in.etm_mode = etm_mode;
         if (mac) {
             s->in.mac = ssh2_mac_new(mac, s->in.cipher);
+            /* MAC setkey has to follow cipher, just as in outgoing_crypto above */
             ssh2_mac_setkey(s->in.mac, make_ptrlen(mac_key, mac->keylen));

             bpp_logevent("Initialised %s inbound MAC algorithm%s%s",
diff --git a/ssh/transport2.c b/ssh/transport2.c
index 705df466..aba8cd0b 100644
--- a/ssh/transport2.c
+++ b/ssh/transport2.c
@@ -600,6 +600,9 @@ static void ssh2_write_kexinit_lists(
               case CIPHER_CHACHA20:
                 preferred_ciphers[n_preferred_ciphers++] = &ssh2_ccp;
                 break;
+              case CIPHER_AESGCM:
+                preferred_ciphers[n_preferred_ciphers++] = &ssh2_aesgcm;
+                break;
               case CIPHER_WARN:
                 /* Flag for later. Don't bother if it's the last in
                  * the list. */
diff --git a/test/cryptsuite.py b/test/cryptsuite.py
index 35114a16..69b492e8 100755
--- a/test/cryptsuite.py
+++ b/test/cryptsuite.py
@@ -145,6 +145,11 @@ def get_aes_impls():
             for impl in get_implementations("aes128_cbc")
             if impl.startswith("aes128_cbc_")]

+def get_aesgcm_impls():
+    return [impl.split("_", 1)[1]
+            for impl in get_implementations("aesgcm")
+            if impl.startswith("aesgcm_")]
+
 class MyTestBase(unittest.TestCase):
     "Intermediate class that adds useful helper methods."
     def assertEqualBin(self, x, y):
@@ -2933,6 +2938,187 @@ Private-MAC: 5b1f6f4cc43eb0060d2c3e181bc0129343adba2b
         per_base_keytype_tests('rsa', run_ca_rsa_tests=True, ca_signflags=2)
         per_base_keytype_tests('rsa', run_ca_rsa_tests=True, ca_signflags=4)

+    def testAESGCMBlockBoundaries(self):
+        # For standard AES-GCM test vectors, see the separate tests in
+        # standard_test_vectors.testAESGCM. This function will test
+        # the local interface, including the skip length and the
+        # machinery for incremental MAC update.
+
+        def aesgcm(key, iv, aes_impl, gcm_impl):
+            c = ssh_cipher_new('aes{:d}_gcm_{}'.format(8*len(key), aes_impl))
+            if c is None: return None, None # skip if HW AES not available
+            m = ssh2_mac_new('aesgcm_{}'.format(gcm_impl), c)
+            if m is None: return None, None # skip if HW GCM not available
+            c.setkey(key)
+            c.setiv(iv + b'\0'*4)
+            m.setkey(b'')
+            return c, m
+
+        def test_one(aes_impl, gcm_impl):
+            # An actual test from a session with OpenSSH, which
+            # demonstrates that the implementation in practice matches up
+            # to what the test vectors say. This is its SSH2_MSG_EXT_INFO
+            # packet.
+            key = unhex('dbf98b2f56c83fb2f9476aa876511225')
+            iv = unhex('9af15ecccf2bacaaa9625a6a')
+            plain = unhex('1007000000020000000f736572766572'
+                          '2d7369672d616c6773000000db737368'
+                          '2d656432353531392c736b2d7373682d'
+                          '65643235353139406f70656e7373682e'
+                          '636f6d2c7373682d7273612c7273612d'
+                          '736861322d3235362c7273612d736861'
+                          '322d3531322c7373682d6473732c6563'
+                          '6473612d736861322d6e697374703235'
+                          '362c65636473612d736861322d6e6973'
+                          '74703338342c65636473612d73686132'
+                          '2d6e697374703532312c736b2d656364'
+                          '73612d736861322d6e69737470323536'
+                          '406f70656e7373682e636f6d2c776562'
+                          '617574686e2d736b2d65636473612d73'
+                          '6861322d6e69737470323536406f7065'
+                          '6e7373682e636f6d0000001f7075626c'
+                          '69636b65792d686f7374626f756e6440'
+                          '6f70656e7373682e636f6d0000000130'
+                          '5935130804ad4b19ed2789210290c438')
+            aad = unhex('00000130')
+            cipher = unhex('c4b88f35c1ef8aa6225033c3f185d648'
+                           '3c485d84930d5846f7851daacbff49d5'
+                           '8cf72169fca7ab3c170376df65dd69de'
+                           'c40a94c6b8e3da6d61161ab19be27466'
+                           '02e0dfa3330faae291ef4173a20e87a4'
+                           'd40728c645baa72916c1958531ef7b54'
+                           '27228513e53005e6d17b9bb384b8d8c1'
+                           '92b8a10b731459eed5a0fb120c283412'
+                           'e34445981df1257f1c35a06196731fed'
+                           '1b3115f419e754de0b634bf68768cb02'
+                           '29e70bb2259cedb5101ff6a4ac19aaad'
+                           '46f1c30697361b45d6c152c3069cee6b'
+                           'd46e9785d65ea6bf7fca41f0ac3c8e93'
+                           'ce940b0059c39d51e49c17f60d48d633'
+                           '5bae4402faab61d8d65221b24b400e65'
+                           '89f941ff48310231a42641851ea00832'
+                           '2c2d188f4cc6a4ec6002161c407d0a92'
+                           'f1697bb319fbec1ca63fa8e7ac171c85'
+                           '5b60142bfcf4e5b0a9ada3451799866e')
+
+            c, m = aesgcm(key, iv, aes_impl, gcm_impl)
+            if m is None: return # skip if this impl combination unavailable
+            len_dec = c.decrypt_length(aad, 123)
+            self.assertEqual(len_dec, aad) # length not actually encrypted
+            m.start()
+            # We expect 4 bytes skipped (the sequence number that
+            # ChaCha20-Poly1305 wants at the start of its MAC), and 4
+            # bytes AAD. These were initialised by the call to
+            # decrypt_length.
+            m.update(b'fake' + aad + cipher)
+            self.assertEqualBin(m.genresult(),
+                                unhex('4a5a6d57d54888b4e58c57a96e00b73a'))
+            self.assertEqualBin(c.decrypt(cipher), plain)
+
+            c, m = aesgcm(key, iv, aes_impl, gcm_impl)
+            len_enc = c.encrypt_length(aad, 123)
+            self.assertEqual(len_enc, aad) # length not actually encrypted
+            self.assertEqualBin(c.encrypt(plain), cipher)
+
+            # Test incremental update.
+            def testIncremental(skiplen, aad, plain):
+                key, iv = b'SomeRandomKeyVal', b'SomeRandomIV'
+                mac_input = b'x' * skiplen + aad + plain
+
+                c, m = aesgcm(key, iv, aes_impl, gcm_impl)
+                aesgcm_set_prefix_lengths(m, skiplen, len(aad))
+
+                m.start()
+                m.update(mac_input)
+                reference_mac = m.genresult()
+
+                # Break the input just once, at each possible byte
+                # position.
+                for i in range(1, len(mac_input)):
+                    c.setiv(iv + b'\0'*4)
+                    m.setkey(b'')
+                    aesgcm_set_prefix_lengths(m, skiplen, len(aad))
+                    m.start()
+                    m.update(mac_input[:i])
+                    m.update(mac_input[i:])
+                    self.assertEqualBin(m.genresult(), reference_mac)
+
+                # Feed the entire input in a byte at a time.
+                c.setiv(iv + b'\0'*4)
+                m.setkey(b'')
+                aesgcm_set_prefix_lengths(m, skiplen, len(aad))
+                m.start()
+                for i in range(len(mac_input)):
+                    m.update(mac_input[i:i+1])
+                self.assertEqualBin(m.genresult(), reference_mac)
+
+            # Incremental test with more than a full block of each thing
+            testIncremental(23, b'abcdefghijklmnopqrst',
+                            b'Lorem ipsum dolor sit amet')
+
+            # Incremental test with exactly a full block of each thing
+            testIncremental(16, b'abcdefghijklmnop',
+                            b'Lorem ipsum dolo')
+
+            # Incremental test with less than a full block of each thing
+            testIncremental(7, b'abcdefghij',
+                            b'Lorem ipsum')
+
+        for aes_impl in get_aes_impls():
+            for gcm_impl in get_aesgcm_impls():
+                with self.subTest(aes_impl=aes_impl, gcm_impl=gcm_impl):
+                    test_one(aes_impl, gcm_impl)
+
+    def testAESGCMIV(self):
+        key = b'SomeRandomKeyVal'
+
+        def test(gcm, cbc, iv_fixed, iv_msg):
+            gcm.setiv(ssh_uint32(iv_fixed) + ssh_uint64(iv_msg) + b'fake')
+
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(1))
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(2))
+
+            gcm.next_message()
+            iv_msg = (iv_msg + 1) & ((1<<64)-1)
+
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(1))
+            cbc.setiv(b'\0' * 16)
+            preimage = cbc.decrypt(gcm.encrypt(b'\0' * 16))
+            self.assertEqualBin(preimage, ssh_uint32(iv_fixed) +
+                                ssh_uint64(iv_msg) + ssh_uint32(2))
+
+
+        for impl in get_aes_impls():
+            with self.subTest(aes_impl=impl):
+                gcm = ssh_cipher_new('aes{:d}_gcm_{}'.format(8*len(key), impl))
+                if gcm is None: continue # skip if HW AES not available
+                gcm.setkey(key)
+
+                cbc = ssh_cipher_new('aes{:d}_cbc_{}'.format(8*len(key), impl))
+                cbc.setkey(key)
+
+                # A simple test to ensure the low word gets
+                # incremented and that the whole IV looks basically
+                # the way we expect it to
+                test(gcm, cbc, 0x27182818, 0x3141592653589793)
+
+                # Test that carries are propagated into the high word
+                test(gcm, cbc, 0x27182818, 0x00000000FFFFFFFF)
+
+                # Test that carries _aren't_ propagated out of the
+                # high word of the message counter into the fixed word
+                # at the top
+                test(gcm, cbc, 0x27182818, 0xFFFFFFFFFFFFFFFF)
+
 class standard_test_vectors(MyTestBase):
     def testAES(self):
         def vector(cipher, key, plaintext, ciphertext):
@@ -3726,6 +3912,181 @@ class standard_test_vectors(MyTestBase):
                 b'opaque="HRPCssKJSGjCrkzDg8OhwpzCiGPChXYjwrI2QmXDnsOS", '
                 b'userhash=true')

+    def testAESGCM(self):
+        def test(key, iv, plaintext, aad, ciphertext, mac):
+            c = ssh_cipher_new('aes{:d}_gcm'.format(8*len(key)))
+            m = ssh2_mac_new('aesgcm_{}'.format(impl), c)
+            if m is None: return # skip test if HW GCM not available
+            c.setkey(key)
+            c.setiv(iv + b'\0'*4)
+            m.setkey(b'')
+            aesgcm_set_prefix_lengths(m, 0, len(aad))
+
+            # Some test cases have plaintext/ciphertext that is not a
+            # multiple of the cipher block size. Our MAC
+            # implementation supports this, but the cipher
+            # implementation expects block-granular input.
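+            # (For example, a 60-byte plaintext gives
+            # padlen = -60 & 15 = 4, rounding up to the 64-byte
+            # boundary; block-aligned lengths give padlen = 0.)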
+            padlen = 15 & -len(plaintext)
+            ciphertext_got = c.encrypt(plaintext + b'0' * padlen)[
+                :len(plaintext)]
+
+            m.start()
+            m.update(aad + ciphertext)
+            mac_got = m.genresult()
+
+            self.assertEqualBin(ciphertext_got, ciphertext)
+            self.assertEqualBin(mac_got, mac)
+
+            c.setiv(iv + b'\0'*4)
+            plaintext_got = c.decrypt(ciphertext + b'0' * padlen)[
+                :len(plaintext)]
+            self.assertEqualBin(plaintext_got, plaintext)
+
+        for impl in get_aesgcm_impls():
+            # 'The Galois/Counter Mode of Operation', McGrew and
+            # Viega, Appendix B. All the tests except the ones whose
+            # IV is the wrong length, because handling that requires
+            # an extra evaluation of the polynomial hash, which is
+            # never used in an SSH context, so I didn't implement it
+            # just for the sake of test vectors.
+
+            # Test Case 1
+            test(unhex('00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex(''), unhex(''), unhex(''),
+                 unhex('58e2fccefa7e3061367f1d57a4e7455a'))
+
+            # Test Case 2
+            test(unhex('00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex('00000000000000000000000000000000'),
+                 unhex(''),
+                 unhex('0388dace60b6a392f328c2b971b2fe78'),
+                 unhex('ab6e47d42cec13bdf53a67b21257bddf'))
+
+            # Test Case 3
+            test(unhex('feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b391aafd255'),
+                 unhex(''),
+                 unhex('42831ec2217774244b7221b784d0d49c'
+                       'e3aa212f2c02a4e035c17e2329aca12e'
+                       '21d514b25466931c7d8f6a5aac84aa05'
+                       '1ba30b396a0aac973d58e091473f5985'),
+                 unhex('4d5c2af327cd64a62cf35abd2ba6fab4'))
+
+            # Test Case 4
+            test(unhex('feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b39'),
+                 unhex('feedfacedeadbeeffeedfacedeadbeef'
+                       'abaddad2'),
+                 unhex('42831ec2217774244b7221b784d0d49c'
+                       'e3aa212f2c02a4e035c17e2329aca12e'
+                       '21d514b25466931c7d8f6a5aac84aa05'
+                       '1ba30b396a0aac973d58e091'),
+                 unhex('5bc94fbc3221a5db94fae95ae7121a47'))
+
+            # Test Case 7
+            test(unhex('00000000000000000000000000000000'
+                       '0000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex(''), unhex(''), unhex(''),
+                 unhex('cd33b28ac773f74ba00ed1f312572435'))
+
+            # Test Case 8
+            test(unhex('00000000000000000000000000000000'
+                       '0000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex('00000000000000000000000000000000'),
+                 unhex(''),
+                 unhex('98e7247c07f0fe411c267e4384b0f600'),
+                 unhex('2ff58d80033927ab8ef4d4587514f0fb'))
+
+            # Test Case 9
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b391aafd255'),
+                 unhex(''),
+                 unhex('3980ca0b3c00e841eb06fac4872a2757'
+                       '859e1ceaa6efd984628593b40ca1e19c'
+                       '7d773d00c144c525ac619d18c84a3f47'
+                       '18e2448b2fe324d9ccda2710acade256'),
+                 unhex('9924a7c8587336bfb118024db8674a14'))
+
+            # Test Case 10
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b39'),
+                 unhex('feedfacedeadbeeffeedfacedeadbeef'
+                       'abaddad2'),
+                 unhex('3980ca0b3c00e841eb06fac4872a2757'
+                       '859e1ceaa6efd984628593b40ca1e19c'
+                       '7d773d00c144c525ac619d18c84a3f47'
+                       '18e2448b2fe324d9ccda2710'),
+                 unhex('2519498e80f1478f37ba55bd6d27618c'))
+
+            # Test Case 13
+            test(unhex('00000000000000000000000000000000'
+                       '00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex(''), unhex(''), unhex(''),
+                 unhex('530f8afbc74536b9a963b4f1c4cb738b'))
+
+            # Test Case 14
+            test(unhex('00000000000000000000000000000000'
+                       '00000000000000000000000000000000'),
+                 unhex('000000000000000000000000'),
+                 unhex('00000000000000000000000000000000'),
+                 unhex(''),
+                 unhex('cea7403d4d606b6e074ec5d3baf39d18'),
+                 unhex('d0d1c8a799996bf0265b98b5d48ab919'))
+
+            # Test Case 15
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b391aafd255'),
+                 unhex(''),
+                 unhex('522dc1f099567d07f47f37a32a84427d'
+                       '643a8cdcbfe5c0c97598a2bd2555d1aa'
+                       '8cb08e48590dbb3da7b08b1056828838'
+                       'c5f61e6393ba7a0abcc9f662898015ad'),
+                 unhex('b094dac5d93471bdec1a502270e3cc6c'))
+
+            # Test Case 16
+            test(unhex('feffe9928665731c6d6a8f9467308308'
+                       'feffe9928665731c6d6a8f9467308308'),
+                 unhex('cafebabefacedbaddecaf888'),
+                 unhex('d9313225f88406e5a55909c5aff5269a'
+                       '86a7a9531534f7da2e4c303d8a318a72'
+                       '1c3c0c95956809532fcf0e2449a6b525'
+                       'b16aedf5aa0de657ba637b39'),
+                 unhex('feedfacedeadbeeffeedfacedeadbeef'
+                       'abaddad2'),
+                 unhex('522dc1f099567d07f47f37a32a84427d'
+                       '643a8cdcbfe5c0c97598a2bd2555d1aa'
+                       '8cb08e48590dbb3da7b08b1056828838'
+                       'c5f61e6393ba7a0abcc9f662'),
+                 unhex('76fc6ece0f4e1768cddf8853bb2d551b'))
+
 if __name__ == "__main__":
     # Run the tests, suppressing automatic sys.exit and collecting the
     # unittest.TestProgram instance returned by unittest.main instead.
diff --git a/test/list-accel.py b/test/list-accel.py
index af93d420..ac92d376 100755
--- a/test/list-accel.py
+++ b/test/list-accel.py
@@ -25,10 +25,14 @@ def list_implementations(alg, checkfn):
 def list_cipher_implementations(alg):
     list_implementations(alg, lambda impl: ssh_cipher_new(impl) is not None)

+def list_mac_implementations(alg):
+    list_implementations(alg, lambda impl: ssh2_mac_new(impl, None) is not None)
+
 def list_hash_implementations(alg):
     list_implementations(alg, lambda impl: ssh_hash_new(impl) is not None)

 list_cipher_implementations("aes256_cbc")
+list_mac_implementations("aesgcm")
 list_hash_implementations("sha1")
 list_hash_implementations("sha256")
 list_hash_implementations("sha512")
diff --git a/test/testcrypt-enum.h b/test/testcrypt-enum.h
index ac73a766..90458fc0 100644
--- a/test/testcrypt-enum.h
+++ b/test/testcrypt-enum.h
@@ -36,6 +36,15 @@ BEGIN_ENUM_TYPE(macalg)
     ENUM_VALUE("hmac_sha1_96_buggy", &ssh_hmac_sha1_96_buggy)
     ENUM_VALUE("hmac_sha256", &ssh_hmac_sha256)
     ENUM_VALUE("poly1305", &ssh2_poly1305)
+    ENUM_VALUE("aesgcm", &ssh2_aesgcm_mac)
+    ENUM_VALUE("aesgcm_sw", &ssh2_aesgcm_mac_sw)
+    ENUM_VALUE("aesgcm_ref_poly", &ssh2_aesgcm_mac_ref_poly)
+#if HAVE_CLMUL
+    ENUM_VALUE("aesgcm_clmul", &ssh2_aesgcm_mac_clmul)
+#endif
+#if HAVE_NEON_PMULL
+    ENUM_VALUE("aesgcm_neon", &ssh2_aesgcm_mac_neon)
+#endif
 END_ENUM_TYPE(macalg)

 BEGIN_ENUM_TYPE(keyalg)
@@ -60,31 +70,43 @@ BEGIN_ENUM_TYPE(cipheralg)
     ENUM_VALUE("3des_ssh1", &ssh_3des_ssh1)
     ENUM_VALUE("des_cbc", &ssh_des)
     ENUM_VALUE("aes256_ctr", &ssh_aes256_sdctr)
+    ENUM_VALUE("aes256_gcm", &ssh_aes256_gcm)
     ENUM_VALUE("aes256_cbc", &ssh_aes256_cbc)
     ENUM_VALUE("aes192_ctr", &ssh_aes192_sdctr)
+    ENUM_VALUE("aes192_gcm", &ssh_aes192_gcm)
ENUM_VALUE("aes192_cbc", &ssh_aes192_cbc) ENUM_VALUE("aes128_ctr", &ssh_aes128_sdctr) + ENUM_VALUE("aes128_gcm", &ssh_aes128_gcm) ENUM_VALUE("aes128_cbc", &ssh_aes128_cbc) ENUM_VALUE("aes256_ctr_sw", &ssh_aes256_sdctr_sw) + ENUM_VALUE("aes256_gcm_sw", &ssh_aes256_gcm_sw) ENUM_VALUE("aes256_cbc_sw", &ssh_aes256_cbc_sw) ENUM_VALUE("aes192_ctr_sw", &ssh_aes192_sdctr_sw) + ENUM_VALUE("aes192_gcm_sw", &ssh_aes192_gcm_sw) ENUM_VALUE("aes192_cbc_sw", &ssh_aes192_cbc_sw) ENUM_VALUE("aes128_ctr_sw", &ssh_aes128_sdctr_sw) + ENUM_VALUE("aes128_gcm_sw", &ssh_aes128_gcm_sw) ENUM_VALUE("aes128_cbc_sw", &ssh_aes128_cbc_sw) #if HAVE_AES_NI ENUM_VALUE("aes256_ctr_ni", &ssh_aes256_sdctr_ni) + ENUM_VALUE("aes256_gcm_ni", &ssh_aes256_gcm_ni) ENUM_VALUE("aes256_cbc_ni", &ssh_aes256_cbc_ni) ENUM_VALUE("aes192_ctr_ni", &ssh_aes192_sdctr_ni) + ENUM_VALUE("aes192_gcm_ni", &ssh_aes192_gcm_ni) ENUM_VALUE("aes192_cbc_ni", &ssh_aes192_cbc_ni) ENUM_VALUE("aes128_ctr_ni", &ssh_aes128_sdctr_ni) + ENUM_VALUE("aes128_gcm_ni", &ssh_aes128_gcm_ni) ENUM_VALUE("aes128_cbc_ni", &ssh_aes128_cbc_ni) #endif #if HAVE_NEON_CRYPTO ENUM_VALUE("aes256_ctr_neon", &ssh_aes256_sdctr_neon) + ENUM_VALUE("aes256_gcm_neon", &ssh_aes256_gcm_neon) ENUM_VALUE("aes256_cbc_neon", &ssh_aes256_cbc_neon) ENUM_VALUE("aes192_ctr_neon", &ssh_aes192_sdctr_neon) + ENUM_VALUE("aes192_gcm_neon", &ssh_aes192_gcm_neon) ENUM_VALUE("aes192_cbc_neon", &ssh_aes192_cbc_neon) ENUM_VALUE("aes128_ctr_neon", &ssh_aes128_sdctr_neon) + ENUM_VALUE("aes128_gcm_neon", &ssh_aes128_gcm_neon) ENUM_VALUE("aes128_cbc_neon", &ssh_aes128_cbc_neon) #endif ENUM_VALUE("blowfish_ctr", &ssh_blowfish_ssh2_ctr) diff --git a/test/testcrypt-func.h b/test/testcrypt-func.h index f79c966e..bd007293 100644 --- a/test/testcrypt-func.h +++ b/test/testcrypt-func.h @@ -277,6 +277,9 @@ FUNC(void, ssh2_mac_next_message, ARG(val_mac, m)) FUNC_WRAPPED(val_string, ssh2_mac_genresult, ARG(val_mac, m)) FUNC(val_string_asciz_const, ssh2_mac_text_name, ARG(val_mac, m)) +FUNC(void, aesgcm_set_prefix_lengths, + ARG(val_mac, m), ARG(uint, skip), ARG(uint, aad)) + /* * The ssh_key abstraction. 
All the uses of BinarySink and * BinarySource in parameters are replaced with ordinary strings for diff --git a/test/testcrypt.c b/test/testcrypt.c index de09af33..3755ae72 100644 --- a/test/testcrypt.c +++ b/test/testcrypt.c @@ -1329,7 +1329,16 @@ strbuf *get_implementations_commasep(ptrlen alg) strbuf *out = strbuf_new(); put_datapl(out, alg); - if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) { + if (ptrlen_startswith(alg, PTRLEN_LITERAL("aesgcm"), NULL)) { + put_fmt(out, ",%.*s_sw", PTRLEN_PRINTF(alg)); + put_fmt(out, ",%.*s_ref_poly", PTRLEN_PRINTF(alg)); +#if HAVE_CLMUL + put_fmt(out, ",%.*s_clmul", PTRLEN_PRINTF(alg)); +#endif +#if HAVE_NEON_PMULL + put_fmt(out, ",%.*s_neon", PTRLEN_PRINTF(alg)); +#endif + } else if (ptrlen_startswith(alg, PTRLEN_LITERAL("aes"), NULL)) { put_fmt(out, ",%.*s_sw", PTRLEN_PRINTF(alg)); #if HAVE_AES_NI put_fmt(out, ",%.*s_ni", PTRLEN_PRINTF(alg)); diff --git a/test/testsc.c b/test/testsc.c index 6068dd86..0a643e97 100644 --- a/test/testsc.c +++ b/test/testsc.c @@ -259,6 +259,12 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) #define IF_SHA_NI(x) #endif +#if HAVE_CLMUL +#define IF_CLMUL(x) x +#else +#define IF_CLMUL(x) +#endif + #if HAVE_NEON_CRYPTO #define IF_NEON_CRYPTO(x) x #else @@ -271,6 +277,12 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) #define IF_NEON_SHA512(x) #endif +#if HAVE_NEON_PMULL +#define IF_NEON_PMULL(x) x +#else +#define IF_NEON_PMULL(x) +#endif + /* Ciphers that we expect to pass this test. Blowfish and Arcfour are * intentionally omitted, because we already know they don't. */ #define CIPHERS(X, Y) \ @@ -280,28 +292,40 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) X(Y, ssh_des) \ X(Y, ssh_des_sshcom_ssh2) \ X(Y, ssh_aes256_sdctr) \ + X(Y, ssh_aes256_gcm) \ X(Y, ssh_aes256_cbc) \ X(Y, ssh_aes192_sdctr) \ + X(Y, ssh_aes192_gcm) \ X(Y, ssh_aes192_cbc) \ X(Y, ssh_aes128_sdctr) \ + X(Y, ssh_aes128_gcm) \ X(Y, ssh_aes128_cbc) \ X(Y, ssh_aes256_sdctr_sw) \ + X(Y, ssh_aes256_gcm_sw) \ X(Y, ssh_aes256_cbc_sw) \ X(Y, ssh_aes192_sdctr_sw) \ + X(Y, ssh_aes192_gcm_sw) \ X(Y, ssh_aes192_cbc_sw) \ X(Y, ssh_aes128_sdctr_sw) \ + X(Y, ssh_aes128_gcm_sw) \ X(Y, ssh_aes128_cbc_sw) \ IF_AES_NI(X(Y, ssh_aes256_sdctr_ni)) \ + IF_AES_NI(X(Y, ssh_aes256_gcm_ni)) \ IF_AES_NI(X(Y, ssh_aes256_cbc_ni)) \ IF_AES_NI(X(Y, ssh_aes192_sdctr_ni)) \ + IF_AES_NI(X(Y, ssh_aes192_gcm_ni)) \ IF_AES_NI(X(Y, ssh_aes192_cbc_ni)) \ IF_AES_NI(X(Y, ssh_aes128_sdctr_ni)) \ + IF_AES_NI(X(Y, ssh_aes128_gcm_ni)) \ IF_AES_NI(X(Y, ssh_aes128_cbc_ni)) \ IF_NEON_CRYPTO(X(Y, ssh_aes256_sdctr_neon)) \ + IF_NEON_CRYPTO(X(Y, ssh_aes256_gcm_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes256_cbc_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes192_sdctr_neon)) \ + IF_NEON_CRYPTO(X(Y, ssh_aes192_gcm_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes192_cbc_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes128_sdctr_neon)) \ + IF_NEON_CRYPTO(X(Y, ssh_aes128_gcm_neon)) \ IF_NEON_CRYPTO(X(Y, ssh_aes128_cbc_neon)) \ X(Y, ssh2_chacha20_poly1305) \ /* end of list */ @@ -317,9 +341,17 @@ VOLATILE_WRAPPED_DEFN(static, size_t, looplimit, (size_t x)) X(Y, ssh_hmac_sha256) \ /* end of list */ -#define ALL_MACS(X, Y) \ - SIMPLE_MACS(X, Y) \ - X(Y, poly1305) \ +#define ALL_MACS(X, Y) \ + SIMPLE_MACS(X, Y) \ + X(Y, poly1305) \ + X(Y, aesgcm_sw_sw) \ + X(Y, aesgcm_sw_refpoly) \ + IF_AES_NI(X(Y, aesgcm_ni_sw)) \ + IF_NEON_CRYPTO(X(Y, aesgcm_neon_sw)) \ + IF_CLMUL(X(Y, aesgcm_sw_clmul)) \ + IF_NEON_PMULL(X(Y, aesgcm_sw_neon)) \ + IF_AES_NI(IF_CLMUL(X(Y, aesgcm_ni_clmul))) \ + 
IF_NEON_CRYPTO(IF_NEON_PMULL(X(Y, aesgcm_neon_neon))) \ /* end of list */ #define MAC_TESTLIST(X, name) X(mac_ ## name) @@ -1473,6 +1505,58 @@ static void test_mac_poly1305(void) test_mac(&ssh2_poly1305, &ssh2_chacha20_poly1305); } +static void test_mac_aesgcm_sw_sw(void) +{ + test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_sw); +} + +static void test_mac_aesgcm_sw_refpoly(void) +{ + test_mac(&ssh2_aesgcm_mac_ref_poly, &ssh_aes128_gcm_sw); +} + +#if HAVE_AES_NI +static void test_mac_aesgcm_ni_sw(void) +{ + test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_ni); +} +#endif + +#if HAVE_NEON_CRYPTO +static void test_mac_aesgcm_neon_sw(void) +{ + test_mac(&ssh2_aesgcm_mac_sw, &ssh_aes128_gcm_neon); +} +#endif + +#if HAVE_CLMUL +static void test_mac_aesgcm_sw_clmul(void) +{ + test_mac(&ssh2_aesgcm_mac_clmul, &ssh_aes128_gcm_sw); +} +#endif + +#if HAVE_NEON_PMULL +static void test_mac_aesgcm_sw_neon(void) +{ + test_mac(&ssh2_aesgcm_mac_neon, &ssh_aes128_gcm_sw); +} +#endif + +#if HAVE_AES_NI && HAVE_CLMUL +static void test_mac_aesgcm_ni_clmul(void) +{ + test_mac(&ssh2_aesgcm_mac_clmul, &ssh_aes128_gcm_ni); +} +#endif + +#if HAVE_NEON_CRYPTO && HAVE_NEON_PMULL +static void test_mac_aesgcm_neon_neon(void) +{ + test_mac(&ssh2_aesgcm_mac_neon, &ssh_aes128_gcm_neon); +} +#endif + static void test_hash(const ssh_hashalg *halg) { ssh_hash *h = ssh_hash_new(halg); diff --git a/unix/utils/arm_arch_queries.c b/unix/utils/arm_arch_queries.c index d6dc97bc..c3dc286b 100644 --- a/unix/utils/arm_arch_queries.c +++ b/unix/utils/arm_arch_queries.c @@ -27,6 +27,21 @@ bool platform_aes_neon_available(void) #endif } +bool platform_pmull_neon_available(void) +{ +#if defined HWCAP_PMULL + return getauxval(AT_HWCAP) & HWCAP_PMULL; +#elif defined HWCAP2_PMULL + return getauxval(AT_HWCAP2) & HWCAP2_PMULL; +#elif defined __APPLE__ + SysctlResult res = test_sysctl_flag("hw.optional.arm.FEAT_PMULL"); + /* As above, treat 'missing' as enabled */ + return res != SYSCTL_OFF; +#else + return false; +#endif +} + bool platform_sha256_neon_available(void) { #if defined HWCAP_SHA2 diff --git a/windows/utils/arm_arch_queries.c b/windows/utils/arm_arch_queries.c index 439a59fb..b683ac15 100644 --- a/windows/utils/arm_arch_queries.c +++ b/windows/utils/arm_arch_queries.c @@ -20,6 +20,11 @@ bool platform_aes_neon_available(void) return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); } +bool platform_pmull_neon_available(void) +{ + return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); +} + bool platform_sha256_neon_available(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
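A closing note on the IV discipline that testAESGCMIV pins down: the 16-byte GCM IV is treated as a 4-byte fixed field, an 8-byte per-message counter, and a 4-byte block counter that restarts at 1 for each message. The sketch below is a standalone model of that behaviour, with hypothetical struct and function names chosen for illustration; it is not PuTTY's internal representation:

    #include <stdint.h>

    /* Model of the SSH AES-GCM IV layout exercised by testAESGCMIV:
     * fixed(32 bits) || message counter(64) || block counter(32). */
    typedef struct {
        uint32_t fixed; /* constant for the whole session */
        uint64_t msg;   /* incremented once per SSH message */
        uint32_t blk;   /* incremented per 16-byte block, starts at 1 */
    } gcm_iv_model;

    static void next_message(gcm_iv_model *iv)
    {
        iv->msg++;   /* wraps mod 2^64: carries stay inside the message
                      * counter and never reach the fixed word */
        iv->blk = 1; /* block counter restarts for the new message */
    }

That matches the three assertions in the test: the low word of the message counter increments, carries propagate within the 64-bit counter, and nothing overflows into the fixed word at the top.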