From 2222cd104dc5bd424fe025b98c133c91195cf9f3 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 12 Oct 2022 12:54:36 +0100 Subject: [PATCH] AES-GCM NEON: cope with missing vaddq_p128. In some compilers (I'm told clang 10, in particular), the NEON intrinsic vaddq_p128 is missing, even though its input type poly128_t is provided. vaddq_p128 is just an XOR of two vector registers, so that's easy to work around by casting to a more mundane type and back. Added a configure-time test for that intrinsic, and a workaround to be used in its absence. --- cmake/cmake.h.in | 1 + crypto/CMakeLists.txt | 8 ++++++++ crypto/aesgcm-neon.c | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/cmake/cmake.h.in b/cmake/cmake.h.in index 91d52d78..5ad32515 100644 --- a/cmake/cmake.h.in +++ b/cmake/cmake.h.in @@ -54,6 +54,7 @@ #cmakedefine01 HAVE_CLMUL #cmakedefine01 HAVE_NEON_CRYPTO #cmakedefine01 HAVE_NEON_PMULL +#cmakedefine01 HAVE_NEON_VADDQ_P128 #cmakedefine01 HAVE_NEON_SHA512 #cmakedefine01 HAVE_NEON_SHA512_INTRINSICS #cmakedefine01 USE_ARM64_NEON_H diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt index ff04efb5..4b0aa907 100644 --- a/crypto/CMakeLists.txt +++ b/crypto/CMakeLists.txt @@ -195,6 +195,14 @@ if(neon) int main(void) { r = vmull_p64(a, b); r = vmull_high_p64(u, v); }" ADD_SOURCES_IF_SUCCESSFUL aesgcm-neon.c) + test_compile_with_flags(HAVE_NEON_VADDQ_P128 + GNU_FLAGS -march=armv8-a+crypto + MSVC_FLAGS -D_ARM_USE_NEW_NEON_INTRINSICS + TEST_SOURCE " + #include <${neon_header}> + volatile poly128_t r; + int main(void) { r = vaddq_p128(r, r); }") + # The 'sha3' architecture extension, despite the name, includes # support for SHA-512 (from the SHA-2 standard) as well as SHA-3 # proper. 
diff --git a/crypto/aesgcm-neon.c b/crypto/aesgcm-neon.c
index dd7b83cc..64bc8349 100644
--- a/crypto/aesgcm-neon.c
+++ b/crypto/aesgcm-neon.c
@@ -87,6 +87,14 @@ static inline void store_p128_be(void *p, poly128_t v)
     vst1q_u8(p, vrev64q_u8(vreinterpretq_u8_p128(swapped)));
 }
 
+#if !HAVE_NEON_VADDQ_P128 /* intrinsic not provided: define an equivalent */
+static inline poly128_t vaddq_p128(poly128_t a, poly128_t b)
+{   /* XOR the operands as u32 vectors, then reinterpret as poly128_t */
+    return vreinterpretq_p128_u32(veorq_u32(
+        vreinterpretq_u32_p128(a), vreinterpretq_u32_p128(b)));
+}
+#endif /* !HAVE_NEON_VADDQ_P128 */
+
 /*
  * Key setup is just like in aesgcm-ref-poly.c. There's no point using
  * vector registers to accelerate this, because it happens rarely.