Fix ARM CE byte ordering, expand C/C++ API, and harden build

ARM backends: fix round key byte-swap on little-endian (vrev32q_u8),
rewrite decrypt to pre-process middle keys with InvMixColumns, fix
GHASH PMULL reflect and reduction ordering.

API: add nonce/IV-generating convenience overloads for CTR, CBC, and
GCM (library generates and prepends nonce, appends tag). Add C API
for IV/nonce generation. Rename error codes (TINYAES_OK, Result::Ok,
Result::AuthenticationFailed, etc.).

Build: add MinGW GCC AVX-512 debug alignment fix, harden bench/fuzz
CMake targets (warnings-as-errors, linker hardening), align with
tinysha CMake conventions. Add README.

Tests: expand coverage for nonce-generating API overloads, add NIST
GCM test vectors, improve fuzz target differential testing.
This commit is contained in:
Brandon Lehmann
2026-02-24 21:57:00 -05:00
parent cc49624c7a
commit b4df5d078a
30 changed files with 1646 additions and 277 deletions

View File

@@ -13,8 +13,20 @@ set_target_properties(tinyaes_benchmarks PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
)
# Warning flags for benchmarks
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
target_compile_options(tinyaes_benchmarks PRIVATE -Wall -Wextra -Wpedantic)
target_compile_options(tinyaes_benchmarks PRIVATE -Wall -Wextra -Wpedantic -Werror)
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set_property(TARGET tinyaes_benchmarks APPEND_STRING PROPERTY LINK_FLAGS
" -Wl,-z,relro,-z,now -Wl,-z,noexecstack")
endif()
# macOS: -bind_at_load is deprecated on modern macOS (eager binding is the default)
if(MINGW)
set_property(TARGET tinyaes_benchmarks APPEND_STRING PROPERTY LINK_FLAGS
" -Wl,--nxcompat -Wl,--dynamicbase -Wl,--high-entropy-va")
endif()
elseif(MSVC)
target_compile_options(tinyaes_benchmarks PRIVATE /W4)
target_compile_options(tinyaes_benchmarks PRIVATE /W4 /WX)
set_property(TARGET tinyaes_benchmarks APPEND_STRING PROPERTY LINK_FLAGS
" /DYNAMICBASE /NXCOMPAT /HIGHENTROPYVA")
endif()

View File

@@ -2,84 +2,307 @@
// BSD 3-Clause License (see LICENSE)
#include "tinyaes.h"
#include "internal/aes_impl.h"
#include "internal/ghash.h"
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
static constexpr size_t BENCH_SIZE = 1024 * 1024; // 1 MiB
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#define HAS_RDTSC 1
#if defined(_MSC_VER)
#include <intrin.h>
static inline uint64_t rdtsc()
{
return __rdtsc();
}
#else
static inline uint64_t rdtsc()
{
uint32_t lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return (static_cast<uint64_t>(hi) << 32) | lo;
}
#endif
#else
#define HAS_RDTSC 0
#endif
static constexpr int ITERATIONS = 100;
template<typename Fn> static double bench(const char *name, Fn &&fn)
struct BenchResult
{
double mib_per_sec;
double cycles_per_byte; // 0 if rdtsc not available
};
template<typename Fn> static BenchResult bench(const char *name, size_t data_size, Fn &&fn)
{
// Warmup
fn();
#if HAS_RDTSC
uint64_t tsc_start = rdtsc();
#endif
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ITERATIONS; ++i)
{
fn();
}
auto end = std::chrono::high_resolution_clock::now();
#if HAS_RDTSC
uint64_t tsc_end = rdtsc();
#endif
double ms = std::chrono::duration<double, std::milli>(end - start).count();
double total_bytes = static_cast<double>(BENCH_SIZE) * ITERATIONS;
double total_bytes = static_cast<double>(data_size) * ITERATIONS;
double mib_per_sec = (total_bytes / (1024.0 * 1024.0)) / (ms / 1000.0);
std::printf("%-30s %8.2f MiB/s (%6.2f ms total)\n", name, mib_per_sec, ms);
return mib_per_sec;
double cpb = 0.0;
#if HAS_RDTSC
if (total_bytes > 0)
cpb = static_cast<double>(tsc_end - tsc_start) / total_bytes;
#endif
if (cpb > 0)
std::printf(" %-40s %8.2f MiB/s %6.2f c/B (%6.2f ms)\n", name, mib_per_sec, cpb, ms);
else
std::printf(" %-40s %8.2f MiB/s (%6.2f ms)\n", name, mib_per_sec, ms);
return {mib_per_sec, cpb};
}
// Bench a single AES encrypt_block function at low level
template<typename EncFn>
static void bench_block_fn(const char *name, tinyaes::internal::key_expand_fn key_expand, EncFn enc_fn,
const uint8_t *key, size_t key_len)
{
int rounds = tinyaes::internal::aes_rounds(key_len);
uint32_t rk[tinyaes::internal::AES_MAX_RK_WORDS];
key_expand(key, key_len, rk);
uint8_t block[16] = {0};
bench(
name, 16,
[&]()
{
for (int i = 0; i < 1000; ++i)
enc_fn(rk, rounds, block, block);
});
}
static void run_mode_benchmarks()
{
static const size_t sizes[] = {16, 256, 1024, 4096, 16384, 65536, 262144, 1048576};
static const size_t key_lens[] = {16, 24, 32};
static const char *key_names[] = {"128", "192", "256"};
for (size_t ki = 0; ki < 3; ++ki)
{
size_t kl = key_lens[ki];
std::vector<uint8_t> key(kl, 0x42);
std::vector<uint8_t> iv_16(16, 0x01);
std::vector<uint8_t> iv_12(12, 0x01);
std::vector<uint8_t> aad = {0xAA, 0xBB, 0xCC, 0xDD};
std::printf("\n=== AES-%s ===\n", key_names[ki]);
for (size_t sz : sizes)
{
// ECB needs block-aligned
size_t ecb_sz = (sz / 16) * 16;
if (ecb_sz == 0)
ecb_sz = 16;
std::vector<uint8_t> pt_ecb(ecb_sz, 0x55);
std::vector<uint8_t> pt(sz, 0x55);
char label[128];
std::printf("\n--- %zu bytes ---\n", sz);
std::snprintf(label, sizeof(label), "ECB-%s encrypt", key_names[ki]);
bench(
label, ecb_sz,
[&]()
{
std::vector<uint8_t> ct;
tinyaes::ecb_encrypt(key, pt_ecb, ct);
});
std::snprintf(label, sizeof(label), "CBC-%s encrypt", key_names[ki]);
bench(
label, ecb_sz,
[&]()
{
std::vector<uint8_t> ct;
tinyaes::cbc_encrypt(key, iv_16, pt_ecb, ct);
});
std::snprintf(label, sizeof(label), "CTR-%s encrypt", key_names[ki]);
bench(
label, sz,
[&]()
{
std::vector<uint8_t> ct;
tinyaes::ctr_crypt(key, iv_16, pt, ct);
});
std::snprintf(label, sizeof(label), "GCM-%s encrypt", key_names[ki]);
bench(
label, sz,
[&]()
{
std::vector<uint8_t> ct, tag;
tinyaes::gcm_encrypt(key, iv_12, aad, pt, ct, tag);
});
std::snprintf(label, sizeof(label), "GCM-%s decrypt", key_names[ki]);
// Pre-encrypt for decrypt bench
std::vector<uint8_t> ct_pre, tag_pre;
tinyaes::gcm_encrypt(key, iv_12, aad, pt, ct_pre, tag_pre);
bench(
label, sz,
[&]()
{
std::vector<uint8_t> dec;
tinyaes::gcm_decrypt(key, iv_12, aad, ct_pre, tag_pre, dec);
});
}
}
}
static void run_gcm_aad_benchmarks()
{
std::printf("\n=== GCM AAD Variation (AES-128, 4096B plaintext) ===\n");
std::vector<uint8_t> key(16, 0x42);
std::vector<uint8_t> iv(12, 0x01);
std::vector<uint8_t> pt(4096, 0x55);
static const size_t aad_sizes[] = {0, 16, 64, 256, 1024, 4096};
for (size_t aad_sz : aad_sizes)
{
std::vector<uint8_t> aad(aad_sz, 0xAA);
char label[128];
std::snprintf(label, sizeof(label), "GCM-128 AAD=%zuB", aad_sz);
bench(
label, 4096 + aad_sz,
[&]()
{
std::vector<uint8_t> ct, tag;
tinyaes::gcm_encrypt(key, iv, aad, pt, ct, tag);
});
}
}
static void run_key_expansion_benchmarks()
{
std::printf("\n=== Key Expansion ===\n");
static const size_t key_lens[] = {16, 24, 32};
static const char *key_names[] = {"128", "192", "256"};
auto key_expand = tinyaes::internal::get_key_expand();
for (size_t ki = 0; ki < 3; ++ki)
{
std::vector<uint8_t> key(key_lens[ki], 0x42);
uint32_t rk[tinyaes::internal::AES_MAX_RK_WORDS];
char label[128];
std::snprintf(label, sizeof(label), "Key expand AES-%s (dispatched)", key_names[ki]);
bench(
label, key_lens[ki],
[&]()
{
for (int i = 0; i < 1000; ++i)
key_expand(key.data(), key.size(), rk);
});
std::snprintf(label, sizeof(label), "Key expand AES-%s (portable)", key_names[ki]);
bench(
label, key_lens[ki],
[&]()
{
for (int i = 0; i < 1000; ++i)
tinyaes::internal::aes_key_expand_portable(key.data(), key.size(), rk);
});
}
}
static void run_ghash_benchmarks()
{
std::printf("\n=== GHASH ===\n");
// Compute H from a fixed key
uint8_t key_raw[16] = {0x42};
uint32_t rk[tinyaes::internal::AES_MAX_RK_WORDS];
tinyaes::internal::aes_key_expand_portable(key_raw, 16, rk);
uint8_t H[16] = {0};
uint8_t zero[16] = {0};
tinyaes::internal::aes_encrypt_block_portable(rk, 10, zero, H);
auto ghash_dispatch = tinyaes::internal::get_ghash();
static const size_t sizes[] = {16, 256, 1024, 4096, 65536};
for (size_t sz : sizes)
{
std::vector<uint8_t> data(sz, 0x55);
char label[128];
std::snprintf(label, sizeof(label), "GHASH %zuB (dispatched)", sz);
bench(
label, sz,
[&]()
{
uint8_t Y[16] = {0};
ghash_dispatch(H, data.data(), data.size(), Y);
});
std::snprintf(label, sizeof(label), "GHASH %zuB (portable)", sz);
bench(
label, sz,
[&]()
{
uint8_t Y[16] = {0};
tinyaes::internal::ghash_portable(H, data.data(), data.size(), Y);
});
}
}
static void run_backend_comparison()
{
std::printf("\n=== Backend Comparison (single block encrypt x1000) ===\n");
uint8_t key128[16] = {0x42};
uint8_t key256[32] = {0x42};
// Portable
bench_block_fn("AES-128 portable", tinyaes::internal::aes_key_expand_portable,
tinyaes::internal::aes_encrypt_block_portable, key128, 16);
bench_block_fn("AES-256 portable", tinyaes::internal::aes_key_expand_portable,
tinyaes::internal::aes_encrypt_block_portable, key256, 32);
// Dispatched (may be AES-NI, ARM CE, or portable)
bench_block_fn("AES-128 dispatched", tinyaes::internal::get_key_expand(), tinyaes::internal::get_encrypt_block(),
key128, 16);
bench_block_fn("AES-256 dispatched", tinyaes::internal::get_key_expand(), tinyaes::internal::get_encrypt_block(),
key256, 32);
}
int main()
{
std::vector<uint8_t> key_128(16, 0x42);
std::vector<uint8_t> key_256(32, 0x42);
std::vector<uint8_t> iv_16(16, 0x01);
std::vector<uint8_t> iv_12(12, 0x01);
std::vector<uint8_t> plaintext(BENCH_SIZE, 0x55);
std::vector<uint8_t> aad = {0xAA, 0xBB, 0xCC, 0xDD};
std::printf("TinyAES Benchmarks (%zu bytes x %d iterations)\n", BENCH_SIZE, ITERATIONS);
std::printf("TinyAES Benchmarks (%d iterations per measurement)\n", ITERATIONS);
std::printf("================================================================\n");
// ECB
bench("ECB-128 encrypt", [&]() {
std::vector<uint8_t> ct;
tinyaes::ecb_encrypt(key_128, plaintext, ct);
});
bench("ECB-256 encrypt", [&]() {
std::vector<uint8_t> ct;
tinyaes::ecb_encrypt(key_256, plaintext, ct);
});
// CBC
bench("CBC-128 encrypt", [&]() {
std::vector<uint8_t> ct;
tinyaes::cbc_encrypt(key_128, iv_16, plaintext, ct);
});
bench("CBC-256 encrypt", [&]() {
std::vector<uint8_t> ct;
tinyaes::cbc_encrypt(key_256, iv_16, plaintext, ct);
});
// CTR
bench("CTR-128 encrypt", [&]() {
std::vector<uint8_t> ct;
tinyaes::ctr_crypt(key_128, iv_16, plaintext, ct);
});
bench("CTR-256 encrypt", [&]() {
std::vector<uint8_t> ct;
tinyaes::ctr_crypt(key_256, iv_16, plaintext, ct);
});
// GCM
bench("GCM-128 encrypt", [&]() {
std::vector<uint8_t> ct, tag;
tinyaes::gcm_encrypt(key_128, iv_12, aad, plaintext, ct, tag);
});
bench("GCM-256 encrypt", [&]() {
std::vector<uint8_t> ct, tag;
tinyaes::gcm_encrypt(key_256, iv_12, aad, plaintext, ct, tag);
});
run_backend_comparison();
run_key_expansion_benchmarks();
run_ghash_benchmarks();
run_mode_benchmarks();
run_gcm_aad_benchmarks();
return 0;
}