Files
micro-AES/x86-improvements
2024-12-14 14:48:41 +03:30

323 lines
9.4 KiB
Plaintext

/** Since µAES code is optimized for 8-bit CPUs, it might be much less efficient
* on a 32-bit machine. We can apply a few tweaks, especially in the process of
* mixing columns, to boost the performance on such systems. Here it is assumed
* that the `unsigned` type is exactly 32 bits wide, i.e. equivalent to `uint32_t`.
* The endianness of the system must also be known or pre-determined, either by
* including <endian.h> or by evaluating an expression. For example, suppose the
* `SYSTEM_IS_BIG_ENDIAN` macro indicates whether the system is big-endian or
* not. Then the following piece of code can be used to replace lines #88 to
* #134 of the "micro_aes.c" source file —starting with `#if DISCARD_SUBROUTINES`.
* Next, the bodies of the `MixColumns` and `InvMixColumns` functions must be
* updated to incorporate these changes. */
/** Word-rotation helpers built on rotl(). A 32-bit word holds four consecutive
 * state bytes, so which bit-rotation moves "the next byte in memory" into each
 * byte lane depends on the byte order of the machine:
 *   RL8(x): rotate by one byte position in memory order (8 or 24 bits);
 *   R16(x): rotate by two byte positions (same on either byte order);
 *   RRR(x): XOR of the three non-trivial byte rotations of x.
 * Every macro is fully parenthesised so it can be embedded in any expression;
 * note that RRR evaluates its argument three times. */
#if CHECK_ENDIANNES_AT_RUNTIME
/* the first byte in memory is 16 on a little-endian machine, 0 on a big-endian one */
static const unsigned ShiftLE = 16;
#define RL8(x) rotl( (x), 8U + *(const char*) &ShiftLE )
#elif SYSTEM_IS_BIG_ENDIAN /* ! use appropriate standard macros */
#define RL8(x) rotl( (x), 8 )
#else
#define RL8(x) rotl( (x), 24 )
#endif
#define R16(x) rotl( (x), 16 )
#define RRR(x) (rotl( (x), 8 ) ^ rotl( (x), 16 ) ^ rotl( (x), 24 ))
/** Rotate a 32-bit `value` left by `shift` bits (taken modulo 32). The masked
 * form below is the pattern compilers commonly recognise and turn into a single
 * bit-rotate instruction (ror/rol), and unlike `value >> (32 - shift)` it never
 * shifts by the full word width, so a shift of 0 or 32 is well defined too. */
static unsigned rotl( unsigned value, unsigned shift )
{
    shift &= 31;
    return (value << shift) | (value >> (-shift & 31));
}
/** XOR the 128-bit block `src` into `dest`: XOR32BITS is applied at the byte
 * offsets 0, 4, 8 and 12 of both blocks (the variant meant for 32-bit CPUs). */
static void xorBlock( const block_t src, block_t dest )
{
    unsigned offset;

    for (offset = 0; offset < 16; offset += 4)
    {
        XOR32BITS( src[offset], dest[offset] );
    }
}
/** Double [i.e. xtime] each of the 4 bytes packed in `*x` in GF(2^8), handling
 * all four byte lanes with plain 32-bit arithmetic. The result is written back
 * through `x`. */
static void quad_xtime( unsigned* x )
{
    const unsigned word = *x;
    /* one flag bit per lane, set where that byte has its top bit set */
    const unsigned carries = (word >> 7) & 0x01010101U;
    /* shift every lane left, dropping the bit that leaked in from the lane below */
    const unsigned doubled = (word << 1) & ~0x01010101U;

    /* a lane whose top bit was set is reduced by XOR-ing 0x1b into it */
    *x = doubled ^ (carries * 0x1bU);
}
--------------------------------------------------------------------------------
MixColumns:
unsigned rt, i, *s = (unsigned*) &state[0];
for (i = Nb; i--; ++s)
{
rt = RRR( *s );
*s ^= RL8( *s );
quad_xtime( s );
*s ^= rt;
}
InvMixColumns:
unsigned rt, i, *s = (unsigned*) &state[0];
for (i = Nb; i--; ++s)
{
rt = RRR( *s );
quad_xtime( s );
rt ^= RL8( *s ) ^ *s;
quad_xtime( s );
rt ^= R16( *s ) ^ *s;
quad_xtime( s );
*s ^= RRR( *s ) ^ rt;
}
================================================================================
A sample of the compiled assembly output (x86-64, Intel syntax), as generated by gcc with the `-S -Os` flags:
# xorBlock: XOR the 16 bytes at [rdi] into the 16 bytes at [rsi], handled as
# two 64-bit halves.
#   In:    rdi = source block, rsi = destination block
#   Clobb: rax, flags
xorBlock:
# low half: [rsi] ^= [rdi]
mov rax, QWORD PTR [rdi]
xor QWORD PTR [rsi], rax
# high half: [rsi+8] ^= [rdi+8]
mov rax, QWORD PTR [rdi+8]
xor QWORD PTR [rsi+8], rax
ret
# quad_xtime: replace the 32-bit word w at [rdi] with
#   ((w << 1) & 0xFEFEFEFE) ^ (((w >> 7) & 0x01010101) * 0x1b)
#   In:    rdi = address of the word (left unchanged)
#   Clobb: eax, edx, flags
quad_xtime:
mov edx, DWORD PTR [rdi]
mov eax, edx
# edx = w << 1, done as w + w
add edx, edx
# eax = w >> 7, bringing the top bit of each byte down to that byte's bit 0
shr eax, 7
# -16843010 is 0xFEFEFEFE: clear bit 0 of every byte of the shifted word
and edx, -16843010
# 16843009 is 0x01010101: keep one flag bit per byte
and eax, 16843009
# each flagged byte becomes 27 (0x1b)
imul eax, eax, 27
xor eax, edx
mov DWORD PTR [rdi], eax
ret
# KeyExpansion: copy the 16 bytes at [rdi] into RoundKey[0..15], then fill
# RoundKey[16..175] four bytes per loop iteration.
#   rax = byte offset into RoundKey of the 4-byte word being produced
#   dl  = constant XORed into the first byte of every 16-byte group; starts at
#         1 and is doubled after each use
KeyExpansion:
movups xmm0, XMMWORD PTR [rdi]
mov eax, 16
mov dl, 1
movaps XMMWORD PTR RoundKey[rip], xmm0
.L7:
# offsets that are not a multiple of 16 take the plain word-XOR path at .L4
test al, 15
jne .L4
# start of a 16-byte group: seed it with a copy of the previous 16 bytes
movaps xmm1, XMMWORD PTR RoundKey[rax-16]
movups XMMWORD PTR RoundKey[rax], xmm1
# once doubling has wrapped dl around to 0, carry on with 27 (0x1b)
test dl, dl
jne .L5
mov dl, 27
.L5:
# RoundKey[rax] ^= sbox[RoundKey[rax-3]] ^ dl; dl is doubled for its next use
movzx ecx, BYTE PTR RoundKey[rax-3]
mov cl, BYTE PTR sbox[rcx]
xor cl, BYTE PTR RoundKey[rax]
xor ecx, edx
add edx, edx
mov BYTE PTR RoundKey[rax], cl
# RoundKey[rax+1] ^= sbox[RoundKey[rax-2]]
movzx ecx, BYTE PTR RoundKey[rax-2]
mov cl, BYTE PTR sbox[rcx]
xor BYTE PTR RoundKey[rax+1], cl
# RoundKey[rax+2] ^= sbox[RoundKey[rax-1]]
movzx ecx, BYTE PTR RoundKey[rax-1]
mov cl, BYTE PTR sbox[rcx]
xor BYTE PTR RoundKey[rax+2], cl
# RoundKey[rax+3] ^= sbox[RoundKey[rax-4]]
movzx ecx, BYTE PTR RoundKey[rax-4]
mov cl, BYTE PTR sbox[rcx]
xor BYTE PTR RoundKey[rax+3], cl
jmp .L6
.L4:
# remaining words of a group: XOR in the 4 bytes just before this word
mov ecx, DWORD PTR RoundKey[rax]
xor ecx, DWORD PTR RoundKey[rax-4]
mov DWORD PTR RoundKey[rax], ecx
.L6:
# advance to the next word; stop once all 176 bytes are produced
add rax, 4
cmp rax, 176
jne .L7
ret
# SubBytes: replace each of the 16 bytes at [rdi] with its sbox[] entry.
#   In:    rdi = block (left unchanged)
#   Clobb: rax, rdx, flags
SubBytes:
# rax = byte index, 0..15
xor eax, eax
.L12:
# [rdi+rax] = sbox[[rdi+rax]]
movzx edx, BYTE PTR [rdi+rax]
mov dl, BYTE PTR sbox[rdx]
mov BYTE PTR [rdi+rax], dl
inc rax
cmp rax, 16
jne .L12
ret
# ShiftRows: permute bytes of the 16-byte block at [rdi] in place. Net effect
# (new <- old), with the three groups interleaved by the compiler:
#   [1] <- [5],  [5] <- [9],   [9] <- [13],  [13] <- [1]
#   [2] <-> [10],              [6] <-> [14]
#   [3] <- [15], [15] <- [11], [11] <- [7],  [7] <- [3]
# Bytes 0, 4, 8 and 12 are not touched. al holds the byte saved from the slot
# that gets overwritten first in each group; dl carries the byte being moved.
#   In:    rdi = block (left unchanged)
#   Clobb: al, dl
ShiftRows:
mov dl, BYTE PTR [rdi+5]
mov al, BYTE PTR [rdi+1]
mov BYTE PTR [rdi+1], dl
mov dl, BYTE PTR [rdi+9]
mov BYTE PTR [rdi+5], dl
mov dl, BYTE PTR [rdi+13]
mov BYTE PTR [rdi+13], al
mov al, BYTE PTR [rdi+2]
mov BYTE PTR [rdi+9], dl
mov dl, BYTE PTR [rdi+10]
mov BYTE PTR [rdi+10], al
mov al, BYTE PTR [rdi+6]
mov BYTE PTR [rdi+2], dl
mov dl, BYTE PTR [rdi+14]
mov BYTE PTR [rdi+14], al
mov al, BYTE PTR [rdi+3]
mov BYTE PTR [rdi+6], dl
mov dl, BYTE PTR [rdi+15]
mov BYTE PTR [rdi+3], dl
mov dl, BYTE PTR [rdi+11]
mov BYTE PTR [rdi+15], dl
mov dl, BYTE PTR [rdi+7]
mov BYTE PTR [rdi+7], al
mov BYTE PTR [rdi+11], dl
ret
# MixColumns: transform the four 32-bit words at [rdi] in place. For each word
# a the result is
#   quad_xtime(a ^ ror(a,8)) ^ rol(a,8) ^ rol(a,16) ^ ror(a,8)
#   In:    rdi = block; advanced by 16 on return
#   Clobb: rax, rcx, rdx, rsi, r8, flags
# ecx and esi are kept live across the call to quad_xtime; the quad_xtime
# listing writes only eax and edx.
MixColumns:
# r8 = end of the block
lea r8, [rdi+16]
.L16:
mov eax, DWORD PTR [rdi]
mov esi, eax
mov ecx, eax
mov edx, eax
# esi = ror(a,8)
ror esi, 8
# edx = rol(a,16)
rol edx, 16
# eax = a ^ ror(a,8), stored back so quad_xtime can double it in place
xor eax, esi
rol ecx, 8
mov DWORD PTR [rdi], eax
# ecx = rol(a,8) ^ rol(a,16)
xor ecx, edx
call quad_xtime
# combine the doubled word with the three rotations and store the result
xor ecx, DWORD PTR [rdi]
add rdi, 4
xor ecx, esi
mov DWORD PTR [rdi-4], ecx
cmp rdi, r8
jne .L16
ret
# rijndaelEncrypt: transform the 16-byte block at [rdi] into [rsi] using the
# RoundKey table (bytes 0..175).
#   In:  rdi = input block, rsi = output block (the two may be the same)
#   r9   = output block, worked on in place
#   r10  = address of the 16-byte round key to apply next
# The helpers are called with knowledge of the registers their listings
# modify, so r9, r10, rsi and (across SubBytes/ShiftRows) rdi stay live.
rijndaelEncrypt:
mov r9, rsi
# the pushed value is never read back (popped into rax below); presumably it
# only keeps rsp 16-byte aligned at the call sites
push rdx
# copy input to output unless both are the same buffer
cmp rsi, rdi
je .L19
movups xmm0, XMMWORD PTR [rdi]
movups XMMWORD PTR [rsi], xmm0
.L19:
mov r10d, OFFSET FLAT:RoundKey
.L21:
# block ^= current round key; r10 then moves on to the next key
mov rdi, r10
mov rsi, r9
add r10, 16
call xorBlock
mov rdi, r9
call SubBytes
call ShiftRows
# once r10 has reached RoundKey+160, MixColumns is skipped and the loop ends
cmp r10, OFFSET FLAT:RoundKey+160
je .L20
call MixColumns
jmp .L21
.L20:
# final step: block ^= RoundKey[160..175], done as a tail call (rsi is still r9)
mov edi, OFFSET FLAT:RoundKey+160
pop rax
jmp xorBlock
# InvSubBytes: replace each of the 16 bytes at [rdi] with its rsbox[] entry.
#   In:    rdi = block (left unchanged)
#   Clobb: rax, rdx, flags
InvSubBytes:
# rax = byte index, 0..15
xor eax, eax
.L24:
# [rdi+rax] = rsbox[[rdi+rax]]
movzx edx, BYTE PTR [rdi+rax]
mov dl, BYTE PTR rsbox[rdx]
mov BYTE PTR [rdi+rax], dl
inc rax
cmp rax, 16
jne .L24
ret
# InvShiftRows: permute bytes of the 16-byte block at [rdi] in place, undoing
# the ShiftRows permutation. Net effect (new <- old), with the three groups
# interleaved by the compiler:
#   [1] <- [13], [5] <- [1],  [9] <- [5],    [13] <- [9]
#   [2] <-> [10],             [6] <-> [14]
#   [3] <- [7],  [7] <- [11], [11] <- [15],  [15] <- [3]
# Bytes 0, 4, 8 and 12 are not touched. al holds the byte saved from the slot
# that gets overwritten first in each group; dl carries the byte being moved.
#   In:    rdi = block (left unchanged)
#   Clobb: al, dl
InvShiftRows:
mov dl, BYTE PTR [rdi+9]
mov al, BYTE PTR [rdi+13]
mov BYTE PTR [rdi+13], dl
mov dl, BYTE PTR [rdi+5]
mov BYTE PTR [rdi+9], dl
mov dl, BYTE PTR [rdi+1]
mov BYTE PTR [rdi+1], al
mov al, BYTE PTR [rdi+2]
mov BYTE PTR [rdi+5], dl
mov dl, BYTE PTR [rdi+10]
mov BYTE PTR [rdi+10], al
mov al, BYTE PTR [rdi+6]
mov BYTE PTR [rdi+2], dl
mov dl, BYTE PTR [rdi+14]
mov BYTE PTR [rdi+14], al
mov al, BYTE PTR [rdi+3]
mov BYTE PTR [rdi+6], dl
mov dl, BYTE PTR [rdi+7]
mov BYTE PTR [rdi+3], dl
mov dl, BYTE PTR [rdi+11]
mov BYTE PTR [rdi+7], dl
mov dl, BYTE PTR [rdi+15]
mov BYTE PTR [rdi+15], al
mov BYTE PTR [rdi+11], dl
ret
# InvMixColumns: transform the four 32-bit words at [rdi] in place. Each word a
# is doubled three times in memory by quad_xtime, giving 2a, 4a and 8a, and the
# stored result is the XOR of
#   2a, 4a, 8a,
#   rol(a,8), rol(a,16), ror(a,8),
#   ror(2a,8), rol(4a,16),
#   rol(8a,8), rol(8a,16), ror(8a,8)
#   rcx  = address just past the word being processed
#   rsi  = end of the block
#   r8d  = a,  r10d = 2a,  r9d = 4a,  edx = 8a
#   Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, r10, flags
# rdi is set once per word and reused for all three calls; the quad_xtime
# listing writes only eax and edx.
InvMixColumns:
mov rcx, rdi
lea rsi, [rdi+16]
.L28:
mov rdi, rcx
mov r8d, DWORD PTR [rcx]
add rcx, 4
# [rcx-4] = 2a
call quad_xtime
mov r10d, DWORD PTR [rcx-4]
# [rcx-4] = 4a
call quad_xtime
mov r9d, DWORD PTR [rcx-4]
# [rcx-4] = 8a
call quad_xtime
mov edx, DWORD PTR [rcx-4]
# eax accumulates the XOR of all eleven terms; edi is scratch for rotations
mov eax, r10d
mov edi, r8d
rol edi, 8
xor eax, r9d
ror r10d, 8
xor eax, edx
rol r9d, 16
xor eax, edi
mov edi, r8d
ror r8d, 8
rol edi, 16
xor eax, edi
mov edi, edx
xor eax, r8d
rol edi, 8
xor eax, r10d
xor eax, r9d
xor eax, edi
mov edi, edx
ror edx, 8
rol edi, 16
xor eax, edi
xor eax, edx
# store the finished word and continue until rcx reaches the end of the block
mov DWORD PTR [rcx-4], eax
cmp rsi, rcx
jne .L28
ret
# rijndaelDecrypt: transform the 16-byte block at [rdi] into [rsi] using the
# RoundKey table, applying the round keys from the last one down to the first.
#   In:  rdi = input block, rsi = output block (the two may be the same)
#   r11  = output block, worked on in place
#   rbp  = address of the round key for the current pass (RoundKey+144 down to
#          RoundKey+0)
#   bl   = pass counter, 10 down to 1
rijndaelDecrypt:
# rbp and rbx are saved because they are used below and restored before ret
push rbp
mov r11, rsi
push rbx
# the pushed rdx is never read back (popped into rax at the end); presumably it
# only keeps rsp 16-byte aligned at the call sites
push rdx
# copy input to output unless both are the same buffer
cmp rsi, rdi
je .L31
movups xmm0, XMMWORD PTR [rdi]
movups XMMWORD PTR [rsi], xmm0
.L31:
mov ebp, OFFSET FLAT:RoundKey+144
mov bl, 10
.L34:
# the first pass (bl == 10) XORs in RoundKey[160..175]; every later pass
# starts with InvMixColumns instead
cmp bl, 10
je .L32
mov rdi, r11
call InvMixColumns
jmp .L33
.L32:
mov rsi, r11
mov edi, OFFSET FLAT:RoundKey+160
call xorBlock
.L33:
# rdi and rsi are reloaded here: InvMixColumns overwrites both
mov rdi, r11
mov rsi, r11
call InvShiftRows
call InvSubBytes
# block ^= round key at rbp; rbp then steps back to the previous key
mov rdi, rbp
sub rbp, 16
call xorBlock
dec bl
jne .L34
pop rax
pop rbx
pop rbp
ret