323 lines
9.4 KiB
Plaintext
323 lines
9.4 KiB
Plaintext
/** Since µAES code is optimized for 8-bit CPUs, it might be much less efficient
 * for a 32-bit machine. We can apply a few tweaks, especially in the process of
 * mixing columns, to boost the performance on such systems. It's safe to assume
 * that the `unsigned` keyword is equivalent to the `uint32_t` type. But the
 * endianness of the system must be known or pre-determined, either by including
 * <endian.h> or by evaluating an expression. For example, suppose that a macro
 * named `SYSTEM_IS_BIG_ENDIAN` indicates whether a system is big-endian or not.
 * Then the following piece of code can be used to replace lines #88 to #134 of
 * the "micro_aes.c" source file, starting with `#if DISCARD_SUBROUTINES`. Next,
 * the bodies of the `MixColumns` and `InvMixColumns` functions must be updated
 * to incorporate these changes. */
|
|
|
|
/** Rotation helpers for the 32-bit (Inv)MixColumns code.
 *
 * RL8 rotates a word so that, in memory order, each byte is replaced by the
 * byte that follows it: a left-rotate by 8 on big-endian machines, and by 24
 * on little-endian ones. R16 swaps the two half-words. RRR is the XOR of the
 * three non-trivial byte rotations of a word. */
#if CHECK_ENDIANNES_AT_RUNTIME
/* The first byte in memory of the value 16 reads as 16 on a little-endian and
 * as 0 on a big-endian system, so the rotation is by 24 or by 8 respectively */
static const unsigned ShiftLE = 16;
#define RL8(x)  rotl( (x), 8U + *(const char*) &ShiftLE )

#elif SYSTEM_IS_BIG_ENDIAN  /* ! use appropriate standard macros */
#define RL8(x)  rotl( (x), 8 )
#else
#define RL8(x)  rotl( (x), 24 )
#endif
#define R16(x)  rotl( (x), 16 )

/* Fully parenthesized, so that it can sit next to operators that bind tighter
 * than `^` (such as `&`). Note that the argument is evaluated three times. */
#define RRR(x)  (rotl( (x), 8 ) ^ rotl( (x), 16 ) ^ rotl( (x), 24 ))

/** This method must simply compile to a bit-rotate CPU instruction (ror/rol).
 *
 * Rotates the 32-bit `value` to the left by `shift` bits. The count is reduced
 * modulo 32 and the right-shift count is masked, so a zero (or multiple of 32)
 * shift returns `value` unchanged instead of shifting by the full word width,
 * which is undefined behavior in C. */
static unsigned rotl( unsigned value, unsigned shift )
{
    shift &= 31;
    return (value << shift) | (value >> (-shift & 31));
}
|
|
|
|
/** XOR two 128-bit numbers (blocks) src and dest: optimized for 32-bit CPUs. */
|
|
/** XOR the 128-bit block `src` into `dest`, one 32-bit word at a time.
 *
 * XOR32BITS is defined outside this snippet; presumably it XORs the 32 bits
 * that start at its first argument into those starting at its second one. */
static void xorBlock( const block_t src, block_t dest )
{
    /* byte offsets of the four consecutive words that make up a block */
    enum { WORD0 = 0, WORD1 = 4, WORD2 = 8, WORD3 = 12 };

    XOR32BITS( src[WORD0], dest[WORD0] );
    XOR32BITS( src[WORD1], dest[WORD1] );
    XOR32BITS( src[WORD2], dest[WORD2] );
    XOR32BITS( src[WORD3], dest[WORD3] );
}
|
|
|
|
/** all 4 bytes of an unsigned integer are doubled [i.e. xtime-ed] in GF(2^8) */
|
|
/** Double (xtime) each of the four bytes packed in *x, in GF(2^8), in place.
 *
 * Every byte is shifted left by one bit; a byte whose top bit was set is then
 * reduced by XOR-ing it with 0x1b. */
static void quad_xtime( unsigned* x )
{
    const unsigned word    = *x;
    /* one bit per byte, set where that byte had its most significant bit set */
    const unsigned carries = (word >> 7) & 0x01010101U;
    /* every byte doubled; bits that crossed a byte boundary are masked away  */
    const unsigned doubled = (word << 1) & 0xFEFEFEFEU;

    *x = doubled ^ (carries * 0x1bU);
}
|
|
|
|
--------------------------------------------------------------------------------
|
|
MixColumns:
|
|
unsigned rt, i, *s = (unsigned*) &state[0];
|
|
for (i = Nb; i--; ++s)
|
|
{
|
|
rt = RRR( *s );
|
|
*s ^= RL8( *s );
|
|
quad_xtime( s );
|
|
*s ^= rt;
|
|
}
|
|
|
|
InvMixColumns:
|
|
unsigned rt, i, *s = (unsigned*) &state[0];
|
|
for (i = Nb; i--; ++s)
|
|
{
|
|
rt = RRR( *s );
|
|
quad_xtime( s );
|
|
rt ^= RL8( *s ) ^ *s;
|
|
quad_xtime( s );
|
|
rt ^= R16( *s ) ^ *s;
|
|
quad_xtime( s );
|
|
*s ^= RRR( *s ) ^ rt;
|
|
}
|
|
|
|
================================================================================
|
|
A sample of the compiled assembly output, as given by gcc with the `-S -Os` flags (the listing is in Intel syntax, which gcc emits when `-masm=intel` is also given):
|
|
|
|
# xorBlock: XOR the 16 bytes at [rdi] into the 16 bytes at [rsi], 8 bytes at a
# time. Only rax is used as scratch; rdi and rsi are left unchanged.
xorBlock:
|
|
mov rax, QWORD PTR [rdi]                # rax = bytes 0..7 of the source
|
|
xor QWORD PTR [rsi], rax                # fold them into the destination
|
|
mov rax, QWORD PTR [rdi+8]              # rax = bytes 8..15 of the source
|
|
xor QWORD PTR [rsi+8], rax
|
|
ret
|
|
# quad_xtime: replace the 32-bit word at [rdi] by the word whose four bytes
# are each doubled in GF(2^8). Uses eax and edx only; rdi is left unchanged.
quad_xtime:
|
|
mov edx, DWORD PTR [rdi]                # edx = the word
|
|
mov eax, edx                            # eax = a second copy
|
|
add edx, edx                            # edx = word << 1
|
|
shr eax, 7                              # move each byte's top bit to its bit 0
|
|
and edx, -16843010                      # 0xFEFEFEFE: drop bits that crossed a byte boundary
|
|
and eax, 16843009                       # 0x01010101: keep one carry bit per byte
|
|
imul eax, eax, 27                       # each carry becomes 0x1b in its own byte
|
|
xor eax, edx                            # doubled bytes ^ reduction terms
|
|
mov DWORD PTR [rdi], eax                # store the result in place
|
|
ret
|
|
# KeyExpansion: copy the 16 bytes at [rdi] to the start of RoundKey, then fill
# RoundKey up to byte 176, one 4-byte word per loop iteration.
#   rax = byte offset of the word being produced (16, 20, ... 172)
#   dl  = current round constant, ecx = scratch
KeyExpansion:
|
|
movups xmm0, XMMWORD PTR [rdi]          # load the 16 input bytes
|
|
mov eax, 16                             # first word to generate is at offset 16
|
|
mov dl, 1                               # round constant starts at 1
|
|
movaps XMMWORD PTR RoundKey[rip], xmm0  # RoundKey[0..15] = input bytes
|
|
.L7:
|
|
test al, 15                             # is the offset a multiple of 16?
|
|
jne .L4                                 # no: plain word XOR at .L4
|
|
movaps xmm1, XMMWORD PTR RoundKey[rax-16]   # yes: start of a new 16-byte group;
|
|
movups XMMWORD PTR RoundKey[rax], xmm1  # seed it with a copy of the previous group
|
|
test dl, dl                             # dl is 0 once 0x80 has been doubled
|
|
jne .L5
|
|
mov dl, 27                              # ... in which case it continues with 0x1b
|
|
.L5:
|
|
movzx ecx, BYTE PTR RoundKey[rax-3]     # byte 1 of the previous word
|
|
mov cl, BYTE PTR sbox[rcx]              # substituted through sbox
|
|
xor cl, BYTE PTR RoundKey[rax]
|
|
xor ecx, edx                            # only the first byte gets the round constant
|
|
add edx, edx                            # double the constant for the next group
|
|
mov BYTE PTR RoundKey[rax], cl
|
|
movzx ecx, BYTE PTR RoundKey[rax-2]     # byte 2 of the previous word -> byte 1
|
|
mov cl, BYTE PTR sbox[rcx]
|
|
xor BYTE PTR RoundKey[rax+1], cl
|
|
movzx ecx, BYTE PTR RoundKey[rax-1]     # byte 3 of the previous word -> byte 2
|
|
mov cl, BYTE PTR sbox[rcx]
|
|
xor BYTE PTR RoundKey[rax+2], cl
|
|
movzx ecx, BYTE PTR RoundKey[rax-4]     # byte 0 of the previous word -> byte 3
|
|
mov cl, BYTE PTR sbox[rcx]
|
|
xor BYTE PTR RoundKey[rax+3], cl
|
|
jmp .L6
|
|
.L4:
|
|
mov ecx, DWORD PTR RoundKey[rax]        # word copied from the previous group ...
|
|
xor ecx, DWORD PTR RoundKey[rax-4]      # ... XOR the word just before this one
|
|
mov DWORD PTR RoundKey[rax], ecx
|
|
.L6:
|
|
add rax, 4                              # advance to the next word
|
|
cmp rax, 176                            # stop after 176 bytes (11 groups of 16)
|
|
jne .L7
|
|
ret
|
|
# SubBytes: replace each of the 16 bytes at [rdi] by its entry in the sbox
# table. Uses rax (index) and rdx (scratch); rdi is left unchanged.
SubBytes:
|
|
xor eax, eax                            # index = 0
|
|
.L12:
|
|
movzx edx, BYTE PTR [rdi+rax]           # edx = current byte, zero-extended
|
|
mov dl, BYTE PTR sbox[rdx]              # look it up
|
|
mov BYTE PTR [rdi+rax], dl              # write it back in place
|
|
inc rax
|
|
cmp rax, 16                             # 16 bytes in total
|
|
jne .L12
|
|
ret
|
|
# ShiftRows: permute the 16 bytes at [rdi] in place, using al and dl as
# temporaries. In terms of the byte offsets (old values on the right):
#   [1] <- [5] <- [9] <- [13] <- [1]      (offsets 1, 5, 9, 13 rotate by one)
#   [2] <-> [10],  [6] <-> [14]           (offsets 2, 6, 10, 14 swap pairwise)
#   [3] <- [15] <- [11] <- [7] <- [3]     (offsets 3, 7, 11, 15 rotate by three)
# Offsets 0, 4, 8 and 12 are not touched.
ShiftRows:
|
|
mov dl, BYTE PTR [rdi+5]
|
|
mov al, BYTE PTR [rdi+1]                # al = old [1]
|
|
mov BYTE PTR [rdi+1], dl                # [1] = old [5]
|
|
mov dl, BYTE PTR [rdi+9]
|
|
mov BYTE PTR [rdi+5], dl                # [5] = old [9]
|
|
mov dl, BYTE PTR [rdi+13]               # dl = old [13]
|
|
mov BYTE PTR [rdi+13], al               # [13] = old [1]
|
|
mov al, BYTE PTR [rdi+2]                # al = old [2]
|
|
mov BYTE PTR [rdi+9], dl                # [9] = old [13]
|
|
mov dl, BYTE PTR [rdi+10]               # dl = old [10]
|
|
mov BYTE PTR [rdi+10], al               # [10] = old [2]
|
|
mov al, BYTE PTR [rdi+6]                # al = old [6]
|
|
mov BYTE PTR [rdi+2], dl                # [2] = old [10]
|
|
mov dl, BYTE PTR [rdi+14]               # dl = old [14]
|
|
mov BYTE PTR [rdi+14], al               # [14] = old [6]
|
|
mov al, BYTE PTR [rdi+3]                # al = old [3]
|
|
mov BYTE PTR [rdi+6], dl                # [6] = old [14]
|
|
mov dl, BYTE PTR [rdi+15]
|
|
mov BYTE PTR [rdi+3], dl                # [3] = old [15]
|
|
mov dl, BYTE PTR [rdi+11]
|
|
mov BYTE PTR [rdi+15], dl               # [15] = old [11]
|
|
mov dl, BYTE PTR [rdi+7]                # dl = old [7]
|
|
mov BYTE PTR [rdi+7], al                # [7] = old [3]
|
|
mov BYTE PTR [rdi+11], dl               # [11] = old [7]
|
|
ret
|
|
# MixColumns: transform the four 32-bit words at [rdi] in place.
#   rdi = pointer to the current word (advanced by 4 per iteration)
#   r8  = end pointer, ecx/esi = rotation terms kept across the call
# The loop relies on quad_xtime (as listed in this file) touching only eax,
# edx and the word at [rdi], so ecx, esi, rdi and r8 survive the call.
MixColumns:
|
|
lea r8, [rdi+16]                        # r8 = one past the last word
|
|
.L16:
|
|
mov eax, DWORD PTR [rdi]                # eax = current word
|
|
mov esi, eax
|
|
mov ecx, eax
|
|
mov edx, eax
|
|
ror esi, 8                              # esi = word rotated left by 24
|
|
rol edx, 16                             # edx = word rotated left by 16
|
|
xor eax, esi                            # eax = word ^ rotl(word, 24)
|
|
rol ecx, 8                              # ecx = word rotated left by 8
|
|
mov DWORD PTR [rdi], eax                # store it where quad_xtime expects it
|
|
xor ecx, edx                            # ecx = rotl 8 ^ rotl 16
|
|
call quad_xtime                         # double every byte of the word at [rdi]
|
|
xor ecx, DWORD PTR [rdi]                # combine with the doubled word
|
|
add rdi, 4                              # advance to the next word
|
|
xor ecx, esi                            # add the remaining rotl 24 term
|
|
mov DWORD PTR [rdi-4], ecx              # store the result over the word just processed
|
|
cmp rdi, r8
|
|
jne .L16
|
|
ret
|
|
# rijndaelEncrypt: rdi = input block, rsi = output block (may be the same).
# The input is copied to the output, which is then transformed in place:
#   r9  = output pointer, kept across the calls
#   r10 = pointer to the next 16 bytes of RoundKey to be XOR-ed in
# The loop XORs in the 10 groups at RoundKey+0 .. RoundKey+144, each followed
# by SubBytes and ShiftRows, and by MixColumns except after the last one; the
# group at RoundKey+160 is XOR-ed in at the end.
rijndaelEncrypt:
|
|
mov r9, rsi                             # remember the output pointer
|
|
push rdx                                # value is discarded by `pop rax` below; presumably
                                        # only keeps the stack aligned at the calls -- TODO confirm
|
|
cmp rsi, rdi
|
|
je .L19                                 # same buffer: no copy needed
|
|
movups xmm0, XMMWORD PTR [rdi]
|
|
movups XMMWORD PTR [rsi], xmm0          # output = input
|
|
.L19:
|
|
mov r10d, OFFSET FLAT:RoundKey          # start with the first 16 bytes of RoundKey
|
|
.L21:
|
|
mov rdi, r10
|
|
mov rsi, r9
|
|
add r10, 16                             # r10 now points to the following group
|
|
call xorBlock                           # block ^= RoundKey group at rdi
|
|
mov rdi, r9
|
|
call SubBytes
|
|
call ShiftRows                          # rdi is still r9: SubBytes does not change it
|
|
cmp r10, OFFSET FLAT:RoundKey+160       # was that the group at RoundKey+144?
|
|
je .L20                                 # yes: last pass, skip MixColumns
|
|
call MixColumns
|
|
jmp .L21
|
|
.L20:
|
|
mov edi, OFFSET FLAT:RoundKey+160       # final group; rsi is still r9
|
|
pop rax                                 # undo the push at entry
|
|
jmp xorBlock                            # tail call: xorBlock's ret returns to our caller
|
|
# InvSubBytes: replace each of the 16 bytes at [rdi] by its entry in the rsbox
# table. Uses rax (index) and rdx (scratch); rdi is left unchanged.
InvSubBytes:
|
|
xor eax, eax                            # index = 0
|
|
.L24:
|
|
movzx edx, BYTE PTR [rdi+rax]           # edx = current byte, zero-extended
|
|
mov dl, BYTE PTR rsbox[rdx]             # look it up
|
|
mov BYTE PTR [rdi+rax], dl              # write it back in place
|
|
inc rax
|
|
cmp rax, 16                             # 16 bytes in total
|
|
jne .L24
|
|
ret
|
|
# InvShiftRows: permute the 16 bytes at [rdi] in place, using al and dl as
# temporaries. In terms of the byte offsets (old values on the right):
#   [13] <- [9] <- [5] <- [1] <- [13]     (offsets 1, 5, 9, 13 rotate back by one)
#   [2] <-> [10],  [6] <-> [14]           (offsets 2, 6, 10, 14 swap pairwise)
#   [3] <- [7] <- [11] <- [15] <- [3]     (offsets 3, 7, 11, 15 rotate back by three)
# Offsets 0, 4, 8 and 12 are not touched.
InvShiftRows:
|
|
mov dl, BYTE PTR [rdi+9]
|
|
mov al, BYTE PTR [rdi+13]               # al = old [13]
|
|
mov BYTE PTR [rdi+13], dl               # [13] = old [9]
|
|
mov dl, BYTE PTR [rdi+5]
|
|
mov BYTE PTR [rdi+9], dl                # [9] = old [5]
|
|
mov dl, BYTE PTR [rdi+1]                # dl = old [1]
|
|
mov BYTE PTR [rdi+1], al                # [1] = old [13]
|
|
mov al, BYTE PTR [rdi+2]                # al = old [2]
|
|
mov BYTE PTR [rdi+5], dl                # [5] = old [1]
|
|
mov dl, BYTE PTR [rdi+10]               # dl = old [10]
|
|
mov BYTE PTR [rdi+10], al               # [10] = old [2]
|
|
mov al, BYTE PTR [rdi+6]                # al = old [6]
|
|
mov BYTE PTR [rdi+2], dl                # [2] = old [10]
|
|
mov dl, BYTE PTR [rdi+14]               # dl = old [14]
|
|
mov BYTE PTR [rdi+14], al               # [14] = old [6]
|
|
mov al, BYTE PTR [rdi+3]                # al = old [3]
|
|
mov BYTE PTR [rdi+6], dl                # [6] = old [14]
|
|
mov dl, BYTE PTR [rdi+7]
|
|
mov BYTE PTR [rdi+3], dl                # [3] = old [7]
|
|
mov dl, BYTE PTR [rdi+11]
|
|
mov BYTE PTR [rdi+7], dl                # [7] = old [11]
|
|
mov dl, BYTE PTR [rdi+15]               # dl = old [15]
|
|
mov BYTE PTR [rdi+15], al               # [15] = old [3]
|
|
mov BYTE PTR [rdi+11], dl               # [11] = old [15]
|
|
ret
|
|
# InvMixColumns: transform the four 32-bit words at [rdi] in place.
#   rcx = pointer just past the word being processed, rsi = end pointer
#   r8d = original word (a), r10d = 2a, r9d = 4a, edx = 8a (bytes doubled by
#   quad_xtime), eax = running XOR of these values and their rotations
# The loop relies on quad_xtime (as listed in this file) touching only eax,
# edx and the word at [rdi], so rdi, rcx, rsi and r8..r10 survive the calls.
InvMixColumns:
|
|
mov rcx, rdi                            # rcx walks over the words
|
|
lea rsi, [rdi+16]                       # rsi = one past the last word
|
|
.L28:
|
|
mov rdi, rcx                            # argument for the quad_xtime calls
|
|
mov r8d, DWORD PTR [rcx]                # r8d = a (the original word)
|
|
add rcx, 4                              # the word is at [rcx-4] from here on
|
|
call quad_xtime
|
|
mov r10d, DWORD PTR [rcx-4]             # r10d = 2a
|
|
call quad_xtime
|
|
mov r9d, DWORD PTR [rcx-4]              # r9d = 4a
|
|
call quad_xtime
|
|
mov edx, DWORD PTR [rcx-4]              # edx = 8a
|
|
mov eax, r10d                           # eax = 2a
|
|
mov edi, r8d
|
|
rol edi, 8                              # edi = rotl(a, 8)
|
|
xor eax, r9d                            # ^ 4a
|
|
ror r10d, 8                             # r10d = rotl(2a, 24)
|
|
xor eax, edx                            # ^ 8a
|
|
rol r9d, 16                             # r9d = rotl(4a, 16)
|
|
xor eax, edi                            # ^ rotl(a, 8)
|
|
mov edi, r8d
|
|
ror r8d, 8                              # r8d = rotl(a, 24)
|
|
rol edi, 16                             # edi = rotl(a, 16)
|
|
xor eax, edi                            # ^ rotl(a, 16)
|
|
mov edi, edx
|
|
xor eax, r8d                            # ^ rotl(a, 24)
|
|
rol edi, 8                              # edi = rotl(8a, 8)
|
|
xor eax, r10d                           # ^ rotl(2a, 24)
|
|
xor eax, r9d                            # ^ rotl(4a, 16)
|
|
xor eax, edi                            # ^ rotl(8a, 8)
|
|
mov edi, edx
|
|
ror edx, 8                              # edx = rotl(8a, 24)
|
|
rol edi, 16                             # edi = rotl(8a, 16)
|
|
xor eax, edi                            # ^ rotl(8a, 16)
|
|
xor eax, edx                            # ^ rotl(8a, 24)
|
|
mov DWORD PTR [rcx-4], eax              # store the result over the processed word
|
|
cmp rsi, rcx
|
|
jne .L28
|
|
ret
|
|
# rijndaelDecrypt: rdi = input block, rsi = output block (may be the same).
# The input is copied to the output, which is then transformed in place:
#   r11 = output pointer, kept across the calls
#   rbp = pointer to the RoundKey group to XOR in, from RoundKey+144 down to +0
#   bl  = pass counter, 10 down to 1
# The first pass XORs in the group at RoundKey+160; every later pass starts
# with InvMixColumns instead. Each pass then runs InvShiftRows, InvSubBytes
# and an XOR with the group at rbp.
rijndaelDecrypt:
|
|
push rbp                                # rbp and rbx are used below and restored at exit
|
|
mov r11, rsi                            # remember the output pointer
|
|
push rbx
|
|
push rdx                                # value is discarded by `pop rax` below; presumably
                                        # only keeps the stack aligned at the calls -- TODO confirm
|
|
cmp rsi, rdi
|
|
je .L31                                 # same buffer: no copy needed
|
|
movups xmm0, XMMWORD PTR [rdi]
|
|
movups XMMWORD PTR [rsi], xmm0          # output = input
|
|
.L31:
|
|
mov ebp, OFFSET FLAT:RoundKey+144       # group used at the end of the first pass
|
|
mov bl, 10                              # ten passes
|
|
.L34:
|
|
cmp bl, 10
|
|
je .L32                                 # first pass only: go XOR with RoundKey+160
|
|
mov rdi, r11
|
|
call InvMixColumns
|
|
jmp .L33
|
|
.L32:
|
|
mov rsi, r11
|
|
mov edi, OFFSET FLAT:RoundKey+160
|
|
call xorBlock                           # block ^= group at RoundKey+160
|
|
.L33:
|
|
mov rdi, r11
|
|
mov rsi, r11                            # reloaded here: InvMixColumns overwrites rsi
|
|
call InvShiftRows
|
|
call InvSubBytes                        # rdi is still r11: InvShiftRows does not change it
|
|
mov rdi, rbp
|
|
sub rbp, 16                             # next pass uses the preceding group
|
|
call xorBlock                           # block ^= group at rdi; rsi is still r11
|
|
dec bl
|
|
jne .L34
|
|
pop rax                                 # undo `push rdx`
|
|
pop rbx
|
|
pop rbp
|
|
ret
|