p7zip-rar/Asm/x86/AesOpt.asm

311 lines
5.3 KiB
NASM

; AesOpt.asm -- Intel's AES.
; 2009-12-12 : Igor Pavlov : Public domain
%include "7zAsm.asm"
MY_ASM_START
; NOTE(review): empty placeholder for non-x64 builds; the FIXME presumably
; refers to the MASM `.xmm` directive of the original source -- TODO confirm.
%ifndef x64
; FIXME .xmm
%endif
; rD: register used as the data pointer. Blocks are read and written through
; [rD + offs] and rD is advanced by 16 bytes per processed block.
%define rD r2
; rN: register holding the number of blocks that still have to be processed.
%define rN r0
; MY_PROLOG reg
; Common entry sequence of the AES routines in this file.
;   * Brings the three arguments (ivAes, data, numBlocks) into r1, rD and rN:
;       - x64, CYGWIN64 : arguments already arrive in rcx / rdx / r8.
;       - x64, otherwise: arguments are copied from rdi / rsi / rdx into
;                         rcx / rdx / r8 so the rest of the code is shared.
;       - 32-bit        : arguments are loaded from the stack.
;   * Saves r3 and r5 (and r6 on CYGWIN64 and on 32-bit builds).
;   * CYGWIN64 only: saves xmm6 and xmm7, which the Win64 calling convention
;     treats as callee-saved and which the decode and CTR routines overwrite.
;   * x6  = dword at [ivAes + 16], multiplied by 32.
;   * reg = 16 bytes at [ivAes] (movdqa, so ivAes must be 16-byte aligned).
;   * r1 is advanced by 32 bytes.
; Must be paired with MY_EPILOG.
%macro MY_PROLOG 1 ; MY_PROLOG macro reg:req
  %ifdef x64
    %ifdef CYGWIN64
      ; ivAes : %rcx
      ; data : %rdx
      ; numBlocks : %r8
      ; Spill xmm6/xmm7 into the caller-provided 32-byte shadow space, which
      ; starts right above the return address. This is done before any push,
      ; so r4 still has its entry value; the Win64 convention makes
      ; [r4 + 8] 16-byte aligned at that point, as movdqa requires.
      movdqa [r4 + 8], xmm6
      movdqa [r4 + 8 + 16], xmm7
    %else
      ; R8 must be loaded from RDX before RDX itself is overwritten.
      mov RCX,RDI
      mov R8 ,RDX
      mov RDX,RSI
    %endif
  %endif
  push r3
  push r5
  %ifdef x64
    %ifdef CYGWIN64
      push r6
    %endif
    mov rN, r8
  %else
    push r6
    ; Three registers were pushed above, so together with the return address
    ; the first stack argument sits at [r4 + REG_SIZE * 4].
    mov ecx, [r4 + REG_SIZE * 4]
    mov edx, [r4 + REG_SIZE * 5]
    mov rN, [r4 + REG_SIZE * 6]
  %endif
  mov x6, [r1 + 16]
  shl x6, 5
  movdqa %1, [r1] ; reg
  add r1, 32
%endmacro

; MY_EPILOG
; Undoes MY_PROLOG: pops the saved general-purpose registers in reverse order,
; restores xmm6/xmm7 on CYGWIN64 and ends the procedure through MY_ENDP.
%macro MY_EPILOG 0
  %ifdef x64
    %ifdef CYGWIN64
      pop r6
    %endif
  %else
    pop r6
  %endif
  pop r5
  pop r3
  %ifdef x64
    %ifdef CYGWIN64
      ; All pushes have been popped, so r4 is back at its entry value and the
      ; shadow-space slots written by MY_PROLOG are addressed the same way.
      movdqa xmm6, [r4 + 8]
      movdqa xmm7, [r4 + 8 + 16]
    %endif
  %endif
  MY_ENDP
%endmacro
; Number of blocks handled per iteration of the wide loops (one block in each
; of xmm0..xmm3).
ways equ 4
; Size in bytes of `ways` 16-byte blocks; the step added to rD by the wide loops.
ways16 equ (ways * 16)
; OP_W op, operand
; Emits `op xmmN, operand` for N = 0..3. While each line is emitted the
; single-line macro `i` is defined as N, so `operand` may mention `i` to get a
; per-lane value, e.g.  OP_W movdqa, [rD + i * 16].
;
; `i` is undefined again at the end. Without that it would stay defined as 3,
; and because NASM expands single-line macros on the invoking line before the
; multi-line macro is applied, any later `OP_W ..., <expr using i>` would
; receive the operand with `i` already replaced by 3 for all four lanes.
%macro OP_W 2 ; op, op2
  %define i 0
  %1 xmm0,%2
  %define i 1
  %1 xmm1,%2
  %define i 2
  %1 xmm2,%2
  %define i 3
  %1 xmm3,%2
  %undef i
%endmacro
; LOAD_OP op, offs
; Applies instruction `op` to xmm0 with the memory operand [r1 + r3 offs].
; `offs` is pasted verbatim after r3, so it has to carry its own sign
; (e.g. +32, -16, +0).
%macro LOAD_OP 2 ; LOAD_OP macro op:req, offs:req
%1 xmm0, [r1 + r3 %2]
%endmacro
; LOAD_OP_W op, offs
; Four-lane counterpart of LOAD_OP: loads the 16 bytes at [r1 + r3 offs] into
; xmm7 once, then applies `op xmmN, xmm7` to xmm0..xmm3.
; Clobbers xmm7. `offs` must carry its own sign, as for LOAD_OP.
%macro LOAD_OP_W 2 ; LOAD_OP_W macro op:req, offs:req
movdqa xmm7, [r1 + r3 %2]
; The four lines below are the hand-unrolled form of the OP_W call that is
; left commented out here.
; OP_W %1, xmm7
%1 xmm0,xmm7
%1 xmm1,xmm7
%1 xmm2,xmm7
%1 xmm3,xmm7
%endmacro
; ---------- AES-CBC Decode ----------
; CBC_DEC_UPDATE reg, offs
; Completes one block of the wide CBC-decode loop:
;   1. reg ^= xmm6             (xmm6 holds the current chaining value)
;   2. xmm6 = [rD + offs]      (read before that location is overwritten)
;   3. [rD + offs] = reg
; The order matters: step 2 has to read the data at [rD + offs] before step 3
; replaces it.
%macro CBC_DEC_UPDATE 2 ; CBC_DEC_UPDATE macro reg, offs
pxor %1, xmm6
movdqa xmm6, [rD + %2]
movdqa [rD + %2], %1
%endmacro
; DECODE op
; Emits the aesdec round sequence. `op` names a two-argument macro (LOAD_OP or
; LOAD_OP_W in this file) that is invoked as `op instruction, offs` and applies
; the instruction with the key material at [r1 + r3 offs].
;
; On entry r3 holds a byte offset into the key schedule that is a positive
; multiple of 32. One aesdec is issued at +16, then the loop issues two
; aesdec rounds (+0 and -16) and lowers x3 by 32 until it reaches zero, and
; finally aesdeclast is issued at +0 (i.e. at [r1]).
; Clobbers x3 and the flags.
%macro DECODE 1 ; macro op:req
        %1      aesdec, +16
%%round_pair:
        %1      aesdec, +0
        %1      aesdec, -16
        sub     x3, 32
        jnz     %%round_pair
        %1      aesdeclast, +0
%endmacro
; void AesCbc_Decode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks)
MY_PROC AesCbc_Decode_Intel, 3
        ; After the prolog: xmm6 = chaining value read from [ivAes],
        ; r1 = ivAes + 32, rD = data, rN = numBlocks, x6 = 32 * [ivAes + 16].
        MY_PROLOG xmm6
        ; x6 becomes the key offset every block starts from (see DECODE).
        sub     x6, 32
        jmp     .dec_check_wide

        align 16
.dec_wide:
        ; ---- `ways` (4) blocks per pass, one block in each of xmm0..xmm3 ----
        mov     x3, x6
        OP_W    movdqa, [rD + i * 16]
        ; First key XOR, then the aesdec rounds, on all four lanes.
        LOAD_OP_W pxor, +32
        DECODE  LOAD_OP_W
        ; Chain and store lane by lane, in data order, so that xmm6 always
        ; holds the input block that preceded the lane being finished.
        CBC_DEC_UPDATE xmm0, 0 * 16
        CBC_DEC_UPDATE xmm1, 1 * 16
        CBC_DEC_UPDATE xmm2, 2 * 16
        CBC_DEC_UPDATE xmm3, 3 * 16
        add     rD, ways16
.dec_check_wide:
        ; Stay in the wide loop while at least `ways` blocks remain; the
        ; borrow from the subtraction signals that fewer are left.
        sub     rN, ways
        jnc     .dec_wide
        add     rN, ways
        jmp     .dec_check_single

.dec_single:
        ; ---- one block per pass ----
        mov     x3, x6
        ; xmm1 keeps the input block: it becomes the next chaining value.
        movdqa  xmm1, [rD]
        LOAD_OP movdqa, +32
        pxor    xmm0, xmm1
        DECODE  LOAD_OP
        pxor    xmm0, xmm6
        movdqa  xmm6, xmm1
        movdqa  [rD], xmm0
        add     rD, 16
.dec_check_single:
        sub     rN, 1
        jnc     .dec_single

        ; Write the final chaining value back to [ivAes] (r1 is ivAes + 32).
        movdqa  [r1 - 32], xmm6
        MY_EPILOG
; ---------- AES-CBC Encode ----------
; ENCODE op
; Emits the aesenc round sequence. `op` names a two-argument macro (LOAD_OP or
; LOAD_OP_W in this file) that is invoked as `op instruction, offs` and applies
; the instruction with the key material at [r1 + r3 offs].
;
; On entry r3 holds a negative byte offset (a multiple of 32) relative to r1.
; One aesenc is issued at -16, then the loop issues two aesenc rounds (+0 and
; +16) and raises r3 by 32 until it reaches zero, and finally aesenclast is
; issued at +0 (i.e. at [r1]).
; Clobbers r3 and the flags.
%macro ENCODE 1 ; macro op:req
        %1      aesenc, -16
%%round_pair:
        %1      aesenc, +0
        %1      aesenc, +16
        add     r3, 32
        jnz     %%round_pair
        %1      aesenclast, +0
%endmacro
; AES-CBC encryption, one block at a time.
; Arguments, as named in MY_PROLOG: ivAes, data, numBlocks.
MY_PROC AesCbc_Encode_Intel, 3
        ; After the prolog: xmm0 = chaining value read from [ivAes],
        ; r1 = ivAes + 32, rD = data, rN = numBlocks, x6 = 32 * [ivAes + 16].
        MY_PROLOG xmm0
        ; Point r1 past the key material covered by x6 and turn r6 into the
        ; negative start offset that ENCODE counts up to zero.
        add     r1, r6
        neg     r6
        add     r6, 32
        jmp     .enc_check

        align 16
.enc_block:
        mov     r3, r6
        ; Chain: fold the input block into the previous output (or the IV),
        ; then apply the first key at [r1 + r3 - 32].
        pxor    xmm0, [rD]
        LOAD_OP pxor, -32
        ENCODE  LOAD_OP
        ; xmm0 is both the output block and the next chaining value.
        movdqa  [rD], xmm0
        add     rD, 16
.enc_check:
        sub     rN, 1
        jnc     .enc_block

        ; r1 + r6 - 64 is the original ivAes: store the last output block there.
        movdqa  [r1 + r6 - 64], xmm0
        MY_EPILOG
; ---------- AES-CTR ----------
; XOR_UPD_1 reg, offs
; reg ^= 16 bytes at [rD + offs]. First half of the CTR output step; the
; result is written back by XOR_UPD_2.
%macro XOR_UPD_1 2 ; reg, offs
pxor %1, [rD + %2]
%endmacro
; XOR_UPD_2 reg, offs
; Stores `reg` to the 16 bytes at [rD + offs] (movdqa, so the address must be
; 16-byte aligned). Second half of the CTR output step.
%macro XOR_UPD_2 2 ; reg, offs
movdqa [rD + %2], %1
%endmacro
; AES-CTR: for every 16-byte block the counter in xmm6 is advanced, encrypted,
; and XORed into the data in place.
; Arguments, as named in MY_PROLOG: ivAes, data, numBlocks.
;
; Register use after the prolog:
;   xmm6 = counter block read from [ivAes]; written back on exit
;   xmm5 = constant increment {low qword = 1, high qword = 0}
;   r1   = end of the key material walked by ENCODE, r6 = negative start offset
;   rD   = data pointer, rN = blocks remaining
;
; The increment constant is kept in xmm5 rather than in memory below the
; stack pointer. Memory below the stack pointer is not reserved outside the
; SysV x86-64 red zone (32-bit and CYGWIN64 builds), so a signal or exception
; frame could overwrite a constant stored there while the loop is running.
; xmm5 is not used by any macro in this file and is a volatile register in
; the SysV, Win64 and 32-bit cdecl conventions.
MY_PROC AesCtr_Code_Intel, 3
        MY_PROLOG xmm6
        ; movd zero-extends to 128 bits: xmm5 = 1 in the low qword, 0 above.
        ; x3 is free here (saved by the prolog, reloaded before each use).
        mov     x3, 1
        movd    xmm5, x3
        add     r1, r6
        neg     r6
        add     r6, 32
        jmp     check2_c

        align 16
nextBlocks2_c:
        ; ---- `ways` (4) blocks per pass ----
        ; Four consecutive counter values go to xmm0..xmm3.
        paddq   xmm6, xmm5
        movdqa  xmm0, xmm6
        paddq   xmm6, xmm5
        movdqa  xmm1, xmm6
        paddq   xmm6, xmm5
        movdqa  xmm2, xmm6
        paddq   xmm6, xmm5
        movdqa  xmm3, xmm6
        mov     r3, r6
        ; Encrypt the four counters (LOAD_OP_W uses xmm7 as scratch).
        LOAD_OP_W pxor, -32
        ENCODE  LOAD_OP_W
        ; XOR the key stream with the data, then store the result in place.
        XOR_UPD_1 xmm0, 0 * 16
        XOR_UPD_1 xmm1, 1 * 16
        XOR_UPD_1 xmm2, 2 * 16
        XOR_UPD_1 xmm3, 3 * 16
        XOR_UPD_2 xmm0, 0 * 16
        XOR_UPD_2 xmm1, 1 * 16
        XOR_UPD_2 xmm2, 2 * 16
        XOR_UPD_2 xmm3, 3 * 16
        add     rD, ways16
check2_c:
        ; Stay in the wide loop while at least `ways` blocks remain.
        sub     rN, ways
        jnc     nextBlocks2_c
        add     rN, ways
        jmp     check_c

nextBlock_c:
        ; ---- one block per pass ----
        paddq   xmm6, xmm5
        mov     r3, r6
        movdqa  xmm0, [r1 + r3 - 32]
        pxor    xmm0, xmm6
        ENCODE  LOAD_OP
        XOR_UPD_1 xmm0, 0
        XOR_UPD_2 xmm0, 0
        add     rD, 16
check_c:
        sub     rN, 1
        jnc     nextBlock_c

        ; r1 + r6 - 64 is the original ivAes: store the updated counter there.
        movdqa  [r1 + r6 - 64], xmm6
        MY_EPILOG
; end
; Emit an empty .note.GNU-stack section so that ELF linkers do not mark the
; stack executable because of this object.
; __OUTPUT_FORMAT__ carries the output format name, which can be `elf`,
; `elf32`, `elf64` or `elfx32` depending on the assembler and its -f option,
; so every ELF spelling is tested. At most one of the tests matches, so the
; section is declared at most once.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elfx32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif