diff --git a/module/icp/asm-aarch64/modes/aes-gcm-armv8_64.asm b/module/icp/asm-aarch64/modes/aes-gcm-armv8_64.asm new file mode 100644 index 000000000000..9ee8bcf58222 --- /dev/null +++ b/module/icp/asm-aarch64/modes/aes-gcm-armv8_64.asm @@ -0,0 +1,6404 @@ +#include "arm_arch.h" + +#if __ARM_MAX_ARCH__>=8 +.fpu neon +#ifdef __thumb2__ +.syntax unified +.thumb +# define INST(a,b,c,d) c,0xef,a,b +#else +.code 32 +# define INST(a,b,c,d) a,b,c,0xf2 +#endif + +.text +.globl aes_gcm_enc_128_kernel +.type aes_gcm_enc_128_kernel,%function +.align 4 +aes_gcm_enc_128_kernel: + AARCH64_VALID_CALL_TARGET + cbz r1, .L128_enc_ret + stp r19, r20, [sp, #-112]! + mov r16, r4 + mov r8, r5 + stp r21, r22, [sp, #16] + stp r23, r24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #160] @ load rk10 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif + ld1 {v11.16b}, [r3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + lsr r5, r1, #3 @ byte_len + mov r15, r5 + + ld1 {v18.4s}, [r8], #16 @ load rk0 + add r4, r0, r1, lsr #3 @ end_input_ptr + sub r5, r5, #1 @ byte_len - 1 + + lsr r12, r11, #32 + ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + fmov d1, r10 @ CTR block 1 + rev r12, r12 @ rev_ctr32 + + add r12, r12, #1 @ increment rev_ctr32 + orr r11, r11, r11 + ld1 {v19.4s}, [r8], #16 @ load rk1 + + rev r9, r12 @ CTR block 1 + add r12, r12, #1 @ CTR block 1 + fmov d3, r10 @ CTR block 3 + + orr r9, r11, r9, lsl #32 @ CTR block 1 + ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], r9 @ CTR block 1 + rev r9, r12 @ CTR block 2 + + fmov d2, r10 @ CTR block 2 + orr r9, r11, r9, lsl #32 @ CTR block 2 + add r12, r12, #1 @ CTR block 2 + + fmov v2.d[1], r9 @ CTR block 2 + rev r9, r12 @ CTR block 3 + + orr r9, r11, r9, lsl #32 @ CTR block 3 + ld1 {v20.4s}, [r8], #16 @ load rk2 + + add r12, r12, #1 @ CTR block 3 + fmov v3.d[1], r9 @ CTR block 3 + + ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese q1, v18.16b + aesmc q1, q1 @ AES block 1 - round 0 + ld1 {v21.4s}, [r8], #16 @ load rk3 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 2 - round 0 + ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + + aese q0, v18.16b + aesmc q0, q0 @ AES block 0 - round 0 + ld1 {v22.4s}, [r8], #16 @ load rk4 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 3 - round 0 + ld1 {v23.4s}, [r8], #16 @ load rk5 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 2 - round 1 + trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l + + aese q0, v19.16b + aesmc q0, q0 @ AES block 0 - round 1 + ld1 {v24.4s}, [r8], #16 @ load rk6 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 1 - round 1 + ld1 {v25.4s}, [r8], #16 @ load rk7 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 3 - round 1 + trn1 q9, v14.2d, v15.2d @ h4h | h3h + + aese q0, v20.16b + aesmc q0, q0 @ AES block 0 - round 2 + ld1 {v26.4s}, [r8], #16 @ load rk8 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 1 - round 2 + ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese q3, v20.16b + aesmc q3, q3 @ AES block 3 - round 2 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 2 - round 2 + eor v17.16b, v17.16b, q9 @ h4k | h3k 
+ + aese q0, v21.16b + aesmc q0, q0 @ AES block 0 - round 3 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 1 - round 3 + + aese q2, v21.16b + aesmc q2, q2 @ AES block 2 - round 3 + ld1 {v27.4s}, [r8], #16 @ load rk9 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 3 - round 3 + + and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l + + aese q3, v22.16b + aesmc q3, q3 @ AES block 3 - round 4 + add r5, r5, r0 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 2 - round 4 + cmp r0, r5 @ check if we have <= 4 blocks + + aese q0, v22.16b + aesmc q0, q0 @ AES block 0 - round 4 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 3 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 2 - round 5 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 0 - round 5 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 3 - round 6 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 1 - round 4 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 2 - round 6 + trn1 q8, v12.2d, v13.2d @ h2h | h1h + + aese q0, v24.16b + aesmc q0, q0 @ AES block 0 - round 6 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 1 - round 5 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 3 - round 7 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 0 - round 7 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 1 - round 6 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 2 - round 7 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 0 - round 8 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 1 - round 7 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 2 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 3 - round 8 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 1 - round 8 + + aese q2, v27.16b @ AES block 2 - round 9 + + aese q0, v27.16b @ AES block 0 - round 9 + + eor v16.16b, v16.16b, q8 @ h2k | h1k + + aese q1, v27.16b @ AES block 1 - round 9 + + aese q3, v27.16b @ AES block 3 - round 9 + bge .L128_enc_tail @ handle tail + + ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif + ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif + ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif + eor r6, r6, r13 @ AES block 0 - round 10 low + eor r7, r7, r14 @ AES block 0 - round 10 high + + eor r21, r21, r13 @ AES block 2 - round 10 low + fmov d4, r6 @ AES block 0 - mov low + + eor r19, r19, r13 @ AES block 1 - round 10 low + eor r22, r22, r14 @ AES block 2 - round 10 high + fmov v4.d[1], r7 @ AES block 0 - mov high + + fmov d5, r19 @ AES block 1 - mov low + eor r20, r20, r14 @ AES block 1 - round 10 high + + eor r23, r23, r13 @ AES block 3 - round 10 low + fmov v5.d[1], r20 @ AES block 1 - mov high + + fmov d6, r21 @ AES block 2 - mov low + eor r24, r24, r14 @ AES block 3 - round 10 high + rev r9, r12 @ CTR block 4 + + fmov v6.d[1], r22 @ AES block 2 - mov high + orr r9, r11, r9, lsl #32 @ CTR block 4 + + eor q4, q4, q0 @ AES block 0 - result + fmov d0, r10 @ CTR block 4 + add r12, r12, #1 @ CTR block 4 + + fmov v0.d[1], r9 @ CTR block 4 + rev r9, r12 @ CTR block 5 + + eor q5, q5, q1 @ AES block 1 - result + fmov d1, r10 @ CTR block 5 + orr r9, r11, r9, lsl #32 @ CTR block 5 + + add r12, r12, #1 @ CTR block 5 + add r0, r0, #64 @ AES input_ptr update + fmov v1.d[1], r9 @ CTR block 
5 + + fmov d7, r23 @ AES block 3 - mov low + rev r9, r12 @ CTR block 6 + st1 { q4}, [r2], #16 @ AES block 0 - store result + + fmov v7.d[1], r24 @ AES block 3 - mov high + orr r9, r11, r9, lsl #32 @ CTR block 6 + + add r12, r12, #1 @ CTR block 6 + eor q6, q6, q2 @ AES block 2 - result + st1 { q5}, [r2], #16 @ AES block 1 - store result + + fmov d2, r10 @ CTR block 6 + cmp r0, r5 @ check if we have <= 8 blocks + + fmov v2.d[1], r9 @ CTR block 6 + rev r9, r12 @ CTR block 7 + st1 { q6}, [r2], #16 @ AES block 2 - store result + + orr r9, r11, r9, lsl #32 @ CTR block 7 + + eor q7, q7, q3 @ AES block 3 - result + st1 { q7}, [r2], #16 @ AES block 3 - store result + bge .L128_enc_prepretail @ do prepretail + +.L128_enc_main_loop:@ main loop start + ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif + rev64 q4, q4 @ GHASH block 4k (only t0 is free) + rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + fmov d3, r10 @ CTR block 4k+3 + + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + add r12, r12, #1 @ CTR block 4k+3 + fmov v3.d[1], r9 @ CTR block 4k+3 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + mov d30, v5.d[1] @ GHASH block 4k+1 - mid + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + eor q4, q4, v11.16b @ PRE 1 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor r24, r24, r14 @ AES block 4k+3 - round 10 high + + pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + rev r9, r12 @ CTR block 4k+8 + + eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid + mov d8, v4.d[1] @ GHASH block 4k - mid + orr r9, r11, r9, lsl #32 @ CTR block 4k+8 + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + add r12, r12, #1 @ CTR block 4k+8 + mov d10, v17.d[1] @ GHASH block 4k - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + eor q8, q8, q4 @ GHASH block 4k - mid + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + eor q9, q9, v28.16b @ GHASH block 4k+1 - high + + pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid + + pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high + eor r7, r7, r14 @ AES block 4k+4 - round 10 high + + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + eor r6, r6, r13 @ AES block 4k+4 - round 10 low + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + pmull2 v4.1q, q7, v12.2d @ GHASH block 
4k+3 - high + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + eor q9, q9, q8 @ GHASH block 4k+2 - high + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + + pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low + movi q8, #0xc2 + + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + shl d8, d8, #56 @ mod_constant + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + eor q9, q9, q4 @ GHASH block 4k+3 - high + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + eor r19, r19, r13 @ AES block 4k+5 - round 10 low + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + eor r23, r23, r13 @ AES block 4k+3 - round 10 low + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + fmov d4, r6 @ AES block 4k+4 - mov low + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + fmov v4.d[1], r7 @ AES block 4k+4 - mov high + + add r0, r0, #64 @ AES input_ptr update + fmov d7, r23 @ AES block 4k+3 - mov low + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + fmov d5, r19 @ AES block 4k+5 - mov low + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + eor r20, r20, r14 @ AES block 4k+5 - round 10 high + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + fmov v5.d[1], r20 @ AES block 4k+5 - mov high + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + fmov v7.d[1], r24 @ AES block 4k+3 - mov high + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + cmp r0, r5 @ .LOOP CONTROL + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + aese q0, v27.16b @ AES block 4k+4 - round 9 + eor r21, r21, r13 @ AES block 4k+6 - round 10 low + eor r22, r22, r14 @ AES block 4k+6 - round 10 high + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + fmov d6, r21 @ AES block 4k+6 - mov low + + aese q1, v27.16b @ AES block 4k+5 - round 9 + fmov v6.d[1], r22 @ AES block 4k+6 - mov high + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + eor q4, q4, q0 @ AES block 4k+4 - result + + fmov d0, r10 @ CTR block 4k+8 + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + fmov v0.d[1], r9 @ CTR block 4k+8 + rev r9, r12 @ CTR block 4k+9 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + eor q5, q5, q1 @ AES block 4k+5 - result + + add r12, r12, 
#1 @ CTR block 4k+9 + orr r9, r11, r9, lsl #32 @ CTR block 4k+9 + fmov d1, r10 @ CTR block 4k+9 + + pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low + fmov v1.d[1], r9 @ CTR block 4k+9 + rev r9, r12 @ CTR block 4k+10 + + aese q2, v27.16b @ AES block 4k+6 - round 9 + st1 { q4}, [r2], #16 @ AES block 4k+4 - store result + eor q6, q6, q2 @ AES block 4k+6 - result + orr r9, r11, r9, lsl #32 @ CTR block 4k+10 + + aese q3, v27.16b @ AES block 4k+7 - round 9 + add r12, r12, #1 @ CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + fmov d2, r10 @ CTR block 4k+10 + + eor v11.16b, v11.16b, q9 @ MODULO - fold into low + st1 { q5}, [r2], #16 @ AES block 4k+5 - store result + + fmov v2.d[1], r9 @ CTR block 4k+10 + st1 { q6}, [r2], #16 @ AES block 4k+6 - store result + rev r9, r12 @ CTR block 4k+11 + + orr r9, r11, r9, lsl #32 @ CTR block 4k+11 + eor q7, q7, q3 @ AES block 4k+3 - result + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + st1 { q7}, [r2], #16 @ AES block 4k+3 - store result + blt .L128_enc_main_loop + +.L128_enc_prepretail:@ PREPRETAIL + rev64 q4, q4 @ GHASH block 4k (only t0 is free) + fmov d3, r10 @ CTR block 4k+3 + rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) + + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + add r12, r12, #1 @ CTR block 4k+3 + fmov v3.d[1], r9 @ CTR block 4k+3 + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low + + rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + eor q4, q4, v11.16b @ PRE 1 + + pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + mov d30, v5.d[1] @ GHASH block 4k+1 - mid + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d8, v4.d[1] @ GHASH block 4k - mid + + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + mov d10, v17.d[1] @ GHASH block 4k - mid + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid + + eor q8, q8, q4 @ GHASH block 4k - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + + pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + eor q9, q9, v28.16b @ GHASH block 4k+1 - high + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + + pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high + + pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + eor q9, q9, q8 @ GHASH block 4k+2 - high + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low + movi q8, #0xc2 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + + pmull 
v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + eor q9, q9, q4 @ GHASH block 4k+3 - high + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + shl d8, d8, #56 @ mod_constant + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + + pmull v28.1q, q9, q8 + eor v10.16b, v10.16b, q9 @ karatsuba tidy up + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + ext q9, q9, q9, #8 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v11.16b + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v28.16b + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + eor v10.16b, v10.16b, q9 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + + pmull v28.1q, v10.1d, q8 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v28.16b + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + + aese q3, v27.16b @ AES block 4k+7 - round 9 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + + aese q0, v27.16b @ AES block 4k+4 - round 9 + + aese q1, v27.16b @ AES block 4k+5 - round 9 + eor v11.16b, v11.16b, v10.16b + + aese q2, v27.16b @ AES block 4k+6 - round 9 +.L128_enc_tail:@ TAIL + + sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process + ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + cmp r5, #48 + + ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag + eor r6, r6, r13 @ AES block 4k+4 - round 10 low + eor r7, r7, r14 @ AES block 4k+4 - round 10 high + + fmov d4, r6 @ AES block 4k+4 - mov low + + fmov v4.d[1], r7 @ AES block 4k+4 - mov high + + eor q5, q4, q0 @ AES block 4k+4 - result + + bgt .L128_enc_blocks_more_than_3 + + sub r12, r12, #1 + movi v11.8b, #0 + mov q3, q2 + + cmp r5, #32 + mov q2, q1 + movi q9, #0 + + movi v10.8b, #0 + bgt .L128_enc_blocks_more_than_2 + + mov q3, q1 + cmp r5, #16 + + sub r12, r12, #1 + bgt .L128_enc_blocks_more_than_1 + + sub r12, r12, #1 + b .L128_enc_blocks_less_than_1 +.L128_enc_blocks_more_than_3:@ blocks left > 3 + st1 { q5}, [r2], #16 @ AES final-3 block - store result + + ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + rev64 q4, q5 @ GHASH final-3 block + + eor q4, q4, q8 @ feed in partial tag + eor r7, r7, r14 @ AES 
final-2 block - round 10 high + eor r6, r6, r13 @ AES final-2 block - round 10 low + + fmov d5, r6 @ AES final-2 block - mov low + + movi q8, #0 @ suppress further partial tag feed in + fmov v5.d[1], r7 @ AES final-2 block - mov high + + pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low + mov d22, v4.d[1] @ GHASH final-3 block - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high + + mov d10, v17.d[1] @ GHASH final-3 block - mid + + eor q5, q5, q1 @ AES final-2 block - result + eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid + + pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid +.L128_enc_blocks_more_than_2:@ blocks left > 2 + + st1 { q5}, [r2], #16 @ AES final-2 block - store result + + rev64 q4, q5 @ GHASH final-2 block + ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + eor q4, q4, q8 @ feed in partial tag + + eor r6, r6, r13 @ AES final-1 block - round 10 low + + fmov d5, r6 @ AES final-1 block - mov low + eor r7, r7, r14 @ AES final-1 block - round 10 high + + pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high + fmov v5.d[1], r7 @ AES final-1 block - mov high + + mov d22, v4.d[1] @ GHASH final-2 block - mid + + pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low + + eor q9, q9, v20.16b @ GHASH final-2 block - high + + eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid + + eor q5, q5, q2 @ AES final-1 block - result + + eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid + + movi q8, #0 @ suppress further partial tag feed in + + eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid +.L128_enc_blocks_more_than_1:@ blocks left > 1 + + st1 { q5}, [r2], #16 @ AES final-1 block - store result + + rev64 q4, q5 @ GHASH final-1 block + ldp r6, r7, [r0], #16 @ AES final block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + eor q4, q4, q8 @ feed in partial tag + + eor r7, r7, r14 @ AES final block - round 10 high + eor r6, r6, r13 @ AES final block - round 10 low + + fmov d5, r6 @ AES final block - mov low + + pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high + fmov v5.d[1], r7 @ AES final block - mov high + + mov d22, v4.d[1] @ GHASH final-1 block - mid + + pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low + + eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid + + eor q5, q5, q3 @ AES final block - result + + ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid + + pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low + + eor q9, q9, v20.16b @ GHASH final-1 block - high + + eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid + movi q8, #0 @ suppress further partial tag feed in +.L128_enc_blocks_less_than_1:@ blocks left <= 1 + + and r1, r1, #127 @ bit_length %= 128 + mvn r13, xzr @ rk10_l = 0xffffffffffffffff + + mvn r14, xzr @ rk10_h = 0xffffffffffffffff + sub r1, r1, #128 @ bit_length -= 128 + + neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) + + and r1, r1, #127 @ bit_length %= 128 + + lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block + cmp r1, #64 + + csel r6, r13, r14, lt + csel r7, r14, xzr, lt + + fmov d0, r6 @ ctr0b is mask for last block + + fmov v0.d[1], r7 + + and q5, q5, q0 @ possibly partial last block has zeroes in highest bits + + rev64 q4, q5 @ GHASH final block + + eor q4, q4, q8 @ feed in partial tag + + mov d8, v4.d[1] @ GHASH final block - mid + + pmull v21.1q, q4, v12.1d @ 
GHASH final block - low + ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored + + eor q8, q8, q4 @ GHASH final block - mid +#ifndef __ARMEB__ + rev r9, r12 +#else + mov r9, r12 +#endif + pmull2 v20.1q, q4, v12.2d @ GHASH final block - high + + pmull v8.1q, q8, v16.1d @ GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final block - low + + eor q9, q9, v20.16b @ GHASH final block - high + + eor v10.16b, v10.16b, q8 @ GHASH final block - mid + movi q8, #0xc2 + + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + shl d8, d8, #56 @ mod_constant + + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + + ext q9, q9, q9, #8 @ MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing + + eor v11.16b, v11.16b, q9 @ MODULO - fold into low + st1 { q5}, [r2] @ store all 16B + + str r9, [r16, #12] @ store the updated counter + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov r0, r15 + st1 { v11.16b }, [r3] + ldp r21, r22, [sp, #16] + ldp r23, r24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp r19, r20, [sp], #112 + bx lr + +.L128_enc_ret: + mov r0, #0x0 + bx lr +.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel +.globl aes_gcm_dec_128_kernel +.type aes_gcm_dec_128_kernel,%function +.align 4 +aes_gcm_dec_128_kernel: + AARCH64_VALID_CALL_TARGET + cbz r1, .L128_dec_ret + stp r19, r20, [sp, #-112]! 
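+ @ aes_gcm_dec_128_kernel arguments, as used by the loads below:
+ @   r0 - ciphertext input pointer, r1 - input length in bits,
+ @   r2 - plaintext output pointer, r3 - GHASH state (current tag at
+ @   offset 0, powers of H at offsets 32/64/80/112),
+ @   r4 - 16-byte counter block, r5 - AES-128 round-key schedule (rk0..rk10).
+ @ The processed byte length is returned in r0.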
+ mov r16, r4 + mov r8, r5 + stp r21, r22, [sp, #16] + stp r23, r24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr r5, r1, #3 @ byte_len + mov r15, r5 + ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #160] @ load rk10 +#ifdef __ARMEB__ + ror r14, r14, 32 + ror r13, r13, 32 +#endif + sub r5, r5, #1 @ byte_len - 1 + ld1 {v18.4s}, [r8], #16 @ load rk0 + + and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible + + ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + lsr r12, r11, #32 + fmov d2, r10 @ CTR block 2 + + ld1 {v19.4s}, [r8], #16 @ load rk1 + orr r11, r11, r11 + rev r12, r12 @ rev_ctr32 + + fmov d1, r10 @ CTR block 1 + add r12, r12, #1 @ increment rev_ctr32 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 0 - round 0 + rev r9, r12 @ CTR block 1 + + orr r9, r11, r9, lsl #32 @ CTR block 1 + ld1 {v20.4s}, [r8], #16 @ load rk2 + add r12, r12, #1 @ CTR block 1 + + fmov v1.d[1], r9 @ CTR block 1 + rev r9, r12 @ CTR block 2 + add r12, r12, #1 @ CTR block 2 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 0 - round 1 + orr r9, r11, r9, lsl #32 @ CTR block 2 + + fmov v2.d[1], r9 @ CTR block 2 + rev r9, r12 @ CTR block 3 + + fmov d3, r10 @ CTR block 3 + orr r9, r11, r9, lsl #32 @ CTR block 3 + add r12, r12, #1 @ CTR block 3 + + fmov v3.d[1], r9 @ CTR block 3 + add r4, r0, r1, lsr #3 @ end_input_ptr + + aese q1, v18.16b + aesmc q1, q1 @ AES block 1 - round 0 + ld1 {v21.4s}, [r8], #16 @ load rk3 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 0 - round 2 + ld1 {v22.4s}, [r8], #16 @ load rk4 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 2 - round 0 + ld1 {v23.4s}, [r8], #16 @ load rk5 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 1 - round 1 + ld1 {v24.4s}, [r8], #16 @ load rk6 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 3 - round 0 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 2 - round 1 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 1 - round 2 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 3 - round 1 + ld1 { v11.16b}, [r3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese q0, v21.16b + aesmc q0, q0 @ AES block 0 - round 3 + ld1 {v25.4s}, [r8], #16 @ load rk7 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 1 - round 3 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 3 - round 2 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 2 - round 2 + ld1 {v26.4s}, [r8], #16 @ load rk8 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 1 - round 4 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 3 - round 3 + + aese q2, v21.16b + aesmc q2, q2 @ AES block 2 - round 3 + ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese q0, v22.16b + aesmc q0, q0 @ AES block 0 - round 4 + ld1 {v27.4s}, [r8], #16 @ load rk9 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 1 - round 5 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 2 - round 4 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 3 - round 4 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 0 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 2 - round 5 + ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese q3, v23.16b + aesmc q3, q3 @ AES block 3 - round 5 + 
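+ @ Counter blocks 0-3 are pushed through the AES rounds while the remaining
+ @ round keys and the powers of H are loaded; the main loop below then handles
+ @ four blocks per iteration, interleaving the AES rounds with the GHASH
+ @ multiply-accumulate (Karatsuba-style 64x64 PMULLs, see the "karatsuba tidy
+ @ up" steps) and the reduction by the GCM polynomial via the 0xc2 constant
+ @ shifted into the top byte (the "mod_constant").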
+ aese q0, v24.16b + aesmc q0, q0 @ AES block 0 - round 6 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 1 - round 6 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 3 - round 6 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 2 - round 6 + trn1 q8, v12.2d, v13.2d @ h2h | h1h + + ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l + add r5, r5, r0 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 1 - round 7 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 2 - round 7 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 0 - round 7 + eor v16.16b, v16.16b, q8 @ h2k | h1k + + aese q3, v25.16b + aesmc q3, q3 @ AES block 3 - round 7 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 1 - round 8 + trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l + + aese q2, v26.16b + aesmc q2, q2 @ AES block 2 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 3 - round 8 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 0 - round 8 + trn1 q9, v14.2d, v15.2d @ h4h | h3h + + aese q2, v27.16b @ AES block 2 - round 9 + + aese q3, v27.16b @ AES block 3 - round 9 + + aese q0, v27.16b @ AES block 0 - round 9 + cmp r0, r5 @ check if we have <= 4 blocks + + aese q1, v27.16b @ AES block 1 - round 9 + eor v17.16b, v17.16b, q9 @ h4k | h3k + bge .L128_dec_tail @ handle tail + + ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext + + eor q1, q5, q1 @ AES block 1 - result + ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext + + eor q0, q4, q0 @ AES block 0 - result + rev64 q4, q4 @ GHASH block 0 + rev r9, r12 @ CTR block 4 + + orr r9, r11, r9, lsl #32 @ CTR block 4 + add r12, r12, #1 @ CTR block 4 + ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext + + rev64 q5, q5 @ GHASH block 1 + mov r19, v1.d[0] @ AES block 1 - mov low + + mov r20, v1.d[1] @ AES block 1 - mov high + + mov r6, v0.d[0] @ AES block 0 - mov low + cmp r0, r5 @ check if we have <= 8 blocks + + mov r7, v0.d[1] @ AES block 0 - mov high + + fmov d0, r10 @ CTR block 4 + + fmov v0.d[1], r9 @ CTR block 4 + rev r9, r12 @ CTR block 5 + eor r19, r19, r13 @ AES block 1 - round 10 low +#ifdef __ARMEB__ + rev r19, r19 +#endif + fmov d1, r10 @ CTR block 5 + add r12, r12, #1 @ CTR block 5 + orr r9, r11, r9, lsl #32 @ CTR block 5 + + fmov v1.d[1], r9 @ CTR block 5 + rev r9, r12 @ CTR block 6 + add r12, r12, #1 @ CTR block 6 + + orr r9, r11, r9, lsl #32 @ CTR block 6 + + eor r20, r20, r14 @ AES block 1 - round 10 high +#ifdef __ARMEB__ + rev r20, r20 +#endif + eor r6, r6, r13 @ AES block 0 - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor q2, q6, q2 @ AES block 2 - result + + eor r7, r7, r14 @ AES block 0 - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + stp r6, r7, [r2], #16 @ AES block 0 - store result + + stp r19, r20, [r2], #16 @ AES block 1 - store result + bge .L128_dec_prepretail @ do prepretail + +.L128_dec_main_loop:@ main loop start + eor q3, q7, q3 @ AES block 4k+3 - result + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + mov r21, v2.d[0] @ AES block 4k+2 - mov low + + pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high + mov r22, v2.d[1] @ AES block 4k+2 - mov high + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + fmov d2, r10 @ CTR block 4k+6 + + rev64 q6, q6 @ GHASH block 4k+2 + fmov v2.d[1], r9 @ CTR block 4k+6 + rev r9, r12 @ CTR block 4k+7 + + mov r23, v3.d[0] @ AES block 4k+3 - mov low + eor q4, q4, v11.16b @ PRE 1 + mov d30, v5.d[1] @ GHASH block 4k+1 - mid + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 
1 + rev64 q7, q7 @ GHASH block 4k+3 + + pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low + mov r24, v3.d[1] @ AES block 4k+3 - mov high + orr r9, r11, r9, lsl #32 @ CTR block 4k+7 + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + fmov d3, r10 @ CTR block 4k+7 + eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + fmov v3.d[1], r9 @ CTR block 4k+7 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + mov d10, v17.d[1] @ GHASH block 4k - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low + + pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + mov d8, v4.d[1] @ GHASH block 4k - mid + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor q9, q9, v28.16b @ GHASH block 4k+1 - high + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + + pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low + eor q8, q8, q4 @ GHASH block 4k - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + eor r23, r23, r13 @ AES block 4k+3 - round 10 low +#ifdef __ARMEB__ + rev r23, r23 +#endif + pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid + eor r22, r22, r14 @ AES block 4k+2 - round 10 high +#ifdef __ARMEB__ + rev r22, r22 +#endif + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid + + pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + eor q9, q9, q8 @ GHASH block 4k+2 - high + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + eor r24, r24, r14 @ AES block 4k+3 - round 10 high +#ifdef __ARMEB__ + rev r24, r24 +#endif + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + eor r21, r21, r13 @ AES block 4k+2 - round 10 low +#ifdef __ARMEB__ + rev r21, r21 +#endif + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + movi q8, #0xc2 + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + stp r21, r22, [r2], #16 @ AES block 4k+2 - store result + + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + eor q9, q9, q4 @ GHASH block 4k+3 - high + ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + add r12, r12, #1 @ CTR block 4k+7 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + 
shl d8, d8, #56 @ mod_constant + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + stp r23, r24, [r2], #16 @ AES block 4k+3 - store result + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + rev r9, r12 @ CTR block 4k+8 + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q0, v27.16b @ AES block 4k+4 - round 9 + orr r9, r11, r9, lsl #32 @ CTR block 4k+8 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + aese q1, v27.16b @ AES block 4k+5 - round 9 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + eor q0, q4, q0 @ AES block 4k+4 - result + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext + + add r12, r12, #1 @ CTR block 4k+8 + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + eor q1, q5, q1 @ AES block 4k+5 - result + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + ld1 {q7}, [r0], #16 @ AES block 4k+6 - load ciphertext + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + + rev64 q5, q5 @ GHASH block 4k+5 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + mov r7, v0.d[1] @ AES block 4k+4 - mov high + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + mov r6, v0.d[0] @ AES block 4k+4 - mov low + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + fmov d0, r10 @ CTR block 4k+8 + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + fmov v0.d[1], r9 @ CTR block 4k+8 + rev r9, r12 @ CTR block 4k+9 + + aese q2, v27.16b @ AES block 4k+6 - round 9 + orr r9, r11, r9, lsl #32 @ CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + eor r7, r7, r14 @ AES block 4k+4 - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + mov r20, v1.d[1] @ AES block 4k+5 - mov high + eor r6, r6, r13 @ AES block 4k+4 - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor q2, q6, q2 @ AES block 4k+6 - result + mov r19, v1.d[0] @ AES block 4k+5 - mov low + add r12, r12, #1 @ CTR block 4k+9 + + aese q3, v27.16b @ AES block 4k+7 - round 9 + fmov d1, r10 @ CTR block 4k+9 + cmp r0, r5 @ .LOOP CONTROL + + rev64 q4, q4 @ GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + fmov v1.d[1], r9 @ CTR block 4k+9 + + rev r9, r12 @ CTR block 4k+10 + add r12, r12, #1 @ CTR block 4k+10 + + eor r20, r20, r14 @ AES block 4k+5 - round 10 high +#ifdef __ARMEB__ + rev r20, r20 +#endif + stp r6, r7, [r2], #16 @ AES block 4k+4 - store result + + eor r19, r19, r13 @ AES block 4k+5 - round 10 low +#ifdef __ARMEB__ + rev r19, r19 +#endif + stp r19, r20, [r2], #16 @ AES block 4k+5 - store result + + orr r9, r11, r9, lsl #32 @ CTR block 4k+10 + blt .L128_dec_main_loop + +.L128_dec_prepretail:@ PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + mov r21, v2.d[0] @ AES block 4k+2 - mov low + mov d30, v5.d[1] @ GHASH block 4k+1 - mid + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + eor q3, q7, q3 @ AES block 4k+3 - result + + aese q1, v18.16b + aesmc q1, q1 
@ AES block 4k+5 - round 0 + mov r22, v2.d[1] @ AES block 4k+2 - mov high + + eor q4, q4, v11.16b @ PRE 1 + fmov d2, r10 @ CTR block 4k+6 + rev64 q6, q6 @ GHASH block 4k+2 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + fmov v2.d[1], r9 @ CTR block 4k+6 + + rev r9, r12 @ CTR block 4k+7 + mov r23, v3.d[0] @ AES block 4k+3 - mov low + eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d10, v17.d[1] @ GHASH block 4k - mid + mov r24, v3.d[1] @ AES block 4k+3 - mov high + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + orr r9, r11, r9, lsl #32 @ CTR block 4k+7 + + pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low + mov d8, v4.d[1] @ GHASH block 4k - mid + fmov d3, r10 @ CTR block 4k+7 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + fmov v3.d[1], r9 @ CTR block 4k+7 + + pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + rev64 q7, q7 @ GHASH block 4k+3 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + eor q8, q8, q4 @ GHASH block 4k - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low + + pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + eor q9, q9, v28.16b @ GHASH block 4k+1 - high + + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid + + pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high + + pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low + + eor q9, q9, q8 @ GHASH block 4k+2 - high + movi q8, #0xc2 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + eor q9, q9, q4 @ GHASH block 4k+3 - high + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + eor r23, r23, r13 @ AES block 4k+3 - round 10 low +#ifdef __ARMEB__ + rev r23, r23 +#endif + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + eor r21, r21, r13 @ AES block 4k+2 - round 10 low +#ifdef __ARMEB__ + rev r21, r21 +#endif + eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + shl d8, d8, #56 @ mod_constant + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + aese q0, v22.16b + aesmc 
q0, q0 @ AES block 4k+4 - round 4 + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + aese q1, v27.16b @ AES block 4k+5 - round 9 + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + eor r24, r24, r14 @ AES block 4k+3 - round 10 high +#ifdef __ARMEB__ + rev r24, r24 +#endif + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + eor r22, r22, r14 @ AES block 4k+2 - round 10 high +#ifdef __ARMEB__ + rev r22, r22 +#endif + aese q0, v27.16b @ AES block 4k+4 - round 9 + stp r21, r22, [r2], #16 @ AES block 4k+2 - store result + + aese q2, v27.16b @ AES block 4k+6 - round 9 + add r12, r12, #1 @ CTR block 4k+7 + stp r23, r24, [r2], #16 @ AES block 4k+3 - store result + + aese q3, v27.16b @ AES block 4k+7 - round 9 + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low +.L128_dec_tail:@ TAIL + + sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process + ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext + + eor q0, q5, q0 @ AES block 4k+4 - result + + mov r7, v0.d[1] @ AES block 4k+4 - mov high + + mov r6, v0.d[0] @ AES block 4k+4 - mov low + + cmp r5, #48 + + eor r7, r7, r14 @ AES block 4k+4 - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag + eor r6, r6, r13 @ AES block 4k+4 - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + bgt .L128_dec_blocks_more_than_3 + + mov q3, q2 + sub r12, r12, #1 + movi v11.8b, #0 + + movi q9, #0 + mov q2, q1 + + movi v10.8b, #0 + cmp r5, #32 + bgt .L128_dec_blocks_more_than_2 + + cmp r5, #16 + + mov q3, q1 + sub r12, r12, #1 + bgt .L128_dec_blocks_more_than_1 + + sub r12, r12, #1 + b .L128_dec_blocks_less_than_1 +.L128_dec_blocks_more_than_3:@ blocks left > 3 + rev64 q4, q5 @ GHASH final-3 block + ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext + + eor q4, q4, q8 @ feed in partial tag + + mov d10, v17.d[1] @ GHASH final-3 block - mid + stp r6, r7, [r2], #16 @ AES final-3 block - store result + eor q0, q5, q1 @ AES final-2 block - result + + mov d22, v4.d[1] @ GHASH final-3 block - mid + mov r7, v0.d[1] @ AES final-2 block - mov high + + pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low + mov r6, v0.d[0] @ AES final-2 block - mov low + + pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high + + eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid + + movi q8, #0 @ suppress further partial tag feed in + 
eor r7, r7, r14 @ AES final-2 block - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid + eor r6, r6, r13 @ AES final-2 block - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif +.L128_dec_blocks_more_than_2:@ blocks left > 2 + + rev64 q4, q5 @ GHASH final-2 block + ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext + + eor q4, q4, q8 @ feed in partial tag + + eor q0, q5, q2 @ AES final-1 block - result + stp r6, r7, [r2], #16 @ AES final-2 block - store result + + mov d22, v4.d[1] @ GHASH final-2 block - mid + + pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low + + pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high + mov r6, v0.d[0] @ AES final-1 block - mov low + + mov r7, v0.d[1] @ AES final-1 block - mov high + eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid + + movi q8, #0 @ suppress further partial tag feed in + + pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid + + eor r6, r6, r13 @ AES final-1 block - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low + + eor q9, q9, v20.16b @ GHASH final-2 block - high + + eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid + eor r7, r7, r14 @ AES final-1 block - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif +.L128_dec_blocks_more_than_1:@ blocks left > 1 + + rev64 q4, q5 @ GHASH final-1 block + + ld1 { q5}, [r0], #16 @ AES final block - load ciphertext + eor q4, q4, q8 @ feed in partial tag + + mov d22, v4.d[1] @ GHASH final-1 block - mid + + eor q0, q5, q3 @ AES final block - result + + eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid + + stp r6, r7, [r2], #16 @ AES final-1 block - store result + mov r6, v0.d[0] @ AES final block - mov low + + mov r7, v0.d[1] @ AES final block - mov high + ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid + + pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low + + pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high + + pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid + movi q8, #0 @ suppress further partial tag feed in + + eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low + + eor q9, q9, v20.16b @ GHASH final-1 block - high + eor r7, r7, r14 @ AES final block - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + eor r6, r6, r13 @ AES final block - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid +.L128_dec_blocks_less_than_1:@ blocks left <= 1 + + mvn r14, xzr @ rk10_h = 0xffffffffffffffff + and r1, r1, #127 @ bit_length %= 128 + + mvn r13, xzr @ rk10_l = 0xffffffffffffffff + sub r1, r1, #128 @ bit_length -= 128 + + neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) + + and r1, r1, #127 @ bit_length %= 128 + + lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block + cmp r1, #64 + + csel r10, r14, xzr, lt + csel r9, r13, r14, lt + + fmov d0, r9 @ ctr0b is mask for last block + + mov v0.d[1], r10 + + and q5, q5, q0 @ possibly partial last block has zeroes in highest bits + + rev64 q4, q5 @ GHASH final block + + eor q4, q4, q8 @ feed in partial tag + + ldp r4, r5, [r2] @ load existing bytes we need to not overwrite + + and r7, r7, r10 + + pmull2 v20.1q, q4, v12.2d @ GHASH final block - high + mov d8, v4.d[1] @ GHASH final block - mid + + eor q8, q8, q4 @ GHASH final block - mid + eor q9, q9, v20.16b @ GHASH final block - high + + pmull v8.1q, q8, v16.1d @ GHASH final block - mid + + pmull v21.1q, q4, v12.1d @ GHASH final block - low + bic r4, r4, 
r9 @ mask out low existing bytes + and r6, r6, r9 + +#ifndef __ARMEB__ + rev r9, r12 +#else + mov r9, r12 +#endif + + eor v10.16b, v10.16b, q8 @ GHASH final block - mid + movi q8, #0xc2 + + eor v11.16b, v11.16b, v21.16b @ GHASH final block - low + + bic r5, r5, r10 @ mask out high existing bytes + shl d8, d8, #56 @ mod_constant + + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + orr r6, r6, r4 + str r9, [r16, #12] @ store the updated counter + + orr r7, r7, r5 + stp r6, r7, [r2] + ext q9, q9, q9, #8 @ MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov r0, r15 + st1 { v11.16b }, [r3] + + ldp r21, r22, [sp, #16] + ldp r23, r24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp r19, r20, [sp], #112 + bx lr + +.L128_dec_ret: + mov r0, #0x0 + bx lr +.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel +.globl aes_gcm_enc_192_kernel +.type aes_gcm_enc_192_kernel,%function +.align 4 +aes_gcm_enc_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz r1, .L192_enc_ret + stp r19, r20, [sp, #-112]! + mov r16, r4 + mov r8, r5 + stp r21, r22, [sp, #16] + stp r23, r24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #192] @ load rk12 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif + ld1 {v18.4s}, [r8], #16 @ load rk0 + + ld1 {v19.4s}, [r8], #16 @ load rk1 + + ld1 {v20.4s}, [r8], #16 @ load rk2 + + lsr r12, r11, #32 + ld1 {v21.4s}, [r8], #16 @ load rk3 + orr r11, r11, r11 + + ld1 {v22.4s}, [r8], #16 @ load rk4 + rev r12, r12 @ rev_ctr32 + + add r12, r12, #1 @ increment rev_ctr32 + fmov d3, r10 @ CTR block 3 + + rev r9, r12 @ CTR block 1 + add r12, r12, #1 @ CTR block 1 + fmov d1, r10 @ CTR block 1 + + orr r9, r11, r9, lsl #32 @ CTR block 1 + ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], r9 @ CTR block 1 + rev r9, r12 @ CTR block 2 + add r12, r12, #1 @ CTR block 2 + + fmov d2, r10 @ CTR block 2 + orr r9, r11, r9, lsl #32 @ CTR block 2 + + fmov v2.d[1], r9 @ CTR block 2 + rev r9, r12 @ CTR block 3 + + orr r9, r11, r9, lsl #32 @ CTR block 3 + ld1 {v23.4s}, [r8], #16 @ load rk5 + + fmov v3.d[1], r9 @ CTR block 3 + + ld1 {v24.4s}, [r8], #16 @ load rk6 + + ld1 {v25.4s}, [r8], #16 @ load rk7 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 0 - round 0 + ld1 { v11.16b}, [r3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese q3, v18.16b + aesmc q3, q3 @ AES block 3 - round 0 + ld1 {v26.4s}, [r8], #16 @ load rk8 + + aese q1, v18.16b + aesmc q1, q1 @ AES block 1 - round 0 + ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese q2, v18.16b + aesmc q2, q2 @ AES block 2 - round 0 + ld1 {v27.4s}, [r8], #16 @ load rk9 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 0 - round 1 + 
ld1 {v28.4s}, [r8], #16 @ load rk10 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 1 - round 1 + ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese q2, v19.16b + aesmc q2, q2 @ AES block 2 - round 1 + ld1 {v29.4s}, [r8], #16 @ load rk11 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 3 - round 1 + ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese q0, v20.16b + aesmc q0, q0 @ AES block 0 - round 2 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 2 - round 2 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 3 - round 2 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 0 - round 3 + trn1 q9, v14.2d, v15.2d @ h4h | h3h + + aese q2, v21.16b + aesmc q2, q2 @ AES block 2 - round 3 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 1 - round 2 + trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l + + aese q0, v22.16b + aesmc q0, q0 @ AES block 0 - round 4 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 3 - round 3 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 1 - round 3 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 0 - round 5 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 2 - round 4 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 1 - round 4 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 0 - round 6 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 3 - round 4 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 2 - round 5 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 1 - round 5 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 3 - round 5 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 2 - round 6 + ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese q1, v24.16b + aesmc q1, q1 @ AES block 1 - round 6 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 3 - round 6 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 0 - round 7 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 1 - round 7 + trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l + + aese q3, v25.16b + aesmc q3, q3 @ AES block 3 - round 7 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 0 - round 8 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 2 - round 7 + trn1 q8, v12.2d, v13.2d @ h2h | h1h + + aese q1, v26.16b + aesmc q1, q1 @ AES block 1 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 3 - round 8 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 2 - round 8 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 0 - round 9 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 3 - round 9 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 2 - round 9 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 1 - round 9 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 0 - round 10 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 2 - round 10 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 1 - round 10 + lsr r5, r1, #3 @ byte_len + mov r15, r5 + + aese q3, v28.16b + aesmc q3, q3 @ AES block 3 - round 10 + sub r5, r5, #1 @ byte_len - 1 + + eor v16.16b, v16.16b, q8 @ h2k | h1k + and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + eor v17.16b, v17.16b, q9 @ h4k | h3k + + aese q2, v29.16b @ AES block 2 - round 11 + add r4, r0, r1, lsr #3 @ end_input_ptr + add r5, r5, r0 + + aese q1, v29.16b @ AES block 1 - round 11 + cmp r0, r5 @ check if we have <= 4 blocks + + aese q0, v29.16b @ AES block 0 - round 11 + add r12, r12, #1 @ CTR block 3 + + aese q3, v29.16b @ AES block 3 - round 11 + bge .L192_enc_tail @ handle tail + + rev r9, r12 @ CTR block 4 + ldp r6, r7, 
[r0, #0] @ AES block 0 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + orr r9, r11, r9, lsl #32 @ CTR block 4 + ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif + ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif + ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif + add r0, r0, #64 @ AES input_ptr update + cmp r0, r5 @ check if we have <= 8 blocks + + eor r6, r6, r13 @ AES block 0 - round 12 low + + eor r7, r7, r14 @ AES block 0 - round 12 high + eor r22, r22, r14 @ AES block 2 - round 12 high + fmov d4, r6 @ AES block 0 - mov low + + eor r24, r24, r14 @ AES block 3 - round 12 high + fmov v4.d[1], r7 @ AES block 0 - mov high + + eor r21, r21, r13 @ AES block 2 - round 12 low + eor r19, r19, r13 @ AES block 1 - round 12 low + + fmov d5, r19 @ AES block 1 - mov low + eor r20, r20, r14 @ AES block 1 - round 12 high + + fmov v5.d[1], r20 @ AES block 1 - mov high + + eor r23, r23, r13 @ AES block 3 - round 12 low + fmov d6, r21 @ AES block 2 - mov low + + add r12, r12, #1 @ CTR block 4 + eor q4, q4, q0 @ AES block 0 - result + fmov d0, r10 @ CTR block 4 + + fmov v0.d[1], r9 @ CTR block 4 + rev r9, r12 @ CTR block 5 + + orr r9, r11, r9, lsl #32 @ CTR block 5 + add r12, r12, #1 @ CTR block 5 + + fmov d7, r23 @ AES block 3 - mov low + st1 { q4}, [r2], #16 @ AES block 0 - store result + + fmov v6.d[1], r22 @ AES block 2 - mov high + + eor q5, q5, q1 @ AES block 1 - result + fmov d1, r10 @ CTR block 5 + st1 { q5}, [r2], #16 @ AES block 1 - store result + + fmov v7.d[1], r24 @ AES block 3 - mov high + + fmov v1.d[1], r9 @ CTR block 5 + rev r9, r12 @ CTR block 6 + + orr r9, r11, r9, lsl #32 @ CTR block 6 + + add r12, r12, #1 @ CTR block 6 + eor q6, q6, q2 @ AES block 2 - result + fmov d2, r10 @ CTR block 6 + + fmov v2.d[1], r9 @ CTR block 6 + rev r9, r12 @ CTR block 7 + + orr r9, r11, r9, lsl #32 @ CTR block 7 + st1 { q6}, [r2], #16 @ AES block 2 - store result + + eor q7, q7, q3 @ AES block 3 - result + st1 { q7}, [r2], #16 @ AES block 3 - store result + bge .L192_enc_prepretail @ do prepretail + +.L192_enc_main_loop:@ main loop start + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + fmov d3, r10 @ CTR block 4k+3 + rev64 q4, q4 @ GHASH block 4k (only t0 is free) + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + fmov v3.d[1], r9 @ CTR block 4k+3 + + pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high + rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif + pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low + eor q4, q4, v11.16b @ PRE 1 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor r24, r24, r14 @ AES block 4k+3 
- round 12 high + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d8, v4.d[1] @ GHASH block 4k - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + eor r21, r21, r13 @ AES block 4k+6 - round 12 low + + eor q8, q8, q4 @ GHASH block 4k - mid + eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + eor r19, r19, r13 @ AES block 4k+5 - round 12 low + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + + mov d10, v17.d[1] @ GHASH block 4k - mid + eor q9, q9, v30.16b @ GHASH block 4k+1 - high + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + eor q4, q4, q5 @ GHASH block 4k+1 - mid + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + eor r20, r20, r14 @ AES block 4k+5 - round 12 high + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + add r12, r12, #1 @ CTR block 4k+3 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + eor q9, q9, v30.16b @ GHASH block 4k+2 - high + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + eor r22, r22, r14 @ AES block 4k+6 - round 12 high + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + eor r23, r23, r13 @ AES block 4k+3 - round 12 low + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + rev r9, r12 @ CTR block 4k+8 + + pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low + orr r9, r11, r9, lsl #32 @ CTR block 4k+8 + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + add r0, r0, #64 @ AES input_ptr update + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + movi q8, #0xc2 + + pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low + eor r7, r7, r14 @ AES block 4k+4 - round 12 high + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + eor r6, r6, r13 @ AES block 4k+4 - round 12 low + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + shl d8, d8, #56 @ mod_constant + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + eor q9, q9, q5 @ GHASH block 4k+3 - high + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + fmov d5, r19 @ AES block 4k+5 - mov low + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + fmov v5.d[1], r20 @ AES block 4k+5 - mov high + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + eor v11.16b, v11.16b, q6 @ 
GHASH block 4k+3 - low + + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + cmp r0, r5 @ .LOOP CONTROL + fmov d4, r6 @ AES block 4k+4 - mov low + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + fmov v4.d[1], r7 @ AES block 4k+4 - mov high + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + fmov d7, r23 @ AES block 4k+3 - mov low + + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + add r12, r12, #1 @ CTR block 4k+8 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + fmov v7.d[1], r24 @ AES block 4k+3 - mov high + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + ext q9, q9, q9, #8 @ MODULO - other top alignment + fmov d6, r21 @ AES block 4k+6 - mov low + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + + aese q0, v29.16b @ AES block 4k+4 - round 11 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + + eor q4, q4, q0 @ AES block 4k+4 - result + fmov d0, r10 @ CTR block 4k+8 + + aese q1, v29.16b @ AES block 4k+5 - round 11 + fmov v0.d[1], r9 @ CTR block 4k+8 + rev r9, r12 @ CTR block 4k+9 + + pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low + fmov v6.d[1], r22 @ AES block 4k+6 - mov high + st1 { q4}, [r2], #16 @ AES block 4k+4 - store result + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + orr r9, r11, r9, lsl #32 @ CTR block 4k+9 + + eor q5, q5, q1 @ AES block 4k+5 - result + add r12, r12, #1 @ CTR block 4k+9 + fmov d1, r10 @ CTR block 4k+9 + + aese q2, v29.16b @ AES block 4k+6 - round 11 + fmov v1.d[1], r9 @ CTR block 4k+9 + rev r9, r12 @ CTR block 4k+10 + + add r12, r12, #1 @ CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + orr r9, r11, r9, lsl #32 @ CTR block 4k+10 + + st1 { q5}, [r2], #16 @ AES block 4k+5 - store result + eor v11.16b, v11.16b, q9 @ MODULO - fold into low + + aese q3, v29.16b @ AES block 4k+7 - round 11 + eor q6, q6, q2 @ AES block 4k+6 - result + fmov d2, r10 @ CTR block 4k+10 + + st1 { q6}, [r2], #16 @ AES block 4k+6 - store result + fmov v2.d[1], r9 @ CTR block 4k+10 + rev r9, r12 @ CTR block 4k+11 + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + orr r9, r11, r9, lsl #32 @ CTR block 4k+11 + + eor q7, q7, q3 @ AES block 4k+3 - result + st1 { q7}, [r2], #16 @ AES block 4k+3 - store result + blt .L192_enc_main_loop + +.L192_enc_prepretail:@ PREPRETAIL + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + rev64 q4, q4 @ GHASH block 4k (only t0 is free) + + fmov d3, r10 @ CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + add r12, r12, #1 @ CTR block 4k+3 + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + + fmov v3.d[1], r9 @ CTR block 4k+3 + eor 
q4, q4, v11.16b @ PRE 1 + mov d10, v17.d[1] @ GHASH block 4k - mid + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) + + pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d8, v4.d[1] @ GHASH block 4k - mid + + pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low + rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + + eor q8, q8, q4 @ GHASH block 4k - mid + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor q9, q9, v30.16b @ GHASH block 4k+1 - high + + pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high + + eor q4, q4, q5 @ GHASH block 4k+1 - mid + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + eor q9, q9, v30.16b @ GHASH block 4k+2 - high + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + eor q9, q9, q5 @ GHASH block 4k+3 - high + + pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low + movi q8, #0xc2 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + eor v10.16b, v10.16b, q9 @ karatsuba tidy up + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + shl d8, d8, #56 @ mod_constant + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v11.16b + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + pmull 
v30.1q, q9, q8 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + ext q9, q9, q9, #8 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v30.16b + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + eor v10.16b, v10.16b, q9 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + + pmull v30.1q, v10.1d, q8 + + ext v10.16b, v10.16b, v10.16b, #8 + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + eor v11.16b, v11.16b, v30.16b + + aese q0, v29.16b @ AES block 4k+4 - round 11 + + aese q3, v29.16b @ AES block 4k+7 - round 11 + + aese q2, v29.16b @ AES block 4k+6 - round 11 + + aese q1, v29.16b @ AES block 4k+5 - round 11 + eor v11.16b, v11.16b, v10.16b +.L192_enc_tail:@ TAIL + + sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process + ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + eor r6, r6, r13 @ AES block 4k+4 - round 12 low + eor r7, r7, r14 @ AES block 4k+4 - round 12 high + + fmov d4, r6 @ AES block 4k+4 - mov low + + fmov v4.d[1], r7 @ AES block 4k+4 - mov high + cmp r5, #48 + + eor q5, q4, q0 @ AES block 4k+4 - result + + ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag + bgt .L192_enc_blocks_more_than_3 + + sub r12, r12, #1 + movi v10.8b, #0 + + mov q3, q2 + movi q9, #0 + cmp r5, #32 + + mov q2, q1 + movi v11.8b, #0 + bgt .L192_enc_blocks_more_than_2 + + sub r12, r12, #1 + + mov q3, q1 + cmp r5, #16 + bgt .L192_enc_blocks_more_than_1 + + sub r12, r12, #1 + b .L192_enc_blocks_less_than_1 +.L192_enc_blocks_more_than_3:@ blocks left > 3 + st1 { q5}, [r2], #16 @ AES final-3 block - store result + + ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + rev64 q4, q5 @ GHASH final-3 block + + eor r6, r6, r13 @ AES final-2 block - round 12 low + eor q4, q4, q8 @ feed in partial tag + + eor r7, r7, r14 @ AES final-2 block - round 12 high + fmov d5, r6 @ AES final-2 block - mov low + + fmov v5.d[1], r7 @ AES final-2 block - mov high + + mov d22, v4.d[1] @ GHASH final-3 block - mid + + pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low + + mov d10, v17.d[1] @ GHASH final-3 block - mid + + eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid + + movi q8, #0 @ suppress further partial tag feed in + + pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid + eor q5, q5, q1 @ AES final-2 block - result +.L192_enc_blocks_more_than_2:@ blocks left > 2 + + st1 { q5}, [r2], #16 @ AES final-2 block - store result + + rev64 q4, q5 @ GHASH final-2 block + ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + eor q4, q4, q8 @ feed in partial tag + + eor r7, r7, r14 @ AES final-1 block - round 12 high + + pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high + mov d22, v4.d[1] @ GHASH final-2 
block - mid + + pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low + eor r6, r6, r13 @ AES final-1 block - round 12 low + + fmov d5, r6 @ AES final-1 block - mov low + + fmov v5.d[1], r7 @ AES final-1 block - mov high + eor q9, q9, v20.16b @ GHASH final-2 block - high + eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid + + movi q8, #0 @ suppress further partial tag feed in + + eor q5, q5, q2 @ AES final-1 block - result + + eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid +.L192_enc_blocks_more_than_1:@ blocks left > 1 + + st1 { q5}, [r2], #16 @ AES final-1 block - store result + + ldp r6, r7, [r0], #16 @ AES final block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + rev64 q4, q5 @ GHASH final-1 block + + eor r6, r6, r13 @ AES final block - round 12 low + eor q4, q4, q8 @ feed in partial tag + movi q8, #0 @ suppress further partial tag feed in + + mov d22, v4.d[1] @ GHASH final-1 block - mid + + eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid + eor r7, r7, r14 @ AES final block - round 12 high + fmov d5, r6 @ AES final block - mov low + + pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high + fmov v5.d[1], r7 @ AES final block - mov high + + ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid + + eor q9, q9, v20.16b @ GHASH final-1 block - high + + pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low + + pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid + + eor q5, q5, q3 @ AES final block - result + + eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low + + eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid +.L192_enc_blocks_less_than_1:@ blocks left <= 1 + + ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored +#ifndef __ARMEB__ + rev r9, r12 +#else + mov r9, r12 +#endif + and r1, r1, #127 @ bit_length %= 128 + + sub r1, r1, #128 @ bit_length -= 128 + mvn r14, xzr @ rk12_h = 0xffffffffffffffff + + neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) + mvn r13, xzr @ rk12_l = 0xffffffffffffffff + + and r1, r1, #127 @ bit_length %= 128 + + lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block + cmp r1, #64 + + csel r6, r13, r14, lt + csel r7, r14, xzr, lt + + fmov d0, r6 @ ctr0b is mask for last block + + fmov v0.d[1], r7 + + and q5, q5, q0 @ possibly partial last block has zeroes in highest bits + + rev64 q4, q5 @ GHASH final block + + eor q4, q4, q8 @ feed in partial tag + + mov d8, v4.d[1] @ GHASH final block - mid + + pmull v21.1q, q4, v12.1d @ GHASH final block - low + + pmull2 v20.1q, q4, v12.2d @ GHASH final block - high + + eor q8, q8, q4 @ GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final block - low + + eor q9, q9, v20.16b @ GHASH final block - high + + pmull v8.1q, q8, v16.1d @ GHASH final block - mid + + eor v10.16b, v10.16b, q8 @ GHASH final block - mid + movi q8, #0xc2 + + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + shl d8, d8, #56 @ mod_constant + + bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing + + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + + ext q9, q9, q9, #8 @ MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low + + ext v10.16b, 
v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + eor v11.16b, v11.16b, q9 @ MODULO - fold into low + str r9, [r16, #12] @ store the updated counter + + st1 { q5}, [r2] @ store all 16B + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov r0, r15 + st1 { v11.16b }, [r3] + + ldp r21, r22, [sp, #16] + ldp r23, r24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp r19, r20, [sp], #112 + bx lr + +.L192_enc_ret: + mov r0, #0x0 + bx lr +.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel +.globl aes_gcm_dec_192_kernel +.type aes_gcm_dec_192_kernel,%function +.align 4 +aes_gcm_dec_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz r1, .L192_dec_ret + stp r19, r20, [sp, #-112]! + mov r16, r4 + mov r8, r5 + stp r21, r22, [sp, #16] + stp r23, r24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add r4, r0, r1, lsr #3 @ end_input_ptr + ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #192] @ load rk12 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif + ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible + + ld1 {v18.4s}, [r8], #16 @ load rk0 + + lsr r5, r1, #3 @ byte_len + mov r15, r5 + ld1 {v19.4s}, [r8], #16 @ load rk1 + + lsr r12, r11, #32 + orr r11, r11, r11 + fmov d3, r10 @ CTR block 3 + + rev r12, r12 @ rev_ctr32 + fmov d1, r10 @ CTR block 1 + + add r12, r12, #1 @ increment rev_ctr32 + ld1 {v20.4s}, [r8], #16 @ load rk2 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 0 - round 0 + rev r9, r12 @ CTR block 1 + + add r12, r12, #1 @ CTR block 1 + orr r9, r11, r9, lsl #32 @ CTR block 1 + ld1 {v21.4s}, [r8], #16 @ load rk3 + + fmov v1.d[1], r9 @ CTR block 1 + rev r9, r12 @ CTR block 2 + add r12, r12, #1 @ CTR block 2 + + fmov d2, r10 @ CTR block 2 + orr r9, r11, r9, lsl #32 @ CTR block 2 + + fmov v2.d[1], r9 @ CTR block 2 + rev r9, r12 @ CTR block 3 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 0 - round 1 + orr r9, r11, r9, lsl #32 @ CTR block 3 + + fmov v3.d[1], r9 @ CTR block 3 + + ld1 {v22.4s}, [r8], #16 @ load rk4 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 0 - round 2 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 2 - round 0 + ld1 {v23.4s}, [r8], #16 @ load rk5 + + aese q1, v18.16b + aesmc q1, q1 @ AES block 1 - round 0 + ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese q3, v18.16b + aesmc q3, q3 @ AES block 3 - round 0 + ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese q2, v19.16b + aesmc q2, q2 @ AES block 2 - round 1 + ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese q1, v19.16b + aesmc q1, q1 @ AES block 1 - round 1 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 3 - round 1 + ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese q2, v20.16b + aesmc q2, q2 @ AES block 2 - round 2 + ld1 {v24.4s}, [r8], #16 @ load rk6 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 0 - round 3 + ld1 {v25.4s}, [r8], #16 @ load rk7 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 1 - round 2 + ld1 {v26.4s}, [r8], #16 @ load rk8 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 3 - round 2 + ld1 {v27.4s}, [r8], #16 @ load rk9 + + 
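Before the main loops run, the trn1/trn2 and eor sequences in these prologues pair up the high halves and the low halves of two hash-key powers per vector register and XOR them to form the Karatsuba "mid" keys (the "h2k | h1k" and "h4k | h3k" comments). A sketch of just that precomputation, with an illustrative struct layout that is an assumption rather than anything taken from the patch:

	#include <stdint.h>

	struct ghash_key {
		uint64_t hi, lo;   /* one 128-bit hash-key power H^i          */
		uint64_t mid;      /* hi ^ lo, precomputed for Karatsuba use  */
	};

	/* Precompute the "k" halves so each GHASH block needs only three
	 * carry-less multiplies instead of four. */
	static void ghash_precompute_mid(struct ghash_key *k, int nkeys)
	{
		for (int i = 0; i < nkeys; i++)
			k[i].mid = k[i].hi ^ k[i].lo;
	}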
aese q2, v21.16b + aesmc q2, q2 @ AES block 2 - round 3 + ld1 { v11.16b}, [r3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese q1, v21.16b + aesmc q1, q1 @ AES block 1 - round 3 + add r12, r12, #1 @ CTR block 3 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 3 - round 3 + trn1 q9, v14.2d, v15.2d @ h4h | h3h + + aese q0, v22.16b + aesmc q0, q0 @ AES block 0 - round 4 + ld1 {v28.4s}, [r8], #16 @ load rk10 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 1 - round 4 + trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l + + aese q2, v22.16b + aesmc q2, q2 @ AES block 2 - round 4 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l + + aese q0, v23.16b + aesmc q0, q0 @ AES block 0 - round 5 + ld1 {v29.4s}, [r8], #16 @ load rk11 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 1 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 2 - round 5 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 3 - round 5 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 0 - round 6 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 2 - round 6 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 3 - round 6 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 0 - round 7 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 2 - round 7 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 3 - round 7 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 1 - round 6 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 2 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 3 - round 8 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 1 - round 7 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 2 - round 9 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 3 - round 9 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 1 - round 8 + sub r5, r5, #1 @ byte_len - 1 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 0 - round 8 + and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + aese q3, v28.16b + aesmc q3, q3 @ AES block 3 - round 10 + add r5, r5, r0 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 1 - round 9 + cmp r0, r5 @ check if we have <= 4 blocks + + aese q0, v27.16b + aesmc q0, q0 @ AES block 0 - round 9 + trn1 q8, v12.2d, v13.2d @ h2h | h1h + + aese q3, v29.16b @ AES block 3 - round 11 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 2 - round 10 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 1 - round 10 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 0 - round 10 + eor v16.16b, v16.16b, q8 @ h2k | h1k + + aese q2, v29.16b @ AES block 2 - round 11 + + aese q1, v29.16b @ AES block 1 - round 11 + eor v17.16b, v17.16b, q9 @ h4k | h3k + + aese q0, v29.16b @ AES block 0 - round 11 + bge .L192_dec_tail @ handle tail + + ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext + + eor q1, q5, q1 @ AES block 1 - result + + eor q0, q4, q0 @ AES block 0 - result + rev r9, r12 @ CTR block 4 + ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext + + mov r19, v1.d[0] @ AES block 1 - mov low + + mov r20, v1.d[1] @ AES block 1 - mov high + + mov r6, v0.d[0] @ AES block 0 - mov low + orr r9, r11, r9, lsl #32 @ CTR block 4 + add r12, r12, #1 @ CTR block 4 + + mov r7, v0.d[1] @ AES block 0 - mov high + rev64 q4, q4 @ GHASH block 0 + + fmov d0, r10 @ CTR block 4 + rev64 q5, q5 @ GHASH block 1 + cmp r0, r5 @ check if we have <= 8 blocks + + eor r19, r19, r13 @ AES block 1 - round 12 low +#ifdef __ARMEB__ + rev r19, r19 +#endif + fmov v0.d[1], r9 @ CTR block 4 + rev r9, r12 @ CTR block 5 + + orr r9, r11, r9, lsl #32 @ CTR block 5 + fmov 
d1, r10 @ CTR block 5 + eor r20, r20, r14 @ AES block 1 - round 12 high +#ifdef __ARMEB__ + rev r20, r20 +#endif + add r12, r12, #1 @ CTR block 5 + fmov v1.d[1], r9 @ CTR block 5 + eor r6, r6, r13 @ AES block 0 - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + rev r9, r12 @ CTR block 6 + eor r7, r7, r14 @ AES block 0 - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + stp r6, r7, [r2], #16 @ AES block 0 - store result + orr r9, r11, r9, lsl #32 @ CTR block 6 + + stp r19, r20, [r2], #16 @ AES block 1 - store result + + add r12, r12, #1 @ CTR block 6 + eor q2, q6, q2 @ AES block 2 - result + bge .L192_dec_prepretail @ do prepretail + +.L192_dec_main_loop:@ main loop start + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + + pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low + mov r21, v2.d[0] @ AES block 4k+2 - mov low + + mov r22, v2.d[1] @ AES block 4k+2 - mov high + eor q3, q7, q3 @ AES block 4k+3 - result + rev64 q7, q7 @ GHASH block 4k+3 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + fmov d2, r10 @ CTR block 4k+6 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + eor q4, q4, v11.16b @ PRE 1 + + pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high + fmov v2.d[1], r9 @ CTR block 4k+6 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + mov r24, v3.d[1] @ AES block 4k+3 - mov high + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + mov r23, v3.d[0] @ AES block 4k+3 - mov low + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + fmov d3, r10 @ CTR block 4k+7 + mov d8, v4.d[1] @ GHASH block 4k - mid + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d10, v17.d[1] @ GHASH block 4k - mid + rev r9, r12 @ CTR block 4k+7 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + orr r9, r11, r9, lsl #32 @ CTR block 4k+7 + + fmov v3.d[1], r9 @ CTR block 4k+7 + eor q8, q8, q4 @ GHASH block 4k - mid + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + eor r22, r22, r14 @ AES block 4k+2 - round 12 high +#ifdef __ARMEB__ + rev r22, r22 +#endif + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + eor q4, q4, q5 @ GHASH block 4k+1 - mid + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + rev64 q6, q6 @ GHASH block 4k+2 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low + eor r21, r21, r13 @ AES block 4k+2 - round 12 low +#ifdef __ARMEB__ + rev r21, r21 +#endif + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + eor q9, q9, v30.16b @ GHASH block 4k+1 - high + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + + pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + + eor q9, q9, v30.16b @ GHASH block 4k+2 - high + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + pmull2 v5.1q, q7, v12.2d @ GHASH 
block 4k+3 - high + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + eor q9, q9, q5 @ GHASH block 4k+3 - high + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + movi q8, #0xc2 + + pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + shl d8, d8, #56 @ mod_constant + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext + eor r23, r23, r13 @ AES block 4k+3 - round 12 low +#ifdef __ARMEB__ + rev r23, r23 +#endif + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q0, v29.16b @ AES block 4k+4 - round 11 + add r12, r12, #1 @ CTR block 4k+7 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext + + aese q1, v29.16b @ AES block 4k+5 - round 11 + ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext + rev r9, r12 @ CTR block 4k+8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + stp r21, r22, [r2], #16 @ AES block 4k+2 - store result + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + cmp r0, r5 @ .LOOP CONTROL + + eor q0, q4, q0 @ AES block 4k+4 - result + eor r24, r24, r14 @ AES block 4k+3 - round 12 high +#ifdef __ARMEB__ + rev r24, r24 +#endif + eor q1, q5, q1 @ AES block 4k+5 - result + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + orr r9, r11, r9, lsl #32 @ CTR block 4k+8 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + mov r19, v1.d[0] @ AES block 4k+5 - mov low + + mov r6, v0.d[0] @ AES block 4k+4 - mov low + stp r23, r24, [r2], 
#16 @ AES block 4k+3 - store result + rev64 q5, q5 @ GHASH block 4k+5 + + aese q2, v29.16b @ AES block 4k+6 - round 11 + mov r7, v0.d[1] @ AES block 4k+4 - mov high + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + mov r20, v1.d[1] @ AES block 4k+5 - mov high + + fmov d0, r10 @ CTR block 4k+8 + add r12, r12, #1 @ CTR block 4k+8 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + eor q2, q6, q2 @ AES block 4k+6 - result + fmov v0.d[1], r9 @ CTR block 4k+8 + rev r9, r12 @ CTR block 4k+9 + + eor r6, r6, r13 @ AES block 4k+4 - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + orr r9, r11, r9, lsl #32 @ CTR block 4k+9 + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + fmov d1, r10 @ CTR block 4k+9 + add r12, r12, #1 @ CTR block 4k+9 + eor r19, r19, r13 @ AES block 4k+5 - round 12 low +#ifdef __ARMEB__ + rev r19, r19 +#endif + fmov v1.d[1], r9 @ CTR block 4k+9 + rev r9, r12 @ CTR block 4k+10 + eor r20, r20, r14 @ AES block 4k+5 - round 12 high +#ifdef __ARMEB__ + rev r20, r20 +#endif + eor r7, r7, r14 @ AES block 4k+4 - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + stp r6, r7, [r2], #16 @ AES block 4k+4 - store result + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + + add r12, r12, #1 @ CTR block 4k+10 + rev64 q4, q4 @ GHASH block 4k+4 + orr r9, r11, r9, lsl #32 @ CTR block 4k+10 + + aese q3, v29.16b @ AES block 4k+7 - round 11 + stp r19, r20, [r2], #16 @ AES block 4k+5 - store result + blt .L192_dec_main_loop + +.L192_dec_prepretail:@ PREPRETAIL + mov r22, v2.d[1] @ AES block 4k+2 - mov high + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + eor q3, q7, q3 @ AES block 4k+3 - result + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + mov r21, v2.d[0] @ AES block 4k+2 - mov low + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + mov d10, v17.d[1] @ GHASH block 4k - mid + + eor q4, q4, v11.16b @ PRE 1 + fmov d2, r10 @ CTR block 4k+6 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + mov r23, v3.d[0] @ AES block 4k+3 - mov low + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + mov r24, v3.d[1] @ AES block 4k+3 - mov high + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d8, v4.d[1] @ GHASH block 4k - mid + fmov d3, r10 @ CTR block 4k+7 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + rev64 q6, q6 @ GHASH block 4k+2 + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + fmov v2.d[1], r9 @ CTR block 4k+6 + rev r9, r12 @ CTR block 4k+7 + + orr r9, r11, r9, lsl #32 @ CTR block 4k+7 + eor q8, q8, q4 @ GHASH block 4k - mid + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low + eor r24, r24, r14 @ AES block 4k+3 - round 12 high +#ifdef __ARMEB__ + rev r24, r24 +#endif + fmov v3.d[1], r9 @ CTR block 4k+7 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + eor r21, r21, r13 @ AES block 4k+2 - round 12 low +#ifdef __ARMEB__ + rev r21, r21 +#endif + pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high + eor r22, r22, r14 @ AES block 4k+2 - round 12 high +#ifdef __ARMEB__ + rev r22, r22 +#endif + eor q4, q4, q5 @ GHASH block 4k+1 - mid + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + eor r23, r23, r13 @ AES block 4k+3 - round 12 low +#ifdef __ARMEB__ + rev r23, r23 +#endif + stp r21, r22, [r2], #16 @ AES block 4k+2 - store result + + rev64 q7, q7 @ GHASH block 4k+3 + stp r23, r24, [r2], #16 @ AES block 4k+3 - store result + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor q9, q9, v30.16b @ GHASH 
block 4k+1 - high + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + add r12, r12, #1 @ CTR block 4k+7 + + pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high + eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + mov d31, v6.d[1] @ GHASH block 4k+2 - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + eor q9, q9, v30.16b @ GHASH block 4k+2 - high + + eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid + + pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + mov d30, v7.d[1] @ GHASH block 4k+3 - mid + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid + + pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + + pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid + eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + movi q8, #0xc2 + + pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + shl d8, d8, #56 @ mod_constant + eor q9, q9, q5 @ GHASH block 4k+3 - high + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 
4k+7 - round 9 + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + + aese q0, v29.16b + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + aese q2, v29.16b + + aese q1, v29.16b + + aese q3, v29.16b + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low +.L192_dec_tail:@ TAIL + + sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process + ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext + + eor q0, q5, q0 @ AES block 4k+4 - result + + mov r7, v0.d[1] @ AES block 4k+4 - mov high + + mov r6, v0.d[0] @ AES block 4k+4 - mov low + + ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag + + cmp r5, #48 + + eor r7, r7, r14 @ AES block 4k+4 - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + eor r6, r6, r13 @ AES block 4k+4 - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + bgt .L192_dec_blocks_more_than_3 + + movi v11.8b, #0 + movi q9, #0 + + mov q3, q2 + mov q2, q1 + sub r12, r12, #1 + + movi v10.8b, #0 + cmp r5, #32 + bgt .L192_dec_blocks_more_than_2 + + mov q3, q1 + cmp r5, #16 + sub r12, r12, #1 + + bgt .L192_dec_blocks_more_than_1 + + sub r12, r12, #1 + b .L192_dec_blocks_less_than_1 +.L192_dec_blocks_more_than_3:@ blocks left > 3 + rev64 q4, q5 @ GHASH final-3 block + ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext + + stp r6, r7, [r2], #16 @ AES final-3 block - store result + + eor q4, q4, q8 @ feed in partial tag + + eor q0, q5, q1 @ AES final-2 block - result + + pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low + mov r6, v0.d[0] @ AES final-2 block - mov low + mov d22, v4.d[1] @ GHASH final-3 block - mid + + mov r7, v0.d[1] @ AES final-2 block - mov high + + mov d10, v17.d[1] @ GHASH final-3 block - mid + eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high + + eor r6, r6, r13 @ AES final-2 block - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + movi q8, #0 @ suppress further partial tag feed in + + pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid + eor r7, r7, r14 @ AES final-2 block - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif +.L192_dec_blocks_more_than_2:@ blocks left > 2 + + rev64 q4, q5 @ GHASH final-2 block + ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext + + eor q4, q4, q8 @ feed in partial tag + + movi q8, #0 @ suppress further partial tag feed in + + eor q0, q5, q2 @ AES final-1 block - result + + mov d22, v4.d[1] @ GHASH final-2 block - mid + + pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low + + stp r6, r7, [r2], #16 @ AES final-2 block - store result + + eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid + mov r7, v0.d[1] @ AES final-1 block - mov high + + eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low + mov r6, v0.d[0] @ AES final-1 block - mov low + + pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high + + pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid + + eor q9, q9, v20.16b @ GHASH final-2 block - high + eor r7, r7, r14 @ AES final-1 block - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + eor r6, r6, r13 @ AES final-1 block - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 
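Each GHASH step in the code above splits into three carry-less multiplies per block: X_hi * H_hi ("high"), X_lo * H_lo ("low"), and (X_hi ^ X_lo) * (H_hi ^ H_lo) ("mid"), accumulated separately and only combined during the MODULO reduction. A plain-C sketch of one such accumulation follows; clmul64() is a bit-by-bit stand-in for the PMULL instruction and every name here is illustrative, not from the patch.

	#include <stdint.h>

	typedef struct { uint64_t hi, lo; } u128;

	/* 64x64 -> 128 carry-less multiply (bitwise reference, not constant time). */
	static u128 clmul64(uint64_t a, uint64_t b)
	{
		u128 r = { 0, 0 };
		for (int i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				r.lo ^= a << i;
				r.hi ^= i ? (a >> (64 - i)) : 0;
			}
		}
		return r;
	}

	/* Fold one 128-bit block X against one hash-key power H (Karatsuba form). */
	static void ghash_block(u128 *acc_hi, u128 *acc_lo, u128 *acc_mid,
	    uint64_t x_hi, uint64_t x_lo,
	    uint64_t h_hi, uint64_t h_lo, uint64_t h_mid)
	{
		u128 hi  = clmul64(x_hi, h_hi);          /* "GHASH block - high" */
		u128 lo  = clmul64(x_lo, h_lo);          /* "GHASH block - low"  */
		u128 mid = clmul64(x_hi ^ x_lo, h_mid);  /* "GHASH block - mid"  */

		acc_hi->hi ^= hi.hi;  acc_hi->lo ^= hi.lo;
		acc_lo->hi ^= lo.hi;  acc_lo->lo ^= lo.lo;
		acc_mid->hi ^= mid.hi; acc_mid->lo ^= mid.lo;
	}

Keeping the three accumulators separate is what lets the kernels fold four blocks (against H^4..H^1) before performing a single reduction; they correspond to the high/low/mid accumulator registers tracked in the comments.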
+.L192_dec_blocks_more_than_1:@ blocks left > 1 + + rev64 q4, q5 @ GHASH final-1 block + + eor q4, q4, q8 @ feed in partial tag + ld1 { q5}, [r0], #16 @ AES final block - load ciphertext + + mov d22, v4.d[1] @ GHASH final-1 block - mid + + pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high + + eor q0, q5, q3 @ AES final block - result + stp r6, r7, [r2], #16 @ AES final-1 block - store result + + eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid + + eor q9, q9, v20.16b @ GHASH final-1 block - high + + pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low + mov r7, v0.d[1] @ AES final block - mov high + + ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid + mov r6, v0.d[0] @ AES final block - mov low + + pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid + + movi q8, #0 @ suppress further partial tag feed in + eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low + eor r7, r7, r14 @ AES final block - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + eor r6, r6, r13 @ AES final block - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid +.L192_dec_blocks_less_than_1:@ blocks left <= 1 + + mvn r13, xzr @ rk12_l = 0xffffffffffffffff + ldp r4, r5, [r2] @ load existing bytes we need to not overwrite + and r1, r1, #127 @ bit_length %= 128 + + sub r1, r1, #128 @ bit_length -= 128 + + neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) + + and r1, r1, #127 @ bit_length %= 128 + mvn r14, xzr @ rk12_h = 0xffffffffffffffff + + lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block + cmp r1, #64 + + csel r9, r13, r14, lt + csel r10, r14, xzr, lt + + fmov d0, r9 @ ctr0b is mask for last block + and r6, r6, r9 + bic r4, r4, r9 @ mask out low existing bytes + + orr r6, r6, r4 + mov v0.d[1], r10 +#ifndef __ARMEB__ + rev r9, r12 +#else + mov r9, r12 +#endif + + and q5, q5, q0 @ possibly partial last block has zeroes in highest bits + str r9, [r16, #12] @ store the updated counter + + rev64 q4, q5 @ GHASH final block + + eor q4, q4, q8 @ feed in partial tag + bic r5, r5, r10 @ mask out high existing bytes + + and r7, r7, r10 + + pmull2 v20.1q, q4, v12.2d @ GHASH final block - high + mov d8, v4.d[1] @ GHASH final block - mid + + pmull v21.1q, q4, v12.1d @ GHASH final block - low + + eor q8, q8, q4 @ GHASH final block - mid + + eor q9, q9, v20.16b @ GHASH final block - high + + pmull v8.1q, q8, v16.1d @ GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final block - low + + eor v10.16b, v10.16b, q8 @ GHASH final block - mid + movi q8, #0xc2 + + eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up + + shl d8, d8, #56 @ mod_constant + + eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up + + pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid + orr r7, r7, r5 + stp r6, r7, [r2] + + ext q9, q9, q9, #8 @ MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid + + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov r0, r15 + st1 { v11.16b }, [r3] + + ldp r21, r22, [sp, #16] + ldp r23, r24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp r19, r20, [sp], #112 + bx lr + +.L192_dec_ret: + mov 
r0, #0x0 + bx lr +.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel +.globl aes_gcm_enc_256_kernel +.type aes_gcm_enc_256_kernel,%function +.align 4 +aes_gcm_enc_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz r1, .L256_enc_ret + stp r19, r20, [sp, #-112]! + mov r16, r4 + mov r8, r5 + stp r21, r22, [sp, #16] + stp r23, r24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add r4, r0, r1, lsr #3 @ end_input_ptr + lsr r5, r1, #3 @ byte_len + mov r15, r5 + ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #224] @ load rk14 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif + ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible + sub r5, r5, #1 @ byte_len - 1 + + ld1 {v18.4s}, [r8], #16 @ load rk0 + and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + ld1 {v19.4s}, [r8], #16 @ load rk1 + add r5, r5, r0 + + lsr r12, r11, #32 + fmov d2, r10 @ CTR block 2 + orr r11, r11, r11 + + rev r12, r12 @ rev_ctr32 + cmp r0, r5 @ check if we have <= 4 blocks + fmov d1, r10 @ CTR block 1 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 0 - round 0 + add r12, r12, #1 @ increment rev_ctr32 + + rev r9, r12 @ CTR block 1 + fmov d3, r10 @ CTR block 3 + + orr r9, r11, r9, lsl #32 @ CTR block 1 + add r12, r12, #1 @ CTR block 1 + ld1 {v20.4s}, [r8], #16 @ load rk2 + + fmov v1.d[1], r9 @ CTR block 1 + rev r9, r12 @ CTR block 2 + add r12, r12, #1 @ CTR block 2 + + orr r9, r11, r9, lsl #32 @ CTR block 2 + ld1 {v21.4s}, [r8], #16 @ load rk3 + + fmov v2.d[1], r9 @ CTR block 2 + rev r9, r12 @ CTR block 3 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 0 - round 1 + orr r9, r11, r9, lsl #32 @ CTR block 3 + + fmov v3.d[1], r9 @ CTR block 3 + + aese q1, v18.16b + aesmc q1, q1 @ AES block 1 - round 0 + ld1 {v22.4s}, [r8], #16 @ load rk4 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 0 - round 2 + ld1 {v23.4s}, [r8], #16 @ load rk5 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 2 - round 0 + ld1 {v24.4s}, [r8], #16 @ load rk6 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 1 - round 1 + ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese q3, v18.16b + aesmc q3, q3 @ AES block 3 - round 0 + ld1 {v25.4s}, [r8], #16 @ load rk7 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 2 - round 1 + ld1 {v26.4s}, [r8], #16 @ load rk8 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 1 - round 2 + ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese q3, v19.16b + aesmc q3, q3 @ AES block 3 - round 1 + ld1 {v27.4s}, [r8], #16 @ load rk9 + + aese q2, v20.16b + aesmc q2, q2 @ AES block 2 - round 2 + ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese q1, v21.16b + aesmc q1, q1 @ AES block 1 - round 3 + ld1 {v28.4s}, [r8], #16 @ load rk10 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 3 - round 2 + ld1 {v29.4s}, [r8], #16 @ load rk11 + + aese q2, v21.16b + aesmc q2, q2 @ AES block 2 - round 3 + add r12, r12, #1 @ CTR block 3 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 0 - round 3 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 3 - round 3 + ld1 { v11.16b}, [r3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese q2, v22.16b + aesmc q2, q2 @ AES block 2 - round 4 + + aese q0, v22.16b + 
aesmc q0, q0 @ AES block 0 - round 4 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 1 - round 4 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 3 - round 4 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 0 - round 5 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 1 - round 5 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 3 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 2 - round 5 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l + + aese q3, v24.16b + aesmc q3, q3 @ AES block 3 - round 6 + ld1 {v30.4s}, [r8], #16 @ load rk12 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 0 - round 6 + ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese q2, v24.16b + aesmc q2, q2 @ AES block 2 - round 6 + ld1 {v31.4s}, [r8], #16 @ load rk13 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 1 - round 7 + trn1 q9, v14.2d, v15.2d @ h4h | h3h + + aese q0, v25.16b + aesmc q0, q0 @ AES block 0 - round 7 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 2 - round 7 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l + + aese q1, v26.16b + aesmc q1, q1 @ AES block 1 - round 8 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 2 - round 8 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 3 - round 8 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 1 - round 9 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 2 - round 9 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 0 - round 8 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 1 - round 10 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 3 - round 9 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 0 - round 9 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 2 - round 10 + + aese q3, v28.16b + aesmc q3, q3 @ AES block 3 - round 10 + + aese q1, v29.16b + aesmc q1, q1 @ AES block 1 - round 11 + + aese q2, v29.16b + aesmc q2, q2 @ AES block 2 - round 11 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 0 - round 10 + + aese q1, v30.16b + aesmc q1, q1 @ AES block 1 - round 12 + + aese q2, v30.16b + aesmc q2, q2 @ AES block 2 - round 12 + + aese q0, v29.16b + aesmc q0, q0 @ AES block 0 - round 11 + eor v17.16b, v17.16b, q9 @ h4k | h3k + + aese q3, v29.16b + aesmc q3, q3 @ AES block 3 - round 11 + + aese q2, v31.16b @ AES block 2 - round 13 + trn1 q8, v12.2d, v13.2d @ h2h | h1h + + aese q0, v30.16b + aesmc q0, q0 @ AES block 0 - round 12 + + aese q3, v30.16b + aesmc q3, q3 @ AES block 3 - round 12 + + aese q1, v31.16b @ AES block 1 - round 13 + + aese q0, v31.16b @ AES block 0 - round 13 + + aese q3, v31.16b @ AES block 3 - round 13 + eor v16.16b, v16.16b, q8 @ h2k | h1k + bge .L256_enc_tail @ handle tail + + ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif + rev r9, r12 @ CTR block 4 + ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif + ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif + add r0, r0, #64 @ AES input_ptr update + + eor r19, r19, r13 @ AES block 1 - round 14 low + eor r20, r20, r14 @ AES block 1 - round 14 high + + fmov d5, r19 @ AES block 1 - mov low + eor r6, r6, r13 @ AES block 0 - round 14 low + + eor r7, r7, r14 @ AES block 0 - round 14 high + eor r24, r24, r14 @ AES block 3 - round 14 high + fmov d4, r6 @ AES block 0 
- mov low + + cmp r0, r5 @ check if we have <= 8 blocks + fmov v4.d[1], r7 @ AES block 0 - mov high + eor r23, r23, r13 @ AES block 3 - round 14 low + + eor r21, r21, r13 @ AES block 2 - round 14 low + fmov v5.d[1], r20 @ AES block 1 - mov high + + fmov d6, r21 @ AES block 2 - mov low + add r12, r12, #1 @ CTR block 4 + + orr r9, r11, r9, lsl #32 @ CTR block 4 + fmov d7, r23 @ AES block 3 - mov low + eor r22, r22, r14 @ AES block 2 - round 14 high + + fmov v6.d[1], r22 @ AES block 2 - mov high + + eor q4, q4, q0 @ AES block 0 - result + fmov d0, r10 @ CTR block 4 + + fmov v0.d[1], r9 @ CTR block 4 + rev r9, r12 @ CTR block 5 + add r12, r12, #1 @ CTR block 5 + + eor q5, q5, q1 @ AES block 1 - result + fmov d1, r10 @ CTR block 5 + orr r9, r11, r9, lsl #32 @ CTR block 5 + + fmov v1.d[1], r9 @ CTR block 5 + rev r9, r12 @ CTR block 6 + st1 { q4}, [r2], #16 @ AES block 0 - store result + + fmov v7.d[1], r24 @ AES block 3 - mov high + orr r9, r11, r9, lsl #32 @ CTR block 6 + eor q6, q6, q2 @ AES block 2 - result + + st1 { q5}, [r2], #16 @ AES block 1 - store result + + add r12, r12, #1 @ CTR block 6 + fmov d2, r10 @ CTR block 6 + + fmov v2.d[1], r9 @ CTR block 6 + st1 { q6}, [r2], #16 @ AES block 2 - store result + rev r9, r12 @ CTR block 7 + + orr r9, r11, r9, lsl #32 @ CTR block 7 + + eor q7, q7, q3 @ AES block 3 - result + st1 { q7}, [r2], #16 @ AES block 3 - store result + bge .L256_enc_prepretail @ do prepretail + +.L256_enc_main_loop:@ main loop start + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + rev64 q4, q4 @ GHASH block 4k (only t0 is free) + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + fmov d3, r10 @ CTR block 4k+3 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + fmov v3.d[1], r9 @ CTR block 4k+3 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + eor q4, q4, v11.16b @ PRE 1 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor r23, r23, r13 @ AES block 4k+7 - round 14 low + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + mov d10, v17.d[1] @ GHASH block 4k - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + eor r22, r22, r14 @ AES block 4k+6 - round 14 high + mov d8, v4.d[1] @ GHASH block 4k - mid + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + eor q8, q8, q4 @ GHASH block 4k - mid + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low + + eor q9, q9, q4 @ GHASH block 4k+1 - high + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + aese q1, v21.16b + aesmc q1, 
q1 @ AES block 4k+5 - round 3 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + mov d8, v6.d[1] @ GHASH block 4k+2 - mid + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor q4, q4, q5 @ GHASH block 4k+1 - mid + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + eor q8, q8, q6 @ GHASH block 4k+2 - mid + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + + pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high + + pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + + pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low + eor q9, q9, q4 @ GHASH block 4k+2 - high + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + mov d4, v7.d[1] @ GHASH block 4k+3 - mid + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low + + pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + eor q4, q4, q7 @ GHASH block 4k+3 - mid + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + eor r19, r19, r13 @ AES block 4k+5 - round 14 low + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + eor r21, r21, r13 @ AES block 4k+6 - round 14 low + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + movi q8, #0xc2 + + pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid + eor q9, q9, q5 @ GHASH block 4k+3 - high + fmov d5, r19 @ AES block 4k+5 - mov low + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + shl d8, d8, #56 @ mod_constant + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + add r12, r12, #1 @ CTR block 4k+3 + + aese q0, v29.16b + aesmc q0, q0 @ AES block 4k+4 - round 11 + eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q1, v29.16b + aesmc q1, q1 @ AES block 4k+5 - round 11 + add r0, r0, #64 @ AES input_ptr update + + pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid + rev r9, r12 @ CTR block 4k+8 + ext q9, q9, q9, #8 @ 
MODULO - other top alignment + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + eor r6, r6, r13 @ AES block 4k+4 - round 14 low + + aese q1, v30.16b + aesmc q1, q1 @ AES block 4k+5 - round 12 + eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + eor r7, r7, r14 @ AES block 4k+4 - round 14 high + + fmov d4, r6 @ AES block 4k+4 - mov low + orr r9, r11, r9, lsl #32 @ CTR block 4k+8 + eor q7, q9, q7 @ MODULO - fold into mid + + aese q0, v30.16b + aesmc q0, q0 @ AES block 4k+4 - round 12 + eor r20, r20, r14 @ AES block 4k+5 - round 14 high + + aese q2, v29.16b + aesmc q2, q2 @ AES block 4k+6 - round 11 + eor r24, r24, r14 @ AES block 4k+7 - round 14 high + + aese q3, v29.16b + aesmc q3, q3 @ AES block 4k+7 - round 11 + add r12, r12, #1 @ CTR block 4k+8 + + aese q0, v31.16b @ AES block 4k+4 - round 13 + fmov v4.d[1], r7 @ AES block 4k+4 - mov high + eor v10.16b, v10.16b, q7 @ MODULO - fold into mid + + aese q2, v30.16b + aesmc q2, q2 @ AES block 4k+6 - round 12 + fmov d7, r23 @ AES block 4k+7 - mov low + + aese q1, v31.16b @ AES block 4k+5 - round 13 + fmov v5.d[1], r20 @ AES block 4k+5 - mov high + + fmov d6, r21 @ AES block 4k+6 - mov low + cmp r0, r5 @ .LOOP CONTROL + + fmov v6.d[1], r22 @ AES block 4k+6 - mov high + + pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low + eor q4, q4, q0 @ AES block 4k+4 - result + fmov d0, r10 @ CTR block 4k+8 + + fmov v0.d[1], r9 @ CTR block 4k+8 + rev r9, r12 @ CTR block 4k+9 + add r12, r12, #1 @ CTR block 4k+9 + + eor q5, q5, q1 @ AES block 4k+5 - result + fmov d1, r10 @ CTR block 4k+9 + orr r9, r11, r9, lsl #32 @ CTR block 4k+9 + + aese q3, v30.16b + aesmc q3, q3 @ AES block 4k+7 - round 12 + fmov v1.d[1], r9 @ CTR block 4k+9 + + aese q2, v31.16b @ AES block 4k+6 - round 13 + rev r9, r12 @ CTR block 4k+10 + st1 { q4}, [r2], #16 @ AES block 4k+4 - store result + + orr r9, r11, r9, lsl #32 @ CTR block 4k+10 + eor v11.16b, v11.16b, q9 @ MODULO - fold into low + fmov v7.d[1], r24 @ AES block 4k+7 - mov high + + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + st1 { q5}, [r2], #16 @ AES block 4k+5 - store result + add r12, r12, #1 @ CTR block 4k+10 + + aese q3, v31.16b @ AES block 4k+7 - round 13 + eor q6, q6, q2 @ AES block 4k+6 - result + fmov d2, r10 @ CTR block 4k+10 + + st1 { q6}, [r2], #16 @ AES block 4k+6 - store result + fmov v2.d[1], r9 @ CTR block 4k+10 + rev r9, r12 @ CTR block 4k+11 + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + orr r9, r11, r9, lsl #32 @ CTR block 4k+11 + + eor q7, q7, q3 @ AES block 4k+7 - result + st1 { q7}, [r2], #16 @ AES block 4k+7 - store result + blt .L256_enc_main_loop + +.L256_enc_prepretail:@ PREPRETAIL + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + fmov d3, r10 @ CTR block 4k+3 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + rev64 q4, q4 @ GHASH block 4k (only t0 is free) + + fmov v3.d[1], r9 @ CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + + eor q4, q4, v11.16b @ PRE 1 + rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + mov d10, v17.d[1] @ GHASH block 4k - mid + + aese q1, v19.16b + aesmc 
q1, q1 @ AES block 4k+5 - round 1 + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d8, v4.d[1] @ GHASH block 4k - mid + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + eor q8, q8, q4 @ GHASH block 4k - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + + pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high + + pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + + eor q9, q9, q4 @ GHASH block 4k+1 - high + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + + eor q4, q4, q5 @ GHASH block 4k+1 - mid + mov d8, v6.d[1] @ GHASH block 4k+2 - mid + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + eor q8, q8, q6 @ GHASH block 4k+2 - mid + add r12, r12, #1 @ CTR block 4k+3 + + pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + + pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high + + eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + + eor q9, q9, q4 @ GHASH block 4k+2 - high + mov d4, v7.d[1] @ GHASH block 4k+3 - mid + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid + + eor q4, q4, q7 @ GHASH block 4k+3 - mid + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid + eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + movi q8, #0xc2 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + eor q9, q9, q5 @ GHASH block 4k+3 - high + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + shl d8, d8, #56 @ mod_constant + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid + + pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + + eor v10.16b, v10.16b, q9 @ karatsuba tidy up + + pmull v4.1q, q9, q8 + ext q9, q9, q9, #8 + + aese q3, v28.16b + aesmc q3, q3 @ AES 
block 4k+7 - round 10 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v11.16b + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + + aese q1, v29.16b + aesmc q1, q1 @ AES block 4k+5 - round 11 + eor v10.16b, v10.16b, q4 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + + aese q1, v30.16b + aesmc q1, q1 @ AES block 4k+5 - round 12 + + aese q0, v29.16b + aesmc q0, q0 @ AES block 4k+4 - round 11 + eor v10.16b, v10.16b, q9 + + aese q3, v29.16b + aesmc q3, q3 @ AES block 4k+7 - round 11 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + + aese q0, v30.16b + aesmc q0, q0 @ AES block 4k+4 - round 12 + + pmull v4.1q, v10.1d, q8 + + aese q2, v29.16b + aesmc q2, q2 @ AES block 4k+6 - round 11 + ext v10.16b, v10.16b, v10.16b, #8 + + aese q3, v30.16b + aesmc q3, q3 @ AES block 4k+7 - round 12 + + aese q1, v31.16b @ AES block 4k+5 - round 13 + eor v11.16b, v11.16b, q4 + + aese q2, v30.16b + aesmc q2, q2 @ AES block 4k+6 - round 12 + + aese q3, v31.16b @ AES block 4k+7 - round 13 + + aese q0, v31.16b @ AES block 4k+4 - round 13 + + aese q2, v31.16b @ AES block 4k+6 - round 13 + eor v11.16b, v11.16b, v10.16b +.L256_enc_tail:@ TAIL + + ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag + sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process + ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + eor r6, r6, r13 @ AES block 4k+4 - round 14 low + eor r7, r7, r14 @ AES block 4k+4 - round 14 high + + cmp r5, #48 + fmov d4, r6 @ AES block 4k+4 - mov low + + fmov v4.d[1], r7 @ AES block 4k+4 - mov high + + eor q5, q4, q0 @ AES block 4k+4 - result + bgt .L256_enc_blocks_more_than_3 + + cmp r5, #32 + mov q3, q2 + movi v11.8b, #0 + + movi q9, #0 + sub r12, r12, #1 + + mov q2, q1 + movi v10.8b, #0 + bgt .L256_enc_blocks_more_than_2 + + mov q3, q1 + sub r12, r12, #1 + cmp r5, #16 + + bgt .L256_enc_blocks_more_than_1 + + sub r12, r12, #1 + b .L256_enc_blocks_less_than_1 +.L256_enc_blocks_more_than_3:@ blocks left > 3 + st1 { q5}, [r2], #16 @ AES final-3 block - store result + + ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + rev64 q4, q5 @ GHASH final-3 block + + eor r6, r6, r13 @ AES final-2 block - round 14 low + eor q4, q4, q8 @ feed in partial tag + + eor r7, r7, r14 @ AES final-2 block - round 14 high + + mov d22, v4.d[1] @ GHASH final-3 block - mid + fmov d5, r6 @ AES final-2 block - mov low + + fmov v5.d[1], r7 @ AES final-2 block - mov high + + eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid + movi q8, #0 @ suppress further partial tag feed in + + mov d10, v17.d[1] @ GHASH final-3 block - mid + + pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low + + pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid + eor q5, q5, q1 @ AES final-2 block - result +.L256_enc_blocks_more_than_2:@ blocks left > 2 + + st1 { q5}, [r2], #16 @ AES final-2 block - store result + + ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + rev64 q4, q5 @ GHASH final-2 block + + eor r6, r6, r13 @ AES final-1 block - round 14 low + eor q4, q4, q8 @ feed in partial tag + + fmov d5, r6 @ AES 
final-1 block - mov low + eor r7, r7, r14 @ AES final-1 block - round 14 high + + fmov v5.d[1], r7 @ AES final-1 block - mov high + + movi q8, #0 @ suppress further partial tag feed in + + pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high + mov d22, v4.d[1] @ GHASH final-2 block - mid + + pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low + + eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid + + eor q5, q5, q2 @ AES final-1 block - result + + eor q9, q9, v20.16b @ GHASH final-2 block - high + + pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low + + eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid +.L256_enc_blocks_more_than_1:@ blocks left > 1 + + st1 { q5}, [r2], #16 @ AES final-1 block - store result + + rev64 q4, q5 @ GHASH final-1 block + + ldp r6, r7, [r0], #16 @ AES final block - load input low & high +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif + eor q4, q4, q8 @ feed in partial tag + + movi q8, #0 @ suppress further partial tag feed in + + eor r6, r6, r13 @ AES final block - round 14 low + mov d22, v4.d[1] @ GHASH final-1 block - mid + + pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high + eor r7, r7, r14 @ AES final block - round 14 high + + eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid + + eor q9, q9, v20.16b @ GHASH final-1 block - high + + ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid + fmov d5, r6 @ AES final block - mov low + + fmov v5.d[1], r7 @ AES final block - mov high + + pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid + + pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low + + eor q5, q5, q3 @ AES final block - result + eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low +.L256_enc_blocks_less_than_1:@ blocks left <= 1 + + and r1, r1, #127 @ bit_length %= 128 + + mvn r13, xzr @ rk14_l = 0xffffffffffffffff + sub r1, r1, #128 @ bit_length -= 128 + + neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored + + mvn r14, xzr @ rk14_h = 0xffffffffffffffff + and r1, r1, #127 @ bit_length %= 128 + + lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block + cmp r1, #64 + + csel r6, r13, r14, lt + csel r7, r14, xzr, lt + + fmov d0, r6 @ ctr0b is mask for last block + + fmov v0.d[1], r7 + + and q5, q5, q0 @ possibly partial last block has zeroes in highest bits + + rev64 q4, q5 @ GHASH final block + + eor q4, q4, q8 @ feed in partial tag + + bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing + + pmull2 v20.1q, q4, v12.2d @ GHASH final block - high + mov d8, v4.d[1] @ GHASH final block - mid +#ifndef __ARMEB__ + rev r9, r12 +#else + mov r9, r12 +#endif + + pmull v21.1q, q4, v12.1d @ GHASH final block - low + + eor q9, q9, v20.16b @ GHASH final block - high + eor q8, q8, q4 @ GHASH final block - mid + + pmull v8.1q, q8, v16.1d @ GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b @ GHASH final block - low + + eor v10.16b, v10.16b, q8 @ GHASH final block - mid + movi q8, #0xc2 + + eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up + + shl d8, d8, #56 @ mod_constant + + eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up + + pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid + + ext q9, q9, q9, #8 @ MODULO - other top alignment + + eor v10.16b, v10.16b, q7 @ MODULO - fold into mid + + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + pmull v9.1q, 
v10.1d, q8 @ MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + str r9, [r16, #12] @ store the updated counter + + st1 { q5}, [r2] @ store all 16B + eor v11.16b, v11.16b, q9 @ MODULO - fold into low + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov r0, r15 + st1 { v11.16b }, [r3] + + ldp r21, r22, [sp, #16] + ldp r23, r24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp r19, r20, [sp], #112 + bx lr + +.L256_enc_ret: + mov r0, #0x0 + bx lr +.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel +.globl aes_gcm_dec_256_kernel +.type aes_gcm_dec_256_kernel,%function +.align 4 +aes_gcm_dec_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz r1, .L256_dec_ret + stp r19, r20, [sp, #-112]! + mov r16, r4 + mov r8, r5 + stp r21, r22, [sp, #16] + stp r23, r24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr r5, r1, #3 @ byte_len + mov r15, r5 + ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #224] @ load rk14 +#ifdef __ARMEB__ + ror r14, r14, #32 + ror r13, r13, #32 +#endif + ld1 {v18.4s}, [r8], #16 @ load rk0 + sub r5, r5, #1 @ byte_len - 1 + + ld1 {v19.4s}, [r8], #16 @ load rk1 + and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + add r4, r0, r1, lsr #3 @ end_input_ptr + ld1 {v20.4s}, [r8], #16 @ load rk2 + + lsr r12, r11, #32 + ld1 {v21.4s}, [r8], #16 @ load rk3 + orr r11, r11, r11 + + ld1 {v22.4s}, [r8], #16 @ load rk4 + add r5, r5, r0 + rev r12, r12 @ rev_ctr32 + + add r12, r12, #1 @ increment rev_ctr32 + fmov d3, r10 @ CTR block 3 + + rev r9, r12 @ CTR block 1 + add r12, r12, #1 @ CTR block 1 + fmov d1, r10 @ CTR block 1 + + orr r9, r11, r9, lsl #32 @ CTR block 1 + ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], r9 @ CTR block 1 + rev r9, r12 @ CTR block 2 + add r12, r12, #1 @ CTR block 2 + + fmov d2, r10 @ CTR block 2 + orr r9, r11, r9, lsl #32 @ CTR block 2 + + fmov v2.d[1], r9 @ CTR block 2 + rev r9, r12 @ CTR block 3 + + orr r9, r11, r9, lsl #32 @ CTR block 3 + ld1 {v23.4s}, [r8], #16 @ load rk5 + + fmov v3.d[1], r9 @ CTR block 3 + add r12, r12, #1 @ CTR block 3 + + ld1 {v24.4s}, [r8], #16 @ load rk6 + + ld1 {v25.4s}, [r8], #16 @ load rk7 + + ld1 {v26.4s}, [r8], #16 @ load rk8 + + aese q0, v18.16b + aesmc q0, q0 @ AES block 0 - round 0 + ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + + aese q3, v18.16b + aesmc q3, q3 @ AES block 3 - round 0 + ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + + aese q1, v18.16b + aesmc q1, q1 @ AES block 1 - round 0 + ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese q2, v18.16b + aesmc q2, q2 @ AES block 2 - round 0 + ld1 {v27.4s}, [r8], #16 @ load rk9 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 0 - round 1 + + aese q1, v19.16b + aesmc q1, q1 @ AES block 1 - round 1 + ld1 { v11.16b}, [r3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese q2, v19.16b + aesmc q2, q2 @ AES block 2 - round 1 + ld1 {v28.4s}, [r8], #16 @ load rk10 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 3 - round 1 + 
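+ @ The remaining round-key loads (rk11-rk13) below are interleaved with the
+ @ first AES rounds of counter blocks 0-3, so the key-schedule loads overlap
+ @ with the AES computation already in flight.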
ld1 {v29.4s}, [r8], #16 @ load rk11 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 0 - round 2 + ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese q2, v20.16b + aesmc q2, q2 @ AES block 2 - round 2 + ld1 {v30.4s}, [r8], #16 @ load rk12 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 3 - round 2 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 0 - round 3 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 1 - round 2 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 3 - round 3 + + aese q0, v22.16b + aesmc q0, q0 @ AES block 0 - round 4 + cmp r0, r5 @ check if we have <= 4 blocks + + aese q2, v21.16b + aesmc q2, q2 @ AES block 2 - round 3 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 1 - round 3 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 3 - round 4 + + aese q2, v22.16b + aesmc q2, q2 @ AES block 2 - round 4 + + aese q1, v22.16b + aesmc q1, q1 @ AES block 1 - round 4 + + aese q3, v23.16b + aesmc q3, q3 @ AES block 3 - round 5 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 0 - round 5 + + aese q1, v23.16b + aesmc q1, q1 @ AES block 1 - round 5 + + aese q2, v23.16b + aesmc q2, q2 @ AES block 2 - round 5 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 0 - round 6 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 3 - round 6 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 1 - round 6 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 2 - round 6 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 0 - round 7 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 1 - round 7 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 3 - round 7 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 0 - round 8 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 2 - round 7 + + aese q3, v26.16b + aesmc q3, q3 @ AES block 3 - round 8 + + aese q1, v26.16b + aesmc q1, q1 @ AES block 1 - round 8 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 0 - round 9 + + aese q2, v26.16b + aesmc q2, q2 @ AES block 2 - round 8 + ld1 {v31.4s}, [r8], #16 @ load rk13 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 1 - round 9 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 0 - round 10 + + aese q3, v27.16b + aesmc q3, q3 @ AES block 3 - round 9 + + aese q1, v28.16b + aesmc q1, q1 @ AES block 1 - round 10 + + aese q2, v27.16b + aesmc q2, q2 @ AES block 2 - round 9 + + aese q3, v28.16b + aesmc q3, q3 @ AES block 3 - round 10 + + aese q0, v29.16b + aesmc q0, q0 @ AES block 0 - round 11 + + aese q2, v28.16b + aesmc q2, q2 @ AES block 2 - round 10 + + aese q3, v29.16b + aesmc q3, q3 @ AES block 3 - round 11 + + aese q1, v29.16b + aesmc q1, q1 @ AES block 1 - round 11 + + aese q2, v29.16b + aesmc q2, q2 @ AES block 2 - round 11 + + trn1 q9, v14.2d, v15.2d @ h4h | h3h + + trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l + + trn1 q8, v12.2d, v13.2d @ h2h | h1h + trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l + + aese q1, v30.16b + aesmc q1, q1 @ AES block 1 - round 12 + + aese q0, v30.16b + aesmc q0, q0 @ AES block 0 - round 12 + + aese q2, v30.16b + aesmc q2, q2 @ AES block 2 - round 12 + + aese q3, v30.16b + aesmc q3, q3 @ AES block 3 - round 12 + eor v17.16b, v17.16b, q9 @ h4k | h3k + + aese q1, v31.16b @ AES block 1 - round 13 + + aese q2, v31.16b @ AES block 2 - round 13 + eor v16.16b, v16.16b, q8 @ h2k | h1k + + aese q3, v31.16b @ AES block 3 - round 13 + + aese q0, v31.16b @ AES block 0 - round 13 + bge .L256_dec_tail @ handle tail + + ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext + + rev r9, r12 @ CTR block 4 + + eor q0, q4, q0 @ AES block 0 - result + + eor q1, q5, q1 @ AES block 1 - result + rev64 
q5, q5 @ GHASH block 1 + ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext + + mov r7, v0.d[1] @ AES block 0 - mov high + + mov r6, v0.d[0] @ AES block 0 - mov low + rev64 q4, q4 @ GHASH block 0 + add r12, r12, #1 @ CTR block 4 + + fmov d0, r10 @ CTR block 4 + orr r9, r11, r9, lsl #32 @ CTR block 4 + + fmov v0.d[1], r9 @ CTR block 4 + rev r9, r12 @ CTR block 5 + add r12, r12, #1 @ CTR block 5 + + mov r19, v1.d[0] @ AES block 1 - mov low + + orr r9, r11, r9, lsl #32 @ CTR block 5 + mov r20, v1.d[1] @ AES block 1 - mov high + eor r7, r7, r14 @ AES block 0 - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + eor r6, r6, r13 @ AES block 0 - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + stp r6, r7, [r2], #16 @ AES block 0 - store result + fmov d1, r10 @ CTR block 5 + + ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext + + fmov v1.d[1], r9 @ CTR block 5 + rev r9, r12 @ CTR block 6 + add r12, r12, #1 @ CTR block 6 + + eor r19, r19, r13 @ AES block 1 - round 14 low +#ifdef __ARMEB__ + rev r19, r19 +#endif + orr r9, r11, r9, lsl #32 @ CTR block 6 + + eor r20, r20, r14 @ AES block 1 - round 14 high +#ifdef __ARMEB__ + rev r20, r20 +#endif + stp r19, r20, [r2], #16 @ AES block 1 - store result + + eor q2, q6, q2 @ AES block 2 - result + cmp r0, r5 @ check if we have <= 8 blocks + bge .L256_dec_prepretail @ do prepretail + +.L256_dec_main_loop:@ main loop start + mov r21, v2.d[0] @ AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + eor q3, q7, q3 @ AES block 4k+3 - result + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + mov r22, v2.d[1] @ AES block 4k+2 - mov high + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + fmov d2, r10 @ CTR block 4k+6 + + fmov v2.d[1], r9 @ CTR block 4k+6 + eor q4, q4, v11.16b @ PRE 1 + rev r9, r12 @ CTR block 4k+7 + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + mov r24, v3.d[1] @ AES block 4k+3 - mov high + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + mov r23, v3.d[0] @ AES block 4k+3 - mov low + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + mov d8, v4.d[1] @ GHASH block 4k - mid + fmov d3, r10 @ CTR block 4k+7 + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + orr r9, r11, r9, lsl #32 @ CTR block 4k+7 + + aese q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + fmov v3.d[1], r9 @ CTR block 4k+7 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + eor q8, q8, q4 @ GHASH block 4k - mid + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + eor r22, r22, r14 @ AES block 4k+2 - round 14 high +#ifdef __ARMEB__ + rev r22, r22 +#endif + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + mov d10, v17.d[1] @ GHASH block 4k - mid + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + rev64 q6, q6 @ GHASH block 4k+2 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + eor r21, r21, r13 @ AES block 4k+2 - round 14 low +#ifdef __ARMEB__ + rev r21, r21 +#endif + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + stp r21, r22, [r2], #16 @ AES block 4k+2 - store result + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + + pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + rev64 q7, q7 @ GHASH block 4k+3 + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + eor r23, r23, r13 @ AES block 4k+3 - round 14 low +#ifdef __ARMEB__ + rev r23, r23 +#endif + pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low + eor r24, r24, r14 @ AES block 4k+3 - 
round 14 high +#ifdef __ARMEB__ + rev r24, r24 +#endif + eor q9, q9, q4 @ GHASH block 4k+1 - high + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + add r12, r12, #1 @ CTR block 4k+7 + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + mov d8, v6.d[1] @ GHASH block 4k+2 - mid + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + eor q4, q4, q5 @ GHASH block 4k+1 - mid + + pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor q8, q8, q6 @ GHASH block 4k+2 - mid + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + rev r9, r12 @ CTR block 4k+8 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + add r12, r12, #1 @ CTR block 4k+8 + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + + pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high + mov d6, v7.d[1] @ GHASH block 4k+3 - mid + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + + pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + eor q9, q9, q4 @ GHASH block 4k+2 - high + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + + pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low + orr r9, r11, r9, lsl #32 @ CTR block 4k+8 + eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + eor q6, q6, q7 @ GHASH block 4k+3 - mid + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + eor q9, q9, q5 @ GHASH block 4k+3 - high + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + + pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid + movi q8, #0xc2 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low + + aese q0, v29.16b + aesmc q0, q0 @ AES block 4k+4 - round 11 + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + shl d8, d8, #56 @ mod_constant + + aese q2, v26.16b + aesmc q2, q2 @ AES block 4k+6 - round 8 + eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid + + aese q0, v30.16b + aesmc q0, q0 @ AES block 4k+4 - round 12 + + pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid + eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext + + aese q0, v31.16b @ AES block 4k+4 - round 13 + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + ld1 {q5}, [r0], #16 @ AES 
block 4k+5 - load ciphertext + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + eor q0, q4, q0 @ AES block 4k+4 - result + + aese q1, v29.16b + aesmc q1, q1 @ AES block 4k+5 - round 11 + stp r23, r24, [r2], #16 @ AES block 4k+3 - store result + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + eor v10.16b, v10.16b, q7 @ MODULO - fold into mid + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext + + aese q1, v30.16b + aesmc q1, q1 @ AES block 4k+5 - round 12 + ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext + + aese q2, v29.16b + aesmc q2, q2 @ AES block 4k+6 - round 11 + mov r7, v0.d[1] @ AES block 4k+4 - mov high + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + aese q1, v31.16b @ AES block 4k+5 - round 13 + mov r6, v0.d[0] @ AES block 4k+4 - mov low + + aese q2, v30.16b + aesmc q2, q2 @ AES block 4k+6 - round 12 + fmov d0, r10 @ CTR block 4k+8 + + aese q3, v29.16b + aesmc q3, q3 @ AES block 4k+7 - round 11 + fmov v0.d[1], r9 @ CTR block 4k+8 + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + eor q1, q5, q1 @ AES block 4k+5 - result + rev r9, r12 @ CTR block 4k+9 + + aese q2, v31.16b @ AES block 4k+6 - round 13 + orr r9, r11, r9, lsl #32 @ CTR block 4k+9 + cmp r0, r5 @ .LOOP CONTROL + + add r12, r12, #1 @ CTR block 4k+9 + + eor r6, r6, r13 @ AES block 4k+4 - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor r7, r7, r14 @ AES block 4k+4 - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + mov r20, v1.d[1] @ AES block 4k+5 - mov high + eor q2, q6, q2 @ AES block 4k+6 - result + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + aese q3, v30.16b + aesmc q3, q3 @ AES block 4k+7 - round 12 + mov r19, v1.d[0] @ AES block 4k+5 - mov low + + fmov d1, r10 @ CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + fmov v1.d[1], r9 @ CTR block 4k+9 + rev r9, r12 @ CTR block 4k+10 + add r12, r12, #1 @ CTR block 4k+10 + + aese q3, v31.16b @ AES block 4k+7 - round 13 + orr r9, r11, r9, lsl #32 @ CTR block 4k+10 + + rev64 q5, q5 @ GHASH block 4k+5 + eor r20, r20, r14 @ AES block 4k+5 - round 14 high +#ifdef __ARMEB__ + rev r20, r20 +#endif + stp r6, r7, [r2], #16 @ AES block 4k+4 - store result + + eor r19, r19, r13 @ AES block 4k+5 - round 14 low +#ifdef __ARMEB__ + rev r19, r19 +#endif + stp r19, r20, [r2], #16 @ AES block 4k+5 - store result + + rev64 q4, q4 @ GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + blt .L256_dec_main_loop + + +.L256_dec_prepretail:@ PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 + mov r21, v2.d[0] @ AES block 4k+2 - mov low + eor q3, q7, q3 @ AES block 4k+3 - result + + aese q0, v18.16b + aesmc q0, q0 @ AES block 4k+4 - round 0 + mov r22, v2.d[1] @ AES block 4k+2 - mov high + + aese q1, v18.16b + aesmc q1, q1 @ AES block 4k+5 - round 0 + fmov d2, r10 @ CTR block 4k+6 + + fmov v2.d[1], r9 @ CTR block 4k+6 + rev r9, r12 @ CTR block 4k+7 + eor q4, q4, v11.16b @ PRE 1 + + rev64 q6, q6 @ GHASH block 4k+2 + orr r9, r11, r9, lsl #32 @ CTR block 4k+7 + mov r23, v3.d[0] @ AES block 4k+3 - mov low + + aese q1, v19.16b + aesmc q1, q1 @ AES block 4k+5 - round 1 + mov r24, v3.d[1] @ AES block 4k+3 - mov high + + pmull v11.1q, q4, v15.1d @ GHASH block 4k - low + mov d8, v4.d[1] @ GHASH block 4k - mid + fmov d3, r10 @ CTR block 4k+7 + + pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high + fmov v3.d[1], r9 @ CTR block 4k+7 + + aese 
q2, v18.16b + aesmc q2, q2 @ AES block 4k+6 - round 0 + mov d10, v17.d[1] @ GHASH block 4k - mid + + aese q0, v19.16b + aesmc q0, q0 @ AES block 4k+4 - round 1 + eor q8, q8, q4 @ GHASH block 4k - mid + + pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high + + aese q2, v19.16b + aesmc q2, q2 @ AES block 4k+6 - round 1 + rev64 q7, q7 @ GHASH block 4k+3 + + aese q3, v18.16b + aesmc q3, q3 @ AES block 4k+7 - round 0 + + pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid + eor q9, q9, q4 @ GHASH block 4k+1 - high + + pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low + + aese q3, v19.16b + aesmc q3, q3 @ AES block 4k+7 - round 1 + mov d4, v5.d[1] @ GHASH block 4k+1 - mid + + aese q0, v20.16b + aesmc q0, q0 @ AES block 4k+4 - round 2 + + aese q1, v20.16b + aesmc q1, q1 @ AES block 4k+5 - round 2 + eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low + + aese q2, v20.16b + aesmc q2, q2 @ AES block 4k+6 - round 2 + + aese q0, v21.16b + aesmc q0, q0 @ AES block 4k+4 - round 3 + mov d8, v6.d[1] @ GHASH block 4k+2 - mid + + aese q3, v20.16b + aesmc q3, q3 @ AES block 4k+7 - round 2 + eor q4, q4, q5 @ GHASH block 4k+1 - mid + + pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low + + aese q0, v22.16b + aesmc q0, q0 @ AES block 4k+4 - round 4 + + aese q3, v21.16b + aesmc q3, q3 @ AES block 4k+7 - round 3 + eor q8, q8, q6 @ GHASH block 4k+2 - mid + + pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid + + aese q0, v23.16b + aesmc q0, q0 @ AES block 4k+4 - round 5 + eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low + + aese q3, v22.16b + aesmc q3, q3 @ AES block 4k+7 - round 4 + + pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high + eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid + + pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high + + aese q3, v23.16b + aesmc q3, q3 @ AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid + + aese q2, v21.16b + aesmc q2, q2 @ AES block 4k+6 - round 3 + + aese q1, v21.16b + aesmc q1, q1 @ AES block 4k+5 - round 3 + eor q9, q9, q4 @ GHASH block 4k+2 - high + + pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low + + aese q2, v22.16b + aesmc q2, q2 @ AES block 4k+6 - round 4 + mov d6, v7.d[1] @ GHASH block 4k+3 - mid + + aese q1, v22.16b + aesmc q1, q1 @ AES block 4k+5 - round 4 + + pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid + + aese q2, v23.16b + aesmc q2, q2 @ AES block 4k+6 - round 5 + eor q6, q6, q7 @ GHASH block 4k+3 - mid + + aese q1, v23.16b + aesmc q1, q1 @ AES block 4k+5 - round 5 + + aese q3, v24.16b + aesmc q3, q3 @ AES block 4k+7 - round 6 + eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid + + aese q2, v24.16b + aesmc q2, q2 @ AES block 4k+6 - round 6 + + aese q0, v24.16b + aesmc q0, q0 @ AES block 4k+4 - round 6 + movi q8, #0xc2 + + aese q1, v24.16b + aesmc q1, q1 @ AES block 4k+5 - round 6 + eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low + + pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid + + aese q3, v25.16b + aesmc q3, q3 @ AES block 4k+7 - round 7 + eor q9, q9, q5 @ GHASH block 4k+3 - high + + aese q1, v25.16b + aesmc q1, q1 @ AES block 4k+5 - round 7 + + aese q0, v25.16b + aesmc q0, q0 @ AES block 4k+4 - round 7 + eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid + + aese q3, v26.16b + aesmc q3, q3 @ AES block 4k+7 - round 8 + + aese q2, v25.16b + aesmc q2, q2 @ AES block 4k+6 - round 7 + eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up + + aese q1, v26.16b + aesmc q1, q1 @ AES block 4k+5 - round 8 + + aese q0, v26.16b + aesmc q0, q0 @ AES block 4k+4 - round 8 + shl d8, d8, #56 @ mod_constant + + aese q2, v26.16b + aesmc q2, q2 @ AES block 
4k+6 - round 8 + + aese q1, v27.16b + aesmc q1, q1 @ AES block 4k+5 - round 9 + eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up + + pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid + + aese q2, v27.16b + aesmc q2, q2 @ AES block 4k+6 - round 9 + ext q9, q9, q9, #8 @ MODULO - other top alignment + + aese q3, v27.16b + aesmc q3, q3 @ AES block 4k+7 - round 9 + + aese q0, v27.16b + aesmc q0, q0 @ AES block 4k+4 - round 9 + eor v10.16b, v10.16b, q7 @ MODULO - fold into mid + + aese q2, v28.16b + aesmc q2, q2 @ AES block 4k+6 - round 10 + + aese q3, v28.16b + aesmc q3, q3 @ AES block 4k+7 - round 10 + + aese q0, v28.16b + aesmc q0, q0 @ AES block 4k+4 - round 10 + eor r22, r22, r14 @ AES block 4k+2 - round 14 high +#ifdef __ARMEB__ + rev r22, r22 +#endif + aese q1, v28.16b + aesmc q1, q1 @ AES block 4k+5 - round 10 + eor r23, r23, r13 @ AES block 4k+3 - round 14 low +#ifdef __ARMEB__ + rev r23, r23 +#endif + aese q2, v29.16b + aesmc q2, q2 @ AES block 4k+6 - round 11 + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + aese q0, v29.16b + aesmc q0, q0 @ AES block 4k+4 - round 11 + add r12, r12, #1 @ CTR block 4k+7 + + aese q1, v29.16b + aesmc q1, q1 @ AES block 4k+5 - round 11 + eor r21, r21, r13 @ AES block 4k+2 - round 14 low +#ifdef __ARMEB__ + rev r21, r21 +#endif + + aese q2, v30.16b + aesmc q2, q2 @ AES block 4k+6 - round 12 + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + eor r24, r24, r14 @ AES block 4k+3 - round 14 high +#ifdef __ARMEB__ + rev r24, r24 +#endif + + aese q3, v29.16b + aesmc q3, q3 @ AES block 4k+7 - round 11 + stp r21, r22, [r2], #16 @ AES block 4k+2 - store result + + aese q1, v30.16b + aesmc q1, q1 @ AES block 4k+5 - round 12 + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + aese q0, v30.16b + aesmc q0, q0 @ AES block 4k+4 - round 12 + stp r23, r24, [r2], #16 @ AES block 4k+3 - store result + + aese q3, v30.16b + aesmc q3, q3 @ AES block 4k+7 - round 12 + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + aese q1, v31.16b @ AES block 4k+5 - round 13 + + aese q0, v31.16b @ AES block 4k+4 - round 13 + + aese q3, v31.16b @ AES block 4k+7 - round 13 + + aese q2, v31.16b @ AES block 4k+6 - round 13 + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low +.L256_dec_tail:@ TAIL + + sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process + ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext + + eor q0, q5, q0 @ AES block 4k+4 - result + + mov r6, v0.d[0] @ AES block 4k+4 - mov low + + mov r7, v0.d[1] @ AES block 4k+4 - mov high + ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag + + cmp r5, #48 + + eor r6, r6, r13 @ AES block 4k+4 - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + + eor r7, r7, r14 @ AES block 4k+4 - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif + bgt .L256_dec_blocks_more_than_3 + + sub r12, r12, #1 + mov q3, q2 + movi v10.8b, #0 + + movi v11.8b, #0 + cmp r5, #32 + + movi q9, #0 + mov q2, q1 + bgt .L256_dec_blocks_more_than_2 + + sub r12, r12, #1 + + mov q3, q1 + cmp r5, #16 + bgt .L256_dec_blocks_more_than_1 + + sub r12, r12, #1 + b .L256_dec_blocks_less_than_1 +.L256_dec_blocks_more_than_3:@ blocks left > 3 + rev64 q4, q5 @ GHASH final-3 block + ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext + + stp r6, r7, [r2], #16 @ AES final-3 block - store result + + mov d10, v17.d[1] @ GHASH final-3 block - mid + + eor q4, q4, q8 @ feed in partial tag + + eor q0, q5, q1 @ AES final-2 block - result + + mov d22, v4.d[1] @ GHASH final-3 block - mid + + mov r6, v0.d[0] 
@ AES final-2 block - mov low + + mov r7, v0.d[1] @ AES final-2 block - mov high + + eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid + + movi q8, #0 @ suppress further partial tag feed in + + pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid + eor r6, r6, r13 @ AES final-2 block - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + + pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low + eor r7, r7, r14 @ AES final-2 block - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif +.L256_dec_blocks_more_than_2:@ blocks left > 2 + + rev64 q4, q5 @ GHASH final-2 block + ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext + + eor q4, q4, q8 @ feed in partial tag + stp r6, r7, [r2], #16 @ AES final-2 block - store result + + eor q0, q5, q2 @ AES final-1 block - result + + mov d22, v4.d[1] @ GHASH final-2 block - mid + + pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low + + pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high + + eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid + mov r6, v0.d[0] @ AES final-1 block - mov low + + mov r7, v0.d[1] @ AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low + movi q8, #0 @ suppress further partial tag feed in + + pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid + + eor q9, q9, v20.16b @ GHASH final-2 block - high + eor r6, r6, r13 @ AES final-1 block - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + + eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid + eor r7, r7, r14 @ AES final-1 block - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif +.L256_dec_blocks_more_than_1:@ blocks left > 1 + + stp r6, r7, [r2], #16 @ AES final-1 block - store result + rev64 q4, q5 @ GHASH final-1 block + + ld1 { q5}, [r0], #16 @ AES final block - load ciphertext + + eor q4, q4, q8 @ feed in partial tag + movi q8, #0 @ suppress further partial tag feed in + + mov d22, v4.d[1] @ GHASH final-1 block - mid + + eor q0, q5, q3 @ AES final block - result + + pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high + + eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid + + pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low + mov r6, v0.d[0] @ AES final block - mov low + + ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid + + mov r7, v0.d[1] @ AES final block - mov high + + pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid + eor r6, r6, r13 @ AES final block - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif + eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low + + eor q9, q9, v20.16b @ GHASH final-1 block - high + + eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid + eor r7, r7, r14 @ AES final block - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif +.L256_dec_blocks_less_than_1:@ blocks left <= 1 + + and r1, r1, #127 @ bit_length %= 128 + mvn r14, xzr @ rk14_h = 0xffffffffffffffff + + sub r1, r1, #128 @ bit_length -= 128 + mvn r13, xzr @ rk14_l = 0xffffffffffffffff + + ldp r4, r5, [r2] @ load existing bytes we need to not overwrite + neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) + + and r1, r1, #127 @ bit_length %= 128 + + lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block + cmp r1, #64 + + csel r9, r13, r14, lt + csel r10, r14, xzr, lt + + fmov d0, r9 @ ctr0b is mask for last block + and r6, r6, r9 + + mov v0.d[1], r10 + bic r4, r4, r9 @ mask out low existing bytes + +#ifndef __ARMEB__ + rev r9, r12 +#else + mov r9, r12 +#endif + + bic r5, r5, r10 @ mask out high existing bytes + + 
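+ @ Merge the surviving plaintext bytes (r6, r7) with the pre-existing bytes
+ @ (r4, r5) that lie beyond the end of the message, so the final 16-byte
+ @ store below does not modify memory past the possibly partial last block.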
orr r6, r6, r4 + + and r7, r7, r10 + + orr r7, r7, r5 + + and q5, q5, q0 @ possibly partial last block has zeroes in highest bits + + rev64 q4, q5 @ GHASH final block + + eor q4, q4, q8 @ feed in partial tag + + pmull v21.1q, q4, v12.1d @ GHASH final block - low + + mov d8, v4.d[1] @ GHASH final block - mid + + eor q8, q8, q4 @ GHASH final block - mid + + pmull2 v20.1q, q4, v12.2d @ GHASH final block - high + + pmull v8.1q, q8, v16.1d @ GHASH final block - mid + + eor q9, q9, v20.16b @ GHASH final block - high + + eor v11.16b, v11.16b, v21.16b @ GHASH final block - low + + eor v10.16b, v10.16b, q8 @ GHASH final block - mid + movi q8, #0xc2 + + eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up + + shl d8, d8, #56 @ mod_constant + + eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up + + pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid + + ext q9, q9, q9, #8 @ MODULO - other top alignment + + eor v10.16b, v10.16b, q7 @ MODULO - fold into mid + + eor v10.16b, v10.16b, q9 @ MODULO - fold into mid + + pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment + + eor v11.16b, v11.16b, q8 @ MODULO - fold into low + + stp r6, r7, [r2] + + str r9, [r16, #12] @ store the updated counter + + eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov r0, r15 + st1 { v11.16b }, [r3] + + ldp r21, r22, [sp, #16] + ldp r23, r24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp r19, r20, [sp], #112 + bx lr + +.L256_dec_ret: + mov r0, #0x0 + bx lr +.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif