; bcrypt-macros.mac

;
; pbcrypt: parallel bcrypt for password cracking
; Copyright (C) 2019  Catalina Juarros <https://github.com/cat-j>
;
; This file is part of pbcrypt.
; 
; pbcrypt is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 2 of the License, or
; (at your option) any later version.
; 
; pbcrypt is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
; 
; You should have received a copy of the GNU General Public License
; along with pbcrypt.  If not, see <https://www.gnu.org/licenses/>.
;

%ifndef _BCRYPT_MACROS_MAC_
%define _BCRYPT_MACROS_MAC_

; size in bytes of one P-array entry P_n
%define P_VALUE_MEMORY_SIZE 4

; size in bytes of one S-box element
%define S_ELEMENT_MEMORY_SIZE 4

; size in bytes of one S-box (1024 / S_ELEMENT_MEMORY_SIZE = 256 elements)
%define S_BOX_MEMORY_SIZE 1024

; size in bytes of one S-box in the parallel layouts:
; 4 * S_BOX_MEMORY_SIZE and 8 * S_BOX_MEMORY_SIZE respectively
%define PARALLEL_S_BOX_MEMORY_SIZE 4096
%define PARALLEL_D_S_BOX_MEMORY_SIZE 8192

; encryption rounds
%define ROUNDS 16

; XMM register size in bytes
%define XMM_SIZE 16

; YMM register size in bytes
%define YMM_SIZE 32

; P-array byte offset within context struct
; (equals 4 * S_BOX_MEMORY_SIZE)
%define BLF_CTX_P_OFFSET 4096

; P-array byte offset within parallel context struct
; (equals 4 * PARALLEL_S_BOX_MEMORY_SIZE)
%define P_BLF_CTX_P_OFFSET 0x4000

; P-array byte offset within double parallel context struct
; (equals 4 * PARALLEL_D_S_BOX_MEMORY_SIZE)
%define PD_BLF_CTX_P_OFFSET 0x8000

; length of bcrypt hash in 32-bit words (6 * 4 = 24 bytes)
%define BCRYPT_WORDS 6

; Register aliases used by the macros below.
; Several names refer to the same physical register (an "x" suffix is the
; XMM view and a "y" suffix the YMM view of one register; p_0_3/p_0_7x and
; p_8_11/p_8_15x also coincide).

; for loaded P-array variant
%define salt                xmm0
%define p_0_7               ymm1
%define p_0_7x              xmm1
%define p_8_15              ymm2
%define p_8_15x             xmm2
%define p_16_17             xmm3
%define ctext_x             xmm4
%define ctext_y             ymm4

; for no-penalties loaded P-array variant
; (p_0_3 is the low half of p_0_7, p_8_11 the low half of p_8_15)
%define p_0_3               xmm1
%define p_4_7               xmm5
%define p_4_7y              ymm5
%define p_8_11              xmm2
%define p_12_15             xmm6
%define p_12_15y            ymm6

; for XMM parallel variant
%define gather_mask_xmm     xmm13
%define element_offset_xmm  xmm14

; for YMM parallel variant (same physical registers as the XMM variant)
%define gather_mask_ymm     ymm13
%define element_offset_ymm  ymm14

; generally handy: vpshufb control used to reverse byte order
%define endianness_mask_ymm ymm15
%define endianness_mask_xmm xmm15


; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ;;;;;;;;;; MACROS ;;;;;;;;;;
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Swap the two 128-bit halves of YMM register Y in place
; (vpermq selector 0x4e picks source qwords 2, 3, 0, 1).
%define ROTATE_128(Y) vpermq Y, Y, 0x4e

; Function for Feistel network (single password):
;   F(x) = ((S[0][x >> 24] + S[1][(x >> 16) & 0xff]) ^ S[2][(x >> 8) & 0xff])
;          + S[3][x & 0xff]
; %1 -> array of S-boxes (not modified)
; %2: data
; %3: temporary register for shifting data (modified)
; %4: output (modified)
; Flags are clobbered.
;
; The S-boxes are addressed with constant displacements from %1 instead of
; advancing %1 itself, so the pointer is still valid for the next round
; once the macro has been expanded.
%macro F 4
    ; %4 <- S[0][x >> 24] + S[1][x >> 16 & 0xff]
    mov %3, %2
    shr %3, 24 ; highest 8 bits
    and %3, 0xff
    mov %4, [%1 + %3*S_ELEMENT_MEMORY_SIZE]
    mov %3, %2
    shr %3, 16
    and %3, 0xff ; second-highest 8 bits
    add %4, [%1 + S_BOX_MEMORY_SIZE + %3*S_ELEMENT_MEMORY_SIZE]

    ; %4 <- %4 ^ S[2][x >> 8 & 0xff]
    mov %3, %2
    shr %3, 8
    and %3, 0xff ; second-lowest 8 bits
    xor %4, [%1 + 2*S_BOX_MEMORY_SIZE + %3*S_ELEMENT_MEMORY_SIZE]

    ; %4 <- %4 + S[3][x & 0xff]
    mov %3, %2
    and %3, 0xff ; lowest 8 bits
    add %4, [%1 + 3*S_BOX_MEMORY_SIZE + %3*S_ELEMENT_MEMORY_SIZE]
%endmacro

; One Blowfish round: i <- i ^ F(s, j) ^ p[n]
; %1 -> array of S-boxes (passed straight through to F)
; %2: temporary register for F (modified)
; %3: data half (modified: receives the round result)
; %4: other data half (input to F)
; %5: value read from P-array, p[n]
; %6: temporary register for F output (modified)
; Flags are clobbered.
; BLOWFISH_ROUND s, t1, i, j, p[n], t2
%macro BLOWFISH_ROUND 6
    F %1, %4, %2, %6 ; %6 <- F(%1, %4) = F(s, j)
    xor %6, %5       ; %6 <- F(s, j) ^ p[n]
    xor %3, %6       ;  i <- i ^ F(s, j) ^ p[n]
%endmacro

; F function with the byte order of the input reversed: S-box 0 is indexed
; with the LOWEST byte of x and S-box 3 with the highest one.
; %1 (blf_state) -> array of S-boxes (not modified)
; %2 (x): data
; %3 (output): result (modified)
; %4 (tmp): temporary register for shifting data (modified)
; Flags are clobbered.
; NOTE: the %xdefine lines create ordinary single-line macros named
; blf_state, x, output and tmp; they remain defined (with the arguments of
; the most recent expansion) after this macro has been expanded.
%macro F_BIG_ENDIAN 4
    %xdefine blf_state %1
    %xdefine x         %2
    %xdefine output    %3
    %xdefine tmp       %4

    ; output <- s[x & 0xff] + s[0x100 + (x>>8) & 0xff]
    mov tmp, x
    and tmp, 0xff
    mov output, [blf_state + tmp*S_ELEMENT_MEMORY_SIZE] ; s[x & 0xff]
    mov tmp, x
    shr tmp, 8
    and tmp, 0xff
    add output, [blf_state + S_BOX_MEMORY_SIZE + tmp*S_ELEMENT_MEMORY_SIZE]

    ; output <- output ^ s[0x200 + (x>>16) & 0xff]
    mov tmp, x
    shr tmp, 16
    and tmp, 0xff
    xor output, [blf_state + 2*S_BOX_MEMORY_SIZE + tmp*S_ELEMENT_MEMORY_SIZE]

    ; output <- output + s[0x300 + (x>>24) & 0xff]
    mov tmp, x
    shr tmp, 24
    and tmp, 0xff
    add output, [blf_state + 3*S_BOX_MEMORY_SIZE + tmp*S_ELEMENT_MEMORY_SIZE]
%endmacro

; F function evaluated on four 32-bit lanes at once:
;   F(x) = ((S[0][x >> 24] + S[1][(x >> 16) & 0xff]) ^ S[2][(x >> 8) & 0xff])
;          + S[3][x & 0xff]
; %1 -> array of parallel S-boxes
; %2: 4 32-bit blocks from different passwords
; %3: output (modified)
; %4: temporary XMM register (modified; holds the gather indices)
; %5: temporary XMM register (modified; holds the gathered elements)
; %6: mask; only most significant bit of each element used
;     (modified: reloaded from gather_mask_xmm before every gather,
;      because vpgatherdd clears its mask operand)
; gather mask must already be loaded in gather_mask_xmm!
; element_offset_xmm is added to every index vector and must be loaded too.
; Indexing: (byte*4 + element_offset) is scaled by S_ELEMENT_MEMORY_SIZE,
; i.e. each S-box entry occupies 16 bytes in the parallel layout.
; Note: legacy SSE instructions are mixed with VEX-encoded vpgatherdd here.
%macro F_XMM 6
    ; output <- S[0][x >> 24] + S[1][(x >> 16) & 0xff]
    movdqa     %4, %2
    psrld      %4, 24                 ; highest byte
    pslld      %4, 2                  ; multiply by 4
    paddd      %4, element_offset_xmm ; move inside vector
    movdqa     %6, gather_mask_xmm
    vpgatherdd %3, [%4*S_ELEMENT_MEMORY_SIZE + %1], %6

    movdqa     %4, %2
    pslld      %4, 8
    psrld      %4, 24                 ; 2nd-highest byte
    pslld      %4, 2                  ; multiply by 4
    paddd      %4, element_offset_xmm
    movdqa     %6, gather_mask_xmm
    vpgatherdd %5, \
    [%4*S_ELEMENT_MEMORY_SIZE + %1 + PARALLEL_S_BOX_MEMORY_SIZE], %6
    paddd      %3, %5

    ; output <- output ^ S[2][(x >> 8) & 0xff]
    movdqa     %4, %2
    pslld      %4, 16
    psrld      %4, 24                 ; 2nd-lowest byte
    pslld      %4, 2                  ; multiply by 4
    paddd      %4, element_offset_xmm
    movdqa     %6, gather_mask_xmm
    vpgatherdd %5, \
    [%4*S_ELEMENT_MEMORY_SIZE + %1 + 2*PARALLEL_S_BOX_MEMORY_SIZE], %6
    pxor       %3, %5

    ; output <- output + S[3][x & 0xff]
    movdqa     %4, %2
    pslld      %4, 24
    psrld      %4, 24                 ; lowest byte
    pslld      %4, 2                  ; multiply by 4
    paddd      %4, element_offset_xmm
    movdqa     %6, gather_mask_xmm
    vpgatherdd %5, \
    [%4*S_ELEMENT_MEMORY_SIZE + %1 + 3*PARALLEL_S_BOX_MEMORY_SIZE], %6
    paddd      %3, %5
%endmacro

; F function evaluated on eight 32-bit lanes at once:
;   F(x) = ((S[0][x >> 24] + S[1][(x >> 16) & 0xff]) ^ S[2][(x >> 8) & 0xff])
;          + S[3][x & 0xff]
; %1 -> array of parallel S-boxes
; %2: 8 32-bit blocks from different passwords
; %3: output (modified)
; %4: temporary YMM register (modified; gather indices)
; %5: temporary YMM register (modified; gathered elements)
; %6: mask; only most significant bit of each element used
;     (modified: reloaded from gather_mask_ymm before every gather)
; gather mask must already be loaded in gather_mask_ymm!
; element_offset_ymm is added to every index vector and must be loaded too.
; Each index is byte*8 + element_offset, scaled by S_ELEMENT_MEMORY_SIZE.
%macro F_YMM 6
    ; --- S[0], highest byte -------------------------------------------
    vmovdqa    %4, %2
    vpsrld     %4, %4, 24
    vpslld     %4, %4, 3                  ; 8 lanes per S-box entry
    vpaddd     %4, %4, element_offset_ymm ; select this lane's element
    vmovdqa    %6, gather_mask_ymm
    vpgatherdd %3, [%1 + %4*S_ELEMENT_MEMORY_SIZE], %6

    ; --- + S[1], 2nd-highest byte -------------------------------------
    vmovdqa    %4, %2
    vpslld     %4, %4, 8
    vpsrld     %4, %4, 24
    vpslld     %4, %4, 3
    vpaddd     %4, %4, element_offset_ymm
    vmovdqa    %6, gather_mask_ymm
    vpgatherdd %5, [%1 + %4*S_ELEMENT_MEMORY_SIZE + PARALLEL_D_S_BOX_MEMORY_SIZE], %6
    vpaddd     %3, %3, %5

    ; --- ^ S[2], 2nd-lowest byte --------------------------------------
    vmovdqa    %4, %2
    vpslld     %4, %4, 16
    vpsrld     %4, %4, 24
    vpslld     %4, %4, 3
    vpaddd     %4, %4, element_offset_ymm
    vmovdqa    %6, gather_mask_ymm
    vpgatherdd %5, [%1 + %4*S_ELEMENT_MEMORY_SIZE + 2*PARALLEL_D_S_BOX_MEMORY_SIZE], %6
    vpxor      %3, %3, %5

    ; --- + S[3], lowest byte ------------------------------------------
    vmovdqa    %4, %2
    vpslld     %4, %4, 24
    vpsrld     %4, %4, 24
    vpslld     %4, %4, 3
    vpaddd     %4, %4, element_offset_ymm
    vmovdqa    %6, gather_mask_ymm
    vpgatherdd %5, [%1 + %4*S_ELEMENT_MEMORY_SIZE + 3*PARALLEL_D_S_BOX_MEMORY_SIZE], %6
    vpaddd     %3, %3, %5
%endmacro

; One Blowfish round using F_BIG_ENDIAN:
;   i <- i ^ REVERSE_4_BYTES(F_BIG_ENDIAN(blf_state, j)) ^ p_n
; %1 (blf_state) -> array of S-boxes
; %2 (p_n): P-array value for this round
; %3 (i): data half (modified: receives the round result)
; %4 (j): other data half (input to F_BIG_ENDIAN)
; %5 (f_output): register receiving the F output (modified)
; %6 (f_tmp): temporary for F_BIG_ENDIAN and REVERSE_4_BYTES (modified)
; %7 (r_tmp): second temporary for REVERSE_4_BYTES (modified)
; Flags are clobbered.
%macro BLOWFISH_ROUND_BIG_ENDIAN 7
    ; %xdefine blf_state %1
    ; %xdefine p_n       %2
    ; %xdefine i         %3
    ; %xdefine j         %4
    ; %xdefine f_output  %5
    ; %xdefine f_tmp     %6
    ; %xdefine r_tmp     %7

    F_BIG_ENDIAN %1, %4, %5, %6 ; %5 <- F(j)
    REVERSE_4_BYTES %5, %6, %7  ; byte-swap the F output
    xor %5, %2                  ; %5 <- F(j) ^ p_n
    xor %3, %5                  ;  i <- i ^ F(j) ^ p_n
%endmacro

; One Blowfish round for 4 passwords at once: i <- i ^ F_XMM(s, j) ^ p
; %1 -> array of parallel S-boxes
; %2: 4 P-elements for different passwords (same index)
; %3: 4 32-bit 'i' blocks (modified: receives the round result)
; %4: 4 32-bit 'j' blocks (inputs for F_XMM)
; %5: 4 F outputs (modified)
; %6: temporary XMM register for F_XMM (modified)
; %7: temporary XMM register for F_XMM (modified)
; %8: mask for F_XMM (modified)
%macro BLOWFISH_ROUND_XMM 8
    F_XMM %1, %4, %5, %6, %7, %8
    pxor %5, %2 ; %5 <- F(j) ^ p
    pxor %3, %5 ;  i <- i ^ F(j) ^ p
%endmacro

; One Blowfish round for 8 passwords at once: i <- i ^ F_YMM(s, j) ^ p
; %1 -> array of parallel S-boxes
; %2: 8 P-elements for different passwords (same index)
; %3: 8 32-bit 'i' blocks (modified: receives the round result)
; %4: 8 32-bit 'j' blocks (inputs for F_YMM)
; %5: 8 F outputs (modified)
; %6: temporary YMM register for F_YMM (modified)
; %7: temporary YMM register for F_YMM (modified)
; %8: mask for F_YMM (modified)
%macro BLOWFISH_ROUND_YMM 8
    F_YMM %1, %4, %5, %6, %7, %8
    vpxor %5, %5, %2 ; %5 <- F(j) ^ p
    vpxor %3, %3, %5 ;  i <- i ^ F(j) ^ p
%endmacro

; Split a 64-bit value into its two 32-bit halves.
; %1: | l | r |, then | 0 | r |
; %2: |garbage|, then | 0 | l |
; Flags are clobbered.
%macro SPLIT_L_R 2
    mov %2, %1
    shl %1, 32 ; drop l ...
    shr %1, 32 ; ... leaving | 0 | r |
    shr %2, 32 ; | 0 | l |
%endmacro

; Reverse the byte order of a 64-bit register using shifts and masks.
; input:  | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
; output: | b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 |
; %1: input, then output
; %2: temp (modified)
; %3: temp (modified; accumulates bytes b0..b6 of the result)
; %4: lower 32 bits of %2
; The 0xff000000 mask is applied through %4 (the 32-bit view of %2): a
; 32-bit write also clears the upper 32 bits of %2, so only the selected
; byte survives.
; Flags are clobbered.
%macro REVERSE_8_BYTES 4
    mov %3, %1         ; | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
    shl %3, 56         ; | b0 | 00 | 00 | 00 | 00 | 00 | 00 | 00 |

    mov %2, %1
    and %2, 0xff00     ; | 00 | 00 | 00 | 00 | 00 | 00 | b1 | 00 |
    shl %2, 40         ; | 00 | b1 | 00 | 00 | 00 | 00 | 00 | 00 |
    or  %3, %2         ; | b0 | b1 | 00 | 00 | 00 | 00 | 00 | 00 |

    mov %2, %1
    and %2, 0xff0000   ; | 00 | 00 | 00 | 00 | 00 | b2 | 00 | 00 |
    shl %2, 24         ; | 00 | 00 | b2 | 00 | 00 | 00 | 00 | 00 |
    or  %3, %2         ; | b0 | b1 | b2 | 00 | 00 | 00 | 00 | 00 |

    mov %2, %1
    and %4, 0xff000000 ; | 00 | 00 | 00 | 00 | b3 | 00 | 00 | 00 |
    shl %2, 8          ; | 00 | 00 | 00 | b3 | 00 | 00 | 00 | 00 |
    or  %3, %2         ; | b0 | b1 | b2 | b3 | 00 | 00 | 00 | 00 |

    mov %2, %1
    shr %2, 8          ; | 00 | b7 | b6 | b5 | b4 | b3 | b2 | b1 |
    and %4, 0xff000000 ; | 00 | 00 | 00 | 00 | b4 | 00 | 00 | 00 |
    or  %3, %2         ; | b0 | b1 | b2 | b3 | b4 | 00 | 00 | 00 |

    mov %2, %1
    shr %2, 24         ; | 00 | 00 | 00 | b7 | b6 | b5 | b4 | b3 |
    and %2, 0xff0000   ; | 00 | 00 | 00 | 00 | 00 | b5 | 00 | 00 |
    or  %3, %2         ; | b0 | b1 | b2 | b3 | b4 | b5 | 00 | 00 |

    mov %2, %1
    shr %2, 40         ; | 00 | 00 | 00 | 00 | 00 | b7 | b6 | b5 |
    and %2, 0xff00     ; | 00 | 00 | 00 | 00 | 00 | 00 | b6 | 00 |
    or  %3, %2         ; | b0 | b1 | b2 | b3 | b4 | b5 | b6 | 00 |

    shr %1, 56         ; | 00 | 00 | 00 | 00 | 00 | 00 | 00 | b7 |
    or  %1, %3         ; | b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 |
%endmacro

; Reverse the byte order of a 32-bit value using shifts and masks.
; input:  | b3 | b2 | b1 | b0 |
; output: | b0 | b1 | b2 | b3 |
; %1 (data): input, then output
; %2 (tmp1): temporary (modified; accumulates bytes b1..b3 of the result)
; %3 (tmp2): temporary (modified)
; The byte diagrams assume 32-bit operands.
; Flags are clobbered.
%macro REVERSE_4_BYTES 3
    ; %xdefine data %1
    ; %xdefine tmp1 %2
    ; %xdefine tmp2 %3

    mov %2, %1
    shr %2, 24
    and %2, 0xff     ; | 00 | 00 | 00 | b3 |

    mov %3, %1
    shr %3, 8        ; | 00 | b3 | b2 | b1 |
    and %3, 0xff00   ; | 00 | 00 | b2 | 00 |
    or  %2, %3       ; | 00 | 00 | b2 | b3 |

    mov %3, %1
    shl %3, 8        ; | b2 | b1 | b0 | 00 |
    and %3, 0xff0000 ; | 00 | b1 | 00 | 00 |
    or  %2, %3       ; | 00 | b1 | b2 | b3 |

    shl %1, 24       ; | b0 | 00 | 00 | 00 |
    or  %1, %2       ; | b0 | b1 | b2 | b3 |
%endmacro

; Reverse the byte order of each 32-bit half of a 64-bit register
; independently (the halves keep their positions). X' denotes X byte-swapped.
; %1: input, then output
; %2: helper reg (modified)
; %3: lower 32 bits of input reg
; %4: lower 32 bits of helper reg
; Flags are clobbered.
%macro REVERSE_ENDIANNESS_2_DWORDS_BSWAP 4
    mov   %2, %1 ; |  Xl  |  Xr  |
    shr   %2, 32 ; |  00  |  Xl  |
    bswap %4     ; |  00  |  Xl' |
    shl   %2, 32 ; |  Xl' |  00  |
    bswap %3     ; |  00  |  Xr' |  (32-bit write clears the upper half of %1)
    or    %1, %2 ; |  Xl' |  Xr' |
%endmacro

; Collect 8 key bytes (cycling through the key) and XOR them into the
; P-array entry pair starting at index %8.
; %1: key_data (register ORed into data once per byte)
; %2: key_data_low (receives each key byte; the name suggests the low byte
;     of %1 - NOTE(review): the upper bits of %1 are presumably expected to
;     be zero on entry, confirm against callers)
; %3: key_data_ctr (modified; index of the next key byte, reset to 0 when
;     it reaches key_len)
; %4: key_ptr
; %5: key_len
; %6: loop_ctr (modified; counts up to 8 and is zeroed again on exit)
; %7: data (modified; earlier key bytes end up in more significant positions)
; %8: iteration number (used both as label suffix and as P-array index)
; The state pointer is hard-coded as rdi.
; Flags are clobbered.
%macro XOR_WITH_KEY 8
    .key_loop_%8:
        cmp %6, 8
        je  .end_key_loop_%8

        .extract_key_bytes_%8:
            cmp %3, %5
            jl  .continue_extract_%8
            xor %3, %3 ; reset counter

        .continue_extract_%8:
            mov %2, [%4 + %3] ; key_data_low, [key_ptr + key_data_ctr]
            shl %7, 8         ; make room for the new byte
            or  %7, %1

            inc %3
            inc %6
            jmp .key_loop_%8

    .end_key_loop_%8:
        rol %7, 32 ; swap the two 32-bit halves before the 64-bit XOR
        xor [rdi + BLF_CTX_P_OFFSET + %8*P_VALUE_MEMORY_SIZE], %7
        xor %6, %6 ; leave loop_ctr at 0 for the next expansion
%endmacro

; Read 4 bytes from each of 4 keys (cycling through the keys) into the four
; 32-bit lanes of an XMM register; earlier bytes end up in more significant
; positions of each lane.
; The keys are read at key_ptr, key_ptr + key_len, key_ptr + 2*key_len and
; key_ptr + 3*key_len.
; %1: key_data (XMM register, modified)
; %2: key_data_ctr (modified; reset to 0 when it reaches key_len)
; %3: key_ptr (temporarily advanced, restored before each loop iteration ends)
; %4: key_len (temporarily shifted, restored)
; %5: loop_ctr (modified; counts up to 4 and is NOT reset on exit)
; %6: iteration number (label suffix)
; Flags are clobbered.
%macro READ_4_KEY_BYTES_PARALLEL 6
    .key_loop_%6:
        cmp %5, 4 ; four iterations, one for each byte
        je  .end_key_loop_%6
        ; leave space for next bytes
        pslld %1, 8

        .extract_key_bytes_%6:
            cmp %2, %4
            jl  .continue_extract_%6
            xor %2, %2 ; reset counter

        .continue_extract_%6:
            ; extract a byte from each key
            pinsrb %1, [%3 + %2], 0
            add    %3, %4
            pinsrb %1, [%3 + %2], 4
            add    %3, %4
            pinsrb %1, [%3 + %2], 8
            add    %3, %4
            pinsrb %1, [%3 + %2], 12
            add    %3, %4

            ; back to first key: key_ptr -= 4*key_len
            shl %4, 2
            sub %3, %4
            shr %4, 2

            ; loop
            inc %2
            inc %5
            jmp .key_loop_%6

    .end_key_loop_%6:
%endmacro

; Read 4 bytes from each of 8 keys (cycling through the keys) into the eight
; 32-bit lanes of a YMM register. Keys 0-3 are gathered into %7, keys 4-7
; into %8, and %8 is then inserted as the upper 128 bits of %1.
; Consecutive keys are key_len bytes apart.
; %1: key_data (YMM register, modified)
; %2: key_data_ctr_1 (byte index for keys 0-3; modified, wraps at key_len)
; %3: key_ptr (temporarily advanced, restored on exit)
; %4: key_len (temporarily shifted, restored)
; %5: loop_ctr (modified; counts up to 8 and is NOT reset on exit)
; %6: iteration number (label suffix)
; %7: lower 16 bytes of key_data
; %8: helper XMM register (modified)
; %9: key_data_ctr_2 (byte index for keys 4-7; modified, wraps at key_len)
; Flags are clobbered.
%macro READ_4_KEY_BYTES_PARALLEL_DOUBLE 9
    .lower_key_loop_%6:
        cmp %5, 4 ; four iterations, one for each byte
        je  .end_lower_key_loop_%6
        ; leave space for next bytes
        vpslld %7, 8

        .extract_lower_key_bytes_%6:
            cmp %2, %4
            jl  .continue_extract_lower_%6
            xor %2, %2 ; reset counter

        .continue_extract_lower_%6:
            ; extract a byte from each key
            vpinsrb %7, [%3 + %2], 0
            add     %3, %4
            vpinsrb %7, [%3 + %2], 4
            add     %3, %4
            vpinsrb %7, [%3 + %2], 8
            add     %3, %4
            vpinsrb %7, [%3 + %2], 12
            add     %3, %4

            ; back to first key: key_ptr -= 4*key_len
            shl %4, 2
            sub %3, %4
            shr %4, 2

            ; loop
            inc %2
            inc %5
            jmp .lower_key_loop_%6

    .end_lower_key_loop_%6:
        ; skip to 4th key (zero-based): key_ptr += 4*key_len
        shl %4, 2
        add %3, %4
        shr %4, 2

    .higher_key_loop_%6:
        cmp %5, 8 ; four iterations, one for each byte. 4 + 4 = 8
        je  .end_higher_key_loop_%6
        ; leave space for next bytes
        vpslld %8, 8

        .extract_higher_key_bytes_%6:
            cmp %9, %4
            jl  .continue_extract_higher_%6
            xor %9, %9 ; reset counter

        .continue_extract_higher_%6:
            ; extract a byte from each key
            vpinsrb %8, [%3 + %9], 0
            add     %3, %4
            vpinsrb %8, [%3 + %9], 4
            add     %3, %4
            vpinsrb %8, [%3 + %9], 8
            add     %3, %4
            vpinsrb %8, [%3 + %9], 12
            add     %3, %4

            ; back to 4th key: key_ptr -= 4*key_len
            shl %4, 2
            sub %3, %4
            shr %4, 2

            ; loop
            inc %9
            inc %5
            jmp .higher_key_loop_%6

    .end_higher_key_loop_%6:
        ; upper 128 bits of key_data <- bytes of keys 4-7
        vinserti128 %1, %8, 1

        ; back to first key: key_ptr -= 4*key_len
        shl %4, 2
        sub %3, %4
        shr %4, 2
%endmacro

; Read 32 key bytes (cycling through the key) into a vector register:
; the first 16 bytes are collected in key_data_1, the next 16 in key_data_2,
; and key_data_2 is finally inserted as the upper 128 bits of key_data.
; Within each half the first byte read ends up in the lowest position.
; %1: key_data (modified) - NOTE(review): presumably the YMM register whose
;     low half is %2; confirm against callers
; %2: key_data_1 (XMM register, modified)
; %3: key_data_2 (XMM register, modified)
; %4: key_data_ctr (modified; reset to 0 when it reaches key_len)
; %5: key_ptr
; %6: key_len
; %7: loop_ctr (modified; counts up to 32 and is NOT reset on exit)
; %8: iteration number (label suffix)
; NOTE: the %define lines below create ordinary single-line macros; they
; remain defined, bound to the arguments of the most recent expansion, after
; this macro has been expanded, so later code can see these names.
; Flags are clobbered.
%macro READ_32_KEY_BYTES 8

    %define key_data     %1
    %define key_data_1   %2
    %define key_data_2   %3
    %define key_data_ctr %4
    %define key_ptr      %5
    %define key_len      %6
    %define loop_ctr     %7

    .lower_half_loop_%8:
        cmp     loop_ctr, 16
        je      .upper_half_loop_%8
        vpsrldq key_data, 1 ; shift one byte

        .extract_key_bytes_lower_%8:
            cmp     key_data_ctr, key_len
            jl      .continue_extract_lower_%8
            xor     key_data_ctr, key_data_ctr ; wrap around

        .continue_extract_lower_%8:
            vpinsrb key_data_1, [key_ptr + key_data_ctr], 15 ; new byte on top
            inc     loop_ctr
            inc     key_data_ctr
            jmp     .lower_half_loop_%8

    .upper_half_loop_%8:
        cmp     loop_ctr, 32
        je      .end_load_key_%8
        vpsrldq key_data_2, 1 ; shift one byte

        .extract_key_bytes_higher_%8:
            cmp     key_data_ctr, key_len
            jl      .continue_extract_higher_%8
            xor     key_data_ctr, key_data_ctr ; wrap around

        .continue_extract_higher_%8:
            vpinsrb key_data_2, [key_ptr + key_data_ctr], 15 ; new byte on top
            inc     loop_ctr
            inc     key_data_ctr
            jmp     .upper_half_loop_%8

    .end_load_key_%8:
        vinserti128 key_data, key_data_2, 1 ; upper half <- second 16 bytes

%endmacro

; Read 16 key bytes (cycling through the key) into an XMM register; the
; first byte read ends up in the lowest position.
; %1: key data (XMM register, modified)
; %2: key_data_ctr (modified; reset to 0 when it reaches key_len)
; %3: key_ptr
; %4: key_len
; %5: loop_ctr (modified; counts up to 16 and is NOT reset on exit)
; %6: iteration number (label suffix)
; Flags are clobbered.
%macro READ_16_KEY_BYTES 6
    .key_loop_%6:
        cmp     %5, 16
        je      .end_key_loop_%6
        vpsrldq %1, 1 ; shift one byte

    .extract_key_bytes_%6:
        cmp     %2, %4
        jl      .continue_extract_%6
        xor     %2, %2 ; wrap around

    .continue_extract_%6:
        vpinsrb %1, [%3 + %2], 15 ; new byte goes on top
        inc     %5
        inc     %2
        jmp     .key_loop_%6

    .end_key_loop_%6:
%endmacro

; Read 8 key bytes (cycling through the key) into the low 8 bytes of an XMM
; register; the first byte read ends up in the lowest position.
;
; The parameter order mirrors READ_32_KEY_BYTES so that both macros can be
; invoked with the same register set. The macro operates on its own
; arguments and does not depend on single-line macros defined elsewhere.
; %1: key_data (unused, kept for signature compatibility)
; %2: key_data_1 (XMM register receiving the bytes, modified)
; %3: key_data_2 (unused, kept for signature compatibility)
; %4: key_data_ctr (modified; reset to 0 when it reaches key_len)
; %5: key_ptr
; %6: key_len
; %7: loop_ctr (modified; counts up to 8 and is NOT reset on exit)
; %8: iteration number (label suffix)
; Flags are clobbered.
%macro READ_8_KEY_BYTES 8
    .key_loop_%8:
        cmp     %7, 8
        je      .end_key_loop_%8
        vpsrldq %2, 1 ; shift one byte

    .extract_key_bytes_%8:
        cmp     %4, %6
        jl      .continue_extract_%8
        xor     %4, %4 ; wrap around

    .continue_extract_%8:
        vpinsrb %2, [%5 + %4], 7 ; new byte goes on top of the low qword
        inc     %7
        inc     %4
        jmp     .key_loop_%8

    .end_key_loop_%8:
%endmacro

; copy single-data ciphertext
; Copies BCRYPT_WORDS 32-bit words as 64-bit chunks; each chunk is fully
; byte-reversed and then has its 32-bit halves swapped, so every 32-bit word
; is byte-swapped in place.
; %1 -> ciphertext buffer (destination)
; %2: temporary register (modified; holds the chunk being copied)
; %3: temporary register (modified)
; %4: temporary register (modified)
; %5: lower 32 bits of %3
; %6 -> 24-byte ciphertext to be copied
; NOTE: j is a global preprocessor variable that is overwritten here.
; Flags are clobbered.
%macro COPY_CTEXT 6
    %assign j 0
    %rep BCRYPT_WORDS / 2
        mov %2, [%6 + j*8]
        REVERSE_8_BYTES %2, %3, %4, %5
        rol %2, 32 ; put the two words back in their original order
        mov [%1 + j*8], %2
        %assign j j+1
    %endrep
%endmacro

; Copy the ciphertext of 4 passwords, applying the endianness_mask_ymm byte
; shuffle to every YMM_SIZE-byte chunk on the way.
; BCRYPT_WORDS / 2 chunks are copied.
; %1 -> ciphertext buffer (destination)
; %2 -> ciphertext to be copied
; %3: temporary YMM register (modified)
; endianness_mask_ymm must already be loaded.
; NOTE: j is a global preprocessor variable that is overwritten here.
%macro COPY_CTEXT_XMM 3
    %assign j 0
    %rep BCRYPT_WORDS / 2
        vmovdqu %3, [%2 + YMM_SIZE*j]
        vpshufb %3, %3, endianness_mask_ymm
        vmovdqu [%1 + YMM_SIZE*j], %3
        %assign j j+1
    %endrep
%endmacro

; Copy the ciphertext of 8 passwords, applying the endianness_mask_ymm byte
; shuffle to every YMM_SIZE-byte chunk on the way.
; BCRYPT_WORDS chunks are copied.
; %1 -> ciphertext buffer (destination)
; %2 -> ciphertext to be copied
; %3: temporary YMM register (modified)
; endianness_mask_ymm must already be loaded.
; NOTE: j is a global preprocessor variable that is overwritten here.
%macro COPY_CTEXT_YMM 3
    %assign j 0
    %rep BCRYPT_WORDS
        vmovdqu %3, [%2 + YMM_SIZE*j]
        vpshufb %3, %3, endianness_mask_ymm
        vmovdqu [%1 + YMM_SIZE*j], %3
        %assign j j+1
    %endrep
%endmacro

; Keep salt and P-array cached in registers:
;   salt <- 16 bytes at %2
;   p_0_7, p_8_15 <- P[0..7], P[8..15]
;   p_16_17 <- P[16], P[17] in the low 8 bytes, rest zero
; The P registers are byte-shuffled with the endianness mask after loading.
; Also loads endianness_mask_ymm from the symbol endianness_mask, which is
; defined outside this file.
; %1 -> state
; %2 -> salt
; vmovdqa is used for the mask and for P[0..15], so those addresses must be
; 32-byte aligned.
; Note: the salt is loaded with a legacy SSE instruction (movdqu).
%macro LOAD_SALT_AND_P 2
    vmovdqa endianness_mask_ymm, [endianness_mask]
    vpxor   p_16_17, p_16_17 ; clear before inserting 8 bytes

    movdqu  salt, [%2]
    vmovdqa p_0_7, [%1 + BLF_CTX_P_OFFSET]
    vmovdqa p_8_15, [%1 + BLF_CTX_P_OFFSET + 8*P_VALUE_MEMORY_SIZE]
    vpinsrq p_16_17, p_16_17, \
             [%1 + BLF_CTX_P_OFFSET + 16*P_VALUE_MEMORY_SIZE], 0

    vpshufb p_0_7, endianness_mask_ymm
    vpshufb p_8_15, endianness_mask_ymm
    vpshufb ymm3, endianness_mask_ymm ; ymm3 is the YMM view of p_16_17
%endmacro

; Cache salt and P-array in registers using V-prefixed instructions only,
; so that no AVX-SSE transition penalties are incurred. After reading P into
; p_0_7, p_8_15 and p_16_17, the upper halves are additionally made
; available as the low halves of separate registers:
;   p_4_7y   <- p_0_7 with its 128-bit halves swapped  (p_4_7 = P[4..7])
;   p_12_15y <- p_8_15 with its 128-bit halves swapped (p_12_15 = P[12..15])
; Also loads endianness_mask_ymm from the symbol endianness_mask, which is
; defined outside this file.
; %1 -> state
; %2 -> salt
; The mask and P[0..15] are read with vmovdqa, so those addresses must be
; 32-byte aligned.
%macro LOAD_SALT_AND_P_NO_PENALTIES 2
    vmovdqa endianness_mask_ymm, [endianness_mask]
    vpxor   p_16_17, p_16_17, p_16_17

    vmovdqu salt, [%2]
    vmovdqa p_0_7, [%1 + BLF_CTX_P_OFFSET]
    vmovdqa p_8_15, [%1 + BLF_CTX_P_OFFSET + 8*P_VALUE_MEMORY_SIZE]
    vpinsrq p_16_17, p_16_17, [%1 + BLF_CTX_P_OFFSET + 16*P_VALUE_MEMORY_SIZE], 0

    vpshufb p_0_7, p_0_7, endianness_mask_ymm
    vpshufb p_8_15, p_8_15, endianness_mask_ymm
    vpshufb ymm3, ymm3, endianness_mask_ymm ; ymm3 is the YMM view of p_16_17

    ; selector 0x4e swaps the two 128-bit halves while copying
    vpermq  p_4_7y, p_0_7, 0x4e
    vpermq  p_12_15y, p_8_15, 0x4e
%endmacro

; Write the cached P-array (p_0_7, p_8_15, p_16_17) back to the state.
; The registers are byte-shuffled with the endianness mask IN PLACE before
; being stored, so they hold the shuffled values afterwards.
; %1 -> state (P[0..15] are stored with vmovdqa: must be 32-byte aligned)
; %2: helper general-purpose reg for extracting P[16] and P[17] (modified)
%macro STORE_P 2
    vpshufb p_0_7, p_0_7, endianness_mask_ymm
    vpshufb p_8_15, p_8_15, endianness_mask_ymm
    vpshufb ymm3, ymm3, endianness_mask_ymm ; ymm3 is the YMM view of p_16_17

    vmovdqa [%1 + BLF_CTX_P_OFFSET], p_0_7
    vmovdqa [%1 + BLF_CTX_P_OFFSET + 8*P_VALUE_MEMORY_SIZE], p_8_15
    vmovq   %2, p_16_17 ; low 8 bytes = P[16], P[17]
    mov     [%1 + BLF_CTX_P_OFFSET + 16*P_VALUE_MEMORY_SIZE], %2
%endmacro

; Write the cached P-array back to the state for the no-penalties variant,
; where P[0..15] live in the low halves of four registers (p_0_3, p_4_7,
; p_8_11, p_12_15). All five P registers are byte-shuffled with the
; endianness mask IN PLACE before being stored.
; %1 -> state (P[0..15] are stored with vmovdqa: must be 16-byte aligned)
; %2: helper general-purpose reg for extracting P[16] and P[17] (modified)
%macro STORE_P_NO_PENALTIES 2
    vpshufb p_0_7, p_0_7, endianness_mask_ymm
    vpshufb p_4_7y, p_4_7y, endianness_mask_ymm
    vpshufb p_8_15, p_8_15, endianness_mask_ymm
    vpshufb p_12_15y, p_12_15y, endianness_mask_ymm
    vpshufb ymm3, ymm3, endianness_mask_ymm ; ymm3 is the YMM view of p_16_17

    vmovdqa [%1 + BLF_CTX_P_OFFSET + 0*P_VALUE_MEMORY_SIZE], p_0_3
    vmovdqa [%1 + BLF_CTX_P_OFFSET + 4*P_VALUE_MEMORY_SIZE], p_4_7
    vmovdqa [%1 + BLF_CTX_P_OFFSET + 8*P_VALUE_MEMORY_SIZE], p_8_11
    vmovdqa [%1 + BLF_CTX_P_OFFSET + 12*P_VALUE_MEMORY_SIZE], p_12_15
    vmovq   %2, p_16_17 ; low 8 bytes = P[16], P[17]
    mov     [%1 + BLF_CTX_P_OFFSET + 16*P_VALUE_MEMORY_SIZE], %2
%endmacro

; Load the 24-byte ciphertext at %1 into ctext_y:
;   bytes 0..15 in the low 128 bits, bytes 16..23 in the low 8 bytes of the
;   upper 128 bits, remaining 8 bytes zero.
; %1 -> initial ciphertext
; The final load deliberately uses legacy SSE movdqu: it leaves the upper
; 128 bits of ctext_y untouched, whereas a VEX-encoded load would zero them.
%macro LOAD_CTEXT 1
    vpxor   ctext_y, ctext_y
    vpinsrq ctext_x, [%1 + 16], 0 ; skip 16 bytes
    ROTATE_128(ctext_y)           ; move bytes 16..23 to the upper half
    movdqu  ctext_x, [%1]         ; bytes 0..15, upper half preserved
%endmacro

; Load the 24-byte ciphertext at %1 into ctext_y using V-prefixed
; instructions only:
;   bytes 0..15 in the low 128 bits, bytes 16..23 in the low 8 bytes of the
;   upper 128 bits, remaining 8 bytes zero.
; %1 -> initial ciphertext
; %2 -> helper YMM reg (modified)
; %3 -> lower 16 bytes of %2
; The helper does not need to be cleared by the caller: vmovq writes the low
; 8 bytes and zeroes every other byte of the register, so no stale helper
; contents can leak into the ciphertext through the final vpor.
; Qword layouts below are written | q3 | q2 | q1 | q0 |, with c0 = bytes
; 0..7, c1 = bytes 8..15 and c2 = bytes 16..23 of the ciphertext.
%macro LOAD_CTEXT_NO_PENALTIES 3
    vpxor   ctext_y, ctext_y, ctext_y
    vmovdqu ctext_x, [%1 + 8]          ; | 0 | 0  | c2 | c1 | (skip 8 bytes)
    vpermq  ctext_y, ctext_y, 0xd2     ; | 0 | c2 | c1 | 0  | (shift loaded bytes)
    vmovq   %3, qword [%1]             ; | 0 | 0  | 0  | c0 | (remaining 8 bytes)
    vpor    ctext_y, ctext_y, %2       ; | 0 | c2 | c1 | c0 | (reconstruct)
%endmacro

; Store the 24-byte ciphertext held in ctext_y to the buffer at %1.
; %1 -> buffer for bcrypt hash
; %2: helper general-purpose reg for extracting last 8 bytes (modified)
; Side effect: ctext_y is left with its 128-bit halves swapped.
; Note: uses legacy SSE instructions (movdqu, pextrq).
%macro STORE_CTEXT 2
    movdqu [%1], ctext_x  ; bytes 0..15
    ROTATE_128(ctext_y)   ; bring bytes 16..23 into the low half
    pextrq %2, ctext_x, 0
    mov    [%1 + 16], %2  ; bytes 16..23
%endmacro

; Store the 24-byte ciphertext held in ctext_y to the buffer at %1 using
; V-prefixed instructions only.
; %1 -> buffer for bcrypt hash
; %2: helper general-purpose reg for extracting last 8 bytes (modified)
; Side effect: ctext_y is left with its 128-bit halves swapped.
%macro STORE_CTEXT_NO_PENALTIES 2
    vmovdqu [%1], ctext_x              ; bytes 0..15
    vpermq  ctext_y, ctext_y, 0x4e     ; swap halves: bytes 16..23 now lowest
    vmovq   %2, ctext_x
    mov     [%1 + 16], %2              ; bytes 16..23
%endmacro

%endif