hex_bench.tar
application/x-tar
Filename: hex_bench.tar
Type: application/x-tar
Part: 1
hex_bench.asm 0000644 0001750 0001750 00000012022 14163614744 014230 0 ustar buschmann buschmann ;---------------------------------------
;Win64 NASM Example: Using base6.obj + GoLink
;
; debug
; nasm -f WIN64 -g hex_bench.asm -l hex_bench.lis
;;;;; golink /console id_decomp_nasm.obj GAIA_IDS_L0HP_6.dll base6.obj msvcrt.dll kernel32.dll /files
; HEX_BENCH_DATA_1GB.dll
; HEX_BENCH_DATA_2GB.dll
; golink /console hex_bench.obj hex_x86_64.obj base64_x86_64.obj HEX_BENCH_DATA_1300KB.obj base6.obj msvcrt.dll kernel32.dll /files
; golink /console hex_bench.obj hex_x86_64.obj base64_x86_64.obj HEX_BENCH_DATA_300KB.obj base6.obj msvcrt.dll kernel32.dll /files
; golink /console hex_bench.obj HEX_BENCH_DATA_300KB.dll hex_x86_64.obj base64_x86_64.obj base6.obj msvcrt.dll kernel32.dll /files
; nasm -f elf64 -g hex_bench.asm -l hex_bench.lis
; ld -o hex_bench hex_bench.o hex_x86_64.o base64_x86_64.o HEX_BENCH_DATA_1300KB.o
%ifdef ASSEMBLE_COMMAND_LINES_ON_WINDOWS
:: commands to build on Windows (nasm and golink in the path)
nasm -f WIN64 -g hex_bench.asm -l hex_bench.lis
nasm -f WIN64 -g hex_x86_64.asm -l hex_x86_64.lis
nasm -f WIN64 -g HEX_BENCH_DATA_1300KB.asm
golink /console hex_bench.obj hex_x86_64.obj HEX_BENCH_DATA_1300KB.obj
%endif
%ifdef ASSEMBLE_COMMAND_LINES_ON_LINUX
# commands to build on LINUX
nasm -f elf64 -g hex_bench.asm -l hex_bench.lis
nasm -f elf64 -g hex_x86_64.asm -l hex_x86_64.lis
nasm -f elf64 -g HEX_BENCH_DATA_1300KB.asm
ld -o hex_bench hex_bench.o hex_x86_64.o HEX_BENCH_DATA_1300KB.o
%endif
;;;;GAIA_L0HP_6.dll mul_decomp_arr_nasm.dll
extern hex_encode_fast
extern hex_encode_sse2
extern hex_encode_ssse3
extern hex_encode_avx2
;extern hex_encode_avx512f
extern hex_encode_avx512bw
extern hex_decode_sse2
extern hex_decode_avx2
extern hex_decode_avx512bw
extern get_instr_info
extern base64_encode_ssse3
extern base64_encode_avx2
extern base64_encode_avx512bw
extern base64_decode_ssse3
extern base64_decode_avx2
extern base64_decode_avx512bw
extern HEX_BENCH_BIN_START
extern HEX_BENCH_BIN_TRAIL
%ifidn __OUTPUT_FORMAT__, win64
%define __WIN__ 1
%elifidn __OUTPUT_FORMAT__, elf64
%define __ELF__ 1
%endif
; LINUX call-convention 64 bit
; -----------------------------------
; So %rdi, %rsi, %rdx, %rcx, %r8 and %r9 are the registers in order used to pass integer/pointer (i.e. INTEGER class)
; parameters to any libc function from assembly.
; %rdi is used for the first INTEGER parameter. %rsi for 2nd, %rdx for 3rd and so on.
; Then call instruction should be given.
; The stack (%rsp) must be 16B-aligned when call executes.
; BENCH results
; length of 300KB-Source-PDF is 0x54756 = 345942 bytes
; 1 million
; Celeron G5905 ca. 3.5 GHz
; NUC core i3 8109U
; G5905 Hex-encode SSE2 50 sec = 6.919 GB /sec
; NUC Hex-encode SSE2 49 sec = 7.060 GB /sec
; Hex-encode AVX2 50 sec = 6.919 GB /sec
; NUC Hex-encode AVX2 27 sec = 12.813 GB /sec
; NUC BASe64 Encode AVX2 23 sec = 15.040 GB/sec
; BENCH results
; length of 1300KB-NASM.EXE(win) is 0x153000 = 1388544 bytes = 1356 KB
; 200000 loop count
; Celeron G5905 ca. 3.5 GHz
; NUC core i3 8109U
; Tigerlake i5-1135G7 2.4 GHz- 4.2 GHz
; IceLake XEON Silver 4314 2.4 Ghz-4.7 GHz
; G5905 Hex-encode SSE2 50 sec = 6.919 GB /sec
; NUC Hex-encode SSE2 40 sec = 6.780 GB /sec
; TGL Hex-encode SSE2 27 sec = 9.81 GB /sec
; Hex-encode AVX2 50 sec = 6.919 GB /sec
; NUC Hex-encode AVX2 27 sec = 10.044 GB /sec
; NUC BASe64 Encode AVX2 23 sec = XXX15.040 GB/sec
; 1 Million = 1356 GB
; TGL Hex-encode AVX512bw 84 sec = 16.1 GB /sec
; TGL Hex-encode AVX2 27 sec = 9.81 GB /sec
; TGL Hex-encode SSSE3 27 sec = 9.81 GB /sec
; TGL Hex-encode SSE2 27 sec = 9.81 GB /sec
; ICL Hex-encode SSSE3 122 sec = GB /sec
; ICL Hex-encode AVX2 117 sec = GB /sec
; ICL Hex-encode AVX512BW 113 sec = GB /sec
; LINUX Fedora 35
; length of 1300KB-nasm(linux) is 1759032 bytes = 1718 KB
; 1 Million = 1718 GB
; TGL Hex-encode AVX512bw 95 sec = 18.1 GB /sec
default rel
section .bss align=64
HEXENCODE_OUT_ARR:
resq 1024*4096
;HEXDECODE_OUT_ARR:
section .text align=32
global start
global main
%use smartalign
ALIGNMODE p6
;----------------------------------------------------------------------------------------------
; start / main: benchmark driver.
; Calls the selected encoder 1000000 times with
;   output buffer = HEXENCODE_OUT_ARR
;   input buffer  = HEX_BENCH_BIN_START
;   length        = HEX_BENCH_BIN_TRAIL - HEX_BENCH_BIN_START
; and returns 0 in rax.
;
; Stack layout: the callee-saved registers are pushed FIRST and the 32 byte
; Win64 home (shadow) space is reserved LAST, so that the home space is the
; lowest part of the frame at the call. A Win64 callee is allowed to overwrite
; its home space; with the previous order (sub rsp,32 before the pushes) the
; home space overlapped the saved rdi/rsi/r15/r14.
; Entered via a call: rsp%16 == 8 on entry, 1+4 pushes and 32 bytes keep rsp
; 16 byte aligned at the call instruction.
start:
;_start:
main:
        push    rbp
        mov     rbp,rsp
        push    rdi                             ; callee-saved on Win64
        push    rsi                             ; callee-saved on Win64
        push    r15
        push    r14                             ; r14 = loop counter, must survive the calls
        sub     rsp,32                          ; home (shadow) space for the callee
; call get_instr_info
; mov r14,2; 5*200000 ;1000000
        mov     r14,1000000                     ; number of benchmark rounds
LBENCH_LOOP:
        ; arguments are rebuilt every round because the callee may clobber them
        lea     rsi,[HEX_BENCH_BIN_START]       ; parameter 2 input buffer
        lea     rdx,[HEX_BENCH_BIN_TRAIL]
        sub     rdx,rsi                         ; parameter 3 number of elements
; mov rdx,512 ;[HEXENCODE_INP_N_ELEM] ; rdx = number of elements
        lea     rdi,[HEXENCODE_OUT_ARR]         ; parameter 1 output buffer
%ifdef __WIN__
        ; same three parameters in Windows register order (rcx, rdx, r8)
        mov     rcx,rdi                         ; parameter 1 output buffer
        mov     r8,rdx                          ; parameter 3 number of elements
        mov     rdx,rsi                         ; parameter 2 input buffer
%endif
; call hex_encode_fast
; call hex_encode_sse2
; call hex_encode_ssse3
; call hex_encode_avx2
        call    hex_encode_avx512bw
; call base64_encode_ssse3
; call base64_encode_avx2
; call base64_encode_avx512bw
        sub     r14,1
        jnz     LBENCH_LOOP
        xor     eax,eax                         ; return value 0
        nop
        add     rsp,32                          ; drop the home space
        pop     r14
        pop     r15
        pop     rsi
        pop     rdi
        pop     rbp
        ret
HEX_BENCH_DATA_1300KB.asm 0000644 0001750 0001750 00000001547 14163614744 015313 0 ustar buschmann buschmann ;---------------------------------------
;Win64 NASM Example: Using base6.obj + GoLink
;
; debug
; nasm -f WIN64 -g HEX_BENCH_DATA_1300KB.asm -l HEX_BENCH_DATA_1300KB.lis
;;;;; golink /console id_decomp_nasm.obj GAIA_IDS_L0HP_6.dll base6.obj msvcrt.dll kernel32.dll /files
; nasm -f elf64 -g HEX_BENCH_DATA_1300KB.asm -l HEX_BENCH_DATA_1300KB.lis
; HEX_BENCH_DATA_1GB.dll
; Benchmark payload: a binary file is embedded between the two exported labels.
;   HEX_BENCH_BIN_START  first byte of the embedded file
;   HEX_BENCH_BIN_TRAIL  first byte after the embedded file, followed by
;                        8 KB of zero bytes
        global  HEX_BENCH_BIN_START
        global  HEX_BENCH_BIN_TRAIL

; the file to embed depends on the output format
%ifidn __OUTPUT_FORMAT__, win64
        export  HEX_BENCH_BIN_START
        export  HEX_BENCH_BIN_TRAIL
  %define BENCH_PAYLOAD_FILE "N:\tools_hb\nasm\nasm.exe"
; alternative payloads:
;  "N:\d\os\ProxMox\Proxmox-VE-Datasheet.pdf"
;  "N:\d\os\ProxMox\proxmox-ve_6.4-1.iso"
%else
  %define BENCH_PAYLOAD_FILE "/usr/bin/nasm"
%endif

        default rel

        section .rdata align=64

HEX_BENCH_BIN_START:
        incbin  BENCH_PAYLOAD_FILE
HEX_BENCH_BIN_TRAIL:
        times 1024*8 db 0                       ; zero padding after the payload
;
hex_x86_64.asm 0000644 0001750 0001750 00000274057 14163614744 014131 0 ustar buschmann buschmann %ifdef __NASM_MAJOR__
%ifdef COMPILE_C_STYLE_COMMENTS
/*-------------------------------------------------------------------------
*
* hex_x86_64.asm
* Assembler routines for converting a buffer to hex (hex_encode_xxx)
* and restore the binary from hex code (hex_decode_xxx) on Intel X64
*
* Copyright (c) 2021-2022, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/hex_x86_64.asm
*
*-------------------------------------------------------------------------
*/
%endif
; hex_x86_64.asm
; Assembler routines for converting a buffer to hex (hex_encode_xxx)
; and restore the binary from hex_code (hex_decode_xxx) on Intel X64
; nasm -f WIN64 -g hex_x86_64.asm -l hex_x86_64.lis
; golink /console hexdump.obj hex_x86_64.obj base64_x86_64.obj /files
; Linux register order: %rdi, %rsi, %rdx, %rcx, %r8 and %r9
; Windows register order: rcx, rdx, r8, r9
; Windows non volatile registers: rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
; Linux non volatile registers: rbx,rbp, rsp, r12,r13,r14,r15
; nasm -f elf64 -g hex_x86_64.asm -l hex_x86_64_elf64.lis
%ifidn __OUTPUT_FORMAT__, win64
%define __WIN__ 1
%elifidn __OUTPUT_FORMAT__, elf64
%define __ELF__ 1
%endif
%define NSHIFT_ADDRESS_TO_PAGE 12
%define N_BYTES_PER_SSE2 16
%define N_BYTES_PER_AVX2 32
%define N_BYTES_PER_AVX512 64
global get_hex_encode_alloc_addon
global get_hex_decode_alloc_addon
global hex_encode_fast
global hex_encode_sse2
global hex_encode_ssse3
global hex_encode_avx2
global hex_encode_avx512bw
global hex_decode_sse2
global hex_decode_avx2
global hex_decode_avx512bw
default rel
; Read-only constants and dispatch tables of the hex encode/decode routines.
; The section starts 64 byte aligned; every constant below occupies a multiple
; of 32 bytes, so the vector constants keep at least 32 byte alignment.
section .rdata align=64
; values loaded with VMOVDQA64 in AVX512, so 64 bytes needed
%define VPERM_AVX2_OFFS 0b11_01_10_00
VPERM_ENCODE_OFFSETS dq 0,4,1,5,2,6,3,7
VPERM_DECODE_OFFSETS dq 0,2,4,6,1,3,5,7
; the 16 hex digits (lower case), repeated for each of the four 16 byte lanes
ENCODE_SHUFFLE_TO_HEX times 4 db '0123456789abcdef'
ENCODE_SHUFFLE_TO_HIGH_LOW times 4 db 8,0,9,1, 10,2,11,3, 12,4,13,5, 14,6,15,7
; from here on values used with VPBROADCASTQ in AVX512 / VMOVDQA in AVX2, so only 16/32 bytes needed
;BITMASK_UPPER_HALF times 32 db 0b1111_0000
; every byte 0x0F: keeps the low nibble of each byte
BITMASK_LOWER_HALF times 32 db 0b0000_1111
; every word 0x0F00: keeps the low nibble of the upper byte of each word
BITMASK_NIBBLE_3_IN_WORD times 16 dw 0x0F00
; every byte 0xDF: clears bit 5 (0x20) of each byte
BITMASK_LITTLE_TO_BIG_ASCII times 32 db 0b1101_1111
; every byte 0x20: ORing it sets bit 5 of each byte
BITMASK_BIG_TO_LITTLE_ASCII times 32 db 0b0010_0000
BITMASK_ZERO_ONE times 32 db 0b0101_0101
BITMASK_ONE_ZERO times 32 db 0b1010_1010
BITMASK_SELECT_DIGIT times 32 db 0b0011_1111
; every byte 9: largest nibble value that maps to an ASCII digit
ALL_BYTES_9 times 32 db 9
; ASCII_LITTLE_A_ADD is a second name for ALL_BYTES_39
; 39 = 'a' - '0' - 10: extra amount added for nibble values 10..15
ASCII_LITTLE_A_ADD:
ALL_BYTES_39 times 32 db 39
; ASCII_0_OFFSET is a second name for ALL_BYTES_48 (48 = ASCII '0')
ASCII_0_OFFSET:
ALL_BYTES_48 times 32 db 48
;ASCII_DIGIT_9 times 32 db 48+9
ASCII_LETTER_LITTLE_A times 32 db 'a'
ASCII_LETTER_LITTLE_F times 32 db 'f'
; ---------------------------------------------------------------------------
; Encode tables, one qword per impl_id, all three indexed with the same impl_id:
;   0 = no implementation, 1 = sse2, 2 = ssse3, 3 = avx2, 4 = avx512bw
; ---------------------------------------------------------------------------
HEX_ENCODE_ARRAYS:
; minimum source length for which the implementation with this impl_id is chosen
HEX_ENC_MIN_SRC_LEN_ARR:
dq 0
dq 128
dq 512
dq 512
dq 1024
; allocation addon returned by get_hex_encode_alloc_addon for this impl_id
HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR:
dq 0
dq 64
dq 128
dq 128
dq 256
; entry point called by hex_encode_fast for this impl_id
HEX_ENC_IMPL_ROUTINE_ARR:
dq 0
dq hex_encode_sse2
dq hex_encode_ssse3
dq hex_encode_avx2
dq hex_encode_avx512bw
; ---------------------------------------------------------------------------
; Decode tables, one qword per impl_id:
;   0 = no implementation, 1 = sse2, 2 = avx2, 3 = avx512bw
; ---------------------------------------------------------------------------
HEX_DECODE_ARRAYS:
; minimum source length for which the implementation with this impl_id is chosen
HEX_DEC_MIN_SRC_LEN_ARR:
dq 0
dq 128
dq 512
dq 1024
; allocation addon returned by get_hex_decode_alloc_addon for this impl_id
HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR:
dq 0
dq 64
dq 128
dq 256
; decode entry point for this impl_id
HEX_DEC_IMPL_ROUTINE_ARR:
dq 0
dq hex_decode_sse2
dq hex_decode_avx2
dq hex_decode_avx512bw
section .text align=32
%use smartalign
ALIGNMODE p6
%ifdef __WIN__
%define STACK_FOR_XMM 10*16
%else
%define STACK_FOR_XMM 0
%endif
;----------------------------------------------------------------------------------------------
; get_hex_encode_alloc_addon returns the tail-handling-required allocation addon
; according to the request length and the maximum valid impl_id
; it looks for the correct values in the hex_enc_tables indexed by impl_id
; In:   parameter 1 = requested source length   (Windows rcx / Linux rdi)
;       parameter 2 = maximum valid impl_id     (Windows rdx / Linux rsi)
; Out:  rax = HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR[impl_id] of the highest impl_id
;             (not above parameter 2) whose minimum source length is reached;
;             entry 0 when none qualifies
; Uses: rcx, rdx, r8, flags
get_hex_encode_alloc_addon:
        sub     rsp,0x28
%ifndef __WIN__
        ; non-Windows: bring both arguments into the registers used below
        mov     rcx,rdi                         ; requested source len
        mov     rdx,rsi                         ; maximum valid impl_id
%endif
        lea     r8,[HEX_ENC_MIN_SRC_LEN_ARR]
.try_impl:
        cmp     rcx,[r8+rdx*8]                  ; length >= minimum length of this impl_id ?
        jnl     .impl_chosen
        dec     rdx                             ; no: try the next lower impl_id
        jnz     .try_impl                       ; impl_id 0 ends the search
.impl_chosen:
        lea     r8,[HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR]
        mov     rax,[r8+rdx*8]                  ; alloc addon of the selected impl_id
        add     rsp,0x28
        ret
;----------------------------------------------------------------------------------------------
; get_hex_decode_alloc_addon returns the tail-handling-required allocation addon
; according to the request length and the maximum valid impl_id
; It looks for the correct values in the decode tables (HEX_DECODE_ARRAYS) indexed by impl_id
; In:   parameter 1 = requested source length   (Windows rcx / Linux rdi)
;       parameter 2 = maximum valid impl_id     (Windows rdx / Linux rsi)
; Out:  rax = HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR[impl_id] of the highest impl_id
;             (not above parameter 2) whose minimum source length is reached;
;             entry 0 when none qualifies
; Uses: rcx, rdx, r8, flags
get_hex_decode_alloc_addon:
        sub     rsp,0x28
%ifndef __WIN__
        ; non-Windows: bring both arguments into the registers used below
        mov     rcx,rdi                         ; requested source len
        mov     rdx,rsi                         ; maximum valid impl_id
%endif
        lea     r8,[HEX_DEC_MIN_SRC_LEN_ARR]
.try_impl:
        cmp     rcx,[r8+rdx*8]                  ; length >= minimum length of this impl_id ?
        jnl     .impl_chosen
        dec     rdx                             ; no: try the next lower impl_id
        jnz     .try_impl                       ; impl_id 0 ends the search
.impl_chosen:
        lea     r8,[HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR]
        mov     rax,[r8+rdx*8]                  ; alloc addon of the selected impl_id
        add     rsp,0x28
        ret
;----------------------------------------------------------------------------------------------
; hex_encode_fast is the dispatcher routine according to the cpu capabilities and
; the length of the encode request.
;
; Parameter 4 (moved to r15) is the maximum valid impl_id fulfilling the cpu requirements
; (determined at program initialization time outside this routine)
; The index into the HEX_ENCODE_ARRAYS is set to the maximum supported requirements.
; When r15 == 0 no fast encode is supported and a zero length is returned.
; Frame of hex_encode_fast (STACK_ADJ = 56 bytes, rsp is 16 byte aligned at the call):
;   [rsp+0x00 .. rsp+0x1F]  home (shadow) space for the called implementation
;   [rsp+0x20]              saved r9
;   [rsp+0x28]              saved r15
;   [rsp+0x30]              alignment padding
; The save slots are placed ABOVE the 32 byte home space: a Win64 callee may
; overwrite its home space, so registers saved at [rsp+0]/[rsp+8] could be lost
; across the indirect call.
%define STACK_ADJ 0x28+2*8
%define HEX_FAST_SAVE_OFFS 0x20
; In:   parameters 1-3 are passed through unchanged to the selected routine
;       parameter 3 = number of elements        (Windows r8 / Linux rdx)
;       parameter 4 = maximum valid impl_id     (Windows r9 / Linux rcx)
; Out:  rax = return value of the selected routine, or 0 when no implementation
;             qualifies (impl_id 0)
hex_encode_fast:
        sub     rsp,STACK_ADJ
        mov     [rsp+HEX_FAST_SAVE_OFFS+0*8],r9
        mov     [rsp+HEX_FAST_SAVE_OFFS+1*8],r15
; r15 = checked highest valid index
%ifdef __WIN__
        mov     rax,r8                          ; WIN parameter 3 number of elements
        mov     r15,r9                          ; WIN parameter 4 maximum valid impl_id
%else
        mov     rax,rdx                         ; LINUX parameter 3 number of elements
        mov     r15,rcx                         ; LINUX parameter 4 maximum valid impl_id
%endif
        lea     r10,[HEX_ENC_MIN_SRC_LEN_ARR]
.check_length:
        cmp     rax,[r10+8*r15]                 ; request long enough for this impl_id ?
        jge     .max_length_found
        sub     r15,1                           ; no: try the next lower impl_id
        jnz     .check_length
.max_length_found:
        xor     eax,eax                         ; default return value: nothing encoded
        test    r15,r15
        jz      .return                         ; impl_id 0: no fast implementation usable
        lea     r10,[HEX_ENC_IMPL_ROUTINE_ARR]
        call    [r10+8*r15]                     ; argument registers are still untouched
.return:
        mov     r9,[rsp+HEX_FAST_SAVE_OFFS+0*8]
        mov     r15,[rsp+HEX_FAST_SAVE_OFFS+1*8]
        add     rsp,STACK_ADJ
        ret
%define STACK_ADJ 0x28+6*8+STACK_FOR_XMM
;----------------------------------------------------------------------------------------------
; hex_encode_sse2: encode 32 input bytes per round into 64 lower case hex characters.
;
; In:   parameter 1 = output buffer, parameter 2 = input buffer,
;       parameter 3 = number of input bytes
;       (Windows rcx,rdx,r8 / Linux rdi,rsi,rdx)
; Out:  rax = number of input bytes processed = min(requested, rounds done * 32)
;
; The input of the NEXT round is loaded while the current round is encoded, so
; the routine reads one round ahead of what it encodes. When this read-ahead
; would reach into another memory page than the last requested byte, the loop
; end is lowered and one extra round (rax = 0, reloading the last chunk) is
; run after the normal loop instead.
;
; Register usage:
; xmm15 ; CONST all bytes 9
; xmm14 ; CONST BITMASK_NIBBLE_3_IN_WORD (0x0F00 in every word)
; xmm13 ; CONST all bytes 48 (ASCII '0'), built as 39+9
; xmm12 ; CONST all bytes 39 (additional offset for the letters a-f)
; xmm11 ; not used (still saved/restored on Windows)
; xmm10 ; not used (still saved/restored on Windows)
; xmm9  ; input line 1 (bytes 16..31 of the round)
; xmm8  ; input line 0 (bytes  0..15 of the round)
; xmm7  ; interleaved high halves of line 0 / line 1, later scratch for RL3
; xmm6  ; interleaved low halves of line 0 / line 1, later scratch / result RL3
; xmm5  ; scratch for RL2
; xmm4  ; result RL2 (hex of line 1, bytes 0..7)
; xmm3  ; scratch for RL1
; xmm2  ; result RL1 (hex of line 0, bytes 8..15)
; xmm1  ; scratch for RL0 / RL3
; xmm0  ; result RL0 (hex of line 0, bytes 0..7)
%define NINP_BYTES_PER_ROUND 2*16
%define NINP_BITSHIFT 5
hex_encode_sse2:
        sub     rsp,STACK_ADJ
        mov     [rsp+STACK_FOR_XMM+0*8],rdi
        mov     [rsp+STACK_FOR_XMM+1*8],rsi
        mov     [rsp+STACK_FOR_XMM+2*8],r12
        mov     [rsp+STACK_FOR_XMM+3*8],r14
        mov     [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
        ; xmm6-xmm15 are callee-saved on Windows
        MOVDQA  [rsp     ],xmm6
        MOVDQA  [rsp+1*16],xmm7
        MOVDQA  [rsp+2*16],xmm8
        MOVDQA  [rsp+3*16],xmm9
        MOVDQA  [rsp+4*16],xmm10
        MOVDQA  [rsp+5*16],xmm11
        MOVDQA  [rsp+6*16],xmm12
        MOVDQA  [rsp+7*16],xmm13
        MOVDQA  [rsp+8*16],xmm14
        MOVDQA  [rsp+9*16],xmm15
        mov     rdi,rcx                         ; parameter 1 output buffer
        mov     rsi,rdx                         ; parameter 2 input buffer
        mov     rdx,r8                          ; parameter 3 number of elements
%endif
;; load the input of the first round
        MOVDQU  xmm8,[rsi+0*16]                 ; line 0
        MOVDQU  xmm9,[rsi+1*16]                 ; line 1
;; initialize constants
        MOVDQA  xmm15,[ALL_BYTES_9]
        MOVDQA  xmm14,[BITMASK_NIBBLE_3_IN_WORD]
;       MOVDQA  xmm13,[ALL_BYTES_48]
        MOVDQA  xmm12,[ALL_BYTES_39]
        MOVDQA  xmm13,xmm12
        PADDB   xmm13,xmm15                     ; 48 = 39+9
;; do page overshoot checks
        mov     rax,NINP_BYTES_PER_ROUND
        mov     r9,rdx                          ; exact requested number of elements to process
        add     r9,rsi                          ; r9 last valid pointer +1 of requested input buffer
        mov     r10,rsi                         ; r10 saved start of input buffer
        mov     r12,r9                          ; r12 save of end of input buffer+1
        lea     rcx,[rsi+rdx-1]                 ; rcx address of last byte requested to read
        lea     r8,[rdx+NINP_BYTES_PER_ROUND-1]
        shr     r8,NINP_BITSHIFT                ; number of loops
        shl     r8,NINP_BITSHIFT
        add     r8,rsi                          ; r8 address of last byte+1 read in complete loops
        add     r8,NINP_BYTES_PER_ROUND-1       ; r8 address of last byte read in normal loop with overshoot
        mov     r11,r8
; DISABLED for NO OVERSHOOT
; add r11,rax ; r11 address of last byte of prefetched data
        shr     rcx,NSHIFT_ADDRESS_TO_PAGE      ; rcx page number of last byte requested input
        shr     r8,NSHIFT_ADDRESS_TO_PAGE       ; r8 page number of last byte read after normal round
        cmp     rcx,r8                          ; stay on same page
        je      .LSAME_PAGE_IN_ROUND
        sub     rdx,rax                         ; don't overshoot in reading: do one round less
.LSAME_PAGE_IN_ROUND:
        shr     r11,NSHIFT_ADDRESS_TO_PAGE      ; r11 page number of byte after prefetched data
        cmp     rcx,r11
        je      .LSAME_PAGE_IN_PREFETCH
        sub     rdx,rax                         ; don't overshoot in prefetch reading: do one round less
.LSAME_PAGE_IN_PREFETCH:
        add     rdx,rsi                         ; rdx last valid pointer+1 for normal loop
; due to prefetch add one round to end checks
        add     rdx,rax
        add     r9,rax
        mov     r11,rdi                         ; r11 saved start of output buffer
        mov     rcx,NINP_BYTES_PER_ROUND<<1     ; increment of output buffer for each round
;; rsi runs one round ahead: it always points to the chunk that is loaded next
        add     rsi,rax
        align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
.LHEXENCODE_LOOP:
        ; interleave line 0 and line 1: every word = [byte of line 0 | byte of line 1 << 8]
        MOVDQA  xmm6,xmm8
        PUNPCKLBW xmm6,xmm9                     ; low halves (bytes 0..7 of both lines)
        MOVDQA  xmm7,xmm8
        PUNPCKHBW xmm7,xmm9                     ; high halves (bytes 8..15 of both lines)
        ; xmm8 is consumed: load line 0 of the next round directly into it.
        ; (The previous code refilled xmm8/xmm9 from xmm10/xmm11, which were never
        ;  loaded before the loop, so the second round encoded undefined data.)
        MOVDQU  xmm8,[rsi+0*16]
;;
        MOVDQA  xmm4,xmm6
        PSRLW   xmm4,12                         ; RL2 high nibble of the line 1 byte to the lower byte in word
        MOVDQA  xmm5,xmm6
        PAND    xmm5,xmm14                      ; RL2 mask nibble 3 in word (lower nibble shifted 8 bits left)
        ; xmm9 is consumed as well: load line 1 of the next round
        MOVDQU  xmm9,[rsi+1*16]
        add     rsi,rax                         ; advance the read-ahead pointer (rax = 0 in the extra round)
        PSLLW   xmm6,8                          ; RL0 move the line 0 byte to the upper byte of the word
        MOVDQA  xmm0,xmm6
        PSRLW   xmm0,4+8                        ; RL0 high nibble to the lower byte in word
        POR     xmm4,xmm5                       ; RL2 high nibble, low nibble at correct position
        MOVDQA  xmm1,xmm6
        PAND    xmm1,xmm14                      ; RL0 mask nibble 3 in word (lower nibble shifted 8 bits left)
        MOVDQA  xmm6,xmm7
        PSLLW   xmm6,8                          ; RL1 move the line 0 byte to the upper byte of the word
        MOVDQA  xmm5,xmm4
        PCMPGTB xmm5,xmm15                      ; RL2 all letters set to 0xFF, all digits to 0
        POR     xmm0,xmm1                       ; RL0 high nibble, low nibble at correct position
        PADDB   xmm4,xmm13                      ; RL2 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
        MOVDQA  xmm2,xmm6
        PSRLW   xmm2,4+8                        ; RL1 high nibble to the lower byte in word
        MOVDQA  xmm3,xmm6
        PAND    xmm3,xmm14                      ; RL1 mask nibble 3 in word (lower nibble shifted 8 bits left)
        MOVDQA  xmm1,xmm0
        PCMPGTB xmm1,xmm15                      ; RL0 all letters set to 0xFF, all digits to 0
        PAND    xmm5,xmm12                      ; RL2 for all letters set to 39, else 0
; example for 102 (letter f) - 48 (ASCII 0) - 15 (value) = 39 (amount to add for letters)
        POR     xmm2,xmm3                       ; RL1 high nibble, low nibble at correct position
        PAND    xmm1,xmm12                      ; RL0 for all letters set to 39, else 0
        PADDB   xmm4,xmm5                       ; RL2 final result line RL2
        PADDB   xmm0,xmm13                      ; RL0 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
        MOVDQA  xmm3,xmm2
        PCMPGTB xmm3,xmm15                      ; RL1 all letters set to 0xFF, all digits to 0
        PADDB   xmm2,xmm13                      ; RL1 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
        PADDB   xmm0,xmm1                       ; RL0 final result line RL0
        MOVDQA  xmm1,xmm7
        PSRLW   xmm1,12                         ; RL3 high nibble of the line 1 byte to the lower byte in word
        PAND    xmm3,xmm12                      ; RL1 for all letters set to 39, else 0
        PADDB   xmm2,xmm3                       ; RL1 final result line RL1
        MOVDQU  [rdi+0*16],xmm0                 ; RL0 store hex of line 0, bytes 0..7
        PAND    xmm7,xmm14                      ; RL3 mask nibble 3 in word (lower nibble shifted 8 bits left)
        MOVDQA  xmm6,xmm7
        POR     xmm6,xmm1                       ; RL3 high nibble, low nibble at correct position
        MOVDQU  [rdi+1*16],xmm2                 ; RL1 store hex of line 0, bytes 8..15
        MOVDQA  xmm7,xmm6
        PCMPGTB xmm7,xmm15                      ; RL3 all letters set to 0xFF, all digits to 0
        PADDB   xmm6,xmm13                      ; RL3 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
        PAND    xmm7,xmm12                      ; RL3 for all letters set to 39, else 0
        MOVDQU  [rdi+2*16],xmm4                 ; RL2 store hex of line 1, bytes 0..7
        PADDB   xmm6,xmm7                       ; RL3 final result line RL3
        MOVDQU  [rdi+3*16],xmm6                 ; RL3 store hex of line 1, bytes 8..15
        add     rdi,rcx                         ; add the number of processed output bytes
        cmp     rsi,rdx                         ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
        jl      .LHEXENCODE_LOOP
; end of normal loop reached
; we can do one more round when original count has been reduced by one round
        cmp     rax,0                           ; rax == 0: the extra round has just been done
        je      .LFINISH_EXTRA
        cmp     rdx,r9                          ; input buffer length was not reduced when equal
        je      .LFINISH_NORMAL
        sub     rsi,rax                         ; for prefetching the last round, load the last round again
        sub     rdx,rax                         ; adapt end condition for last round also
        xor     rax,rax                         ; rsi is not advanced any more in the extra round
        jmp     .LHEXENCODE_LOOP
.LFINISH_EXTRA:
        add     rsi,NINP_BYTES_PER_ROUND        ; add the extra round to get processed bytes
        jmp     .LFINISH
.LFINISH_NORMAL:
        sub     rsi,NINP_BYTES_PER_ROUND        ; sub the added prefetch round to get processed bytes
.LFINISH:
; r12 = address of requested input bytes+1
; rsi = address of processed input bytes+1
; return the smaller distance to the buffer start
;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
;; sub r9,rax
        mov     rax,r12
        cmp     rsi,r12                         ; get min from r12 (address of requested input) and rsi (address of done input)
        jge     .LCALC_PROCESSED_BYTES
        mov     rax,rsi                         ; rax=address of last valid input byte+1
.LCALC_PROCESSED_BYTES:
        sub     rax,r10                         ; sub the input buffer start address
; rax = number of valid processed input bytes = return value
        cmp     rsi,rdx
        je      .LNO_ZERO_OUT
        ; address after the valid output; the zeroing store itself is disabled
        mov     r15,rax                         ; number of elements to process
        shl     r15,1                           ; number of output bytes
        add     r15,r11                         ; pointer to next byte after full valid output buffer
        PXOR    xmm0,xmm0                       ; all zero
;ZERO MOVDQU [r15],xmm0 ; zero out one register width after last output
.LNO_ZERO_OUT:
%ifdef __WIN__
        MOVDQA  xmm6 ,[rsp     ]
        MOVDQA  xmm7 ,[rsp+1*16]
        MOVDQA  xmm8 ,[rsp+2*16]
        MOVDQA  xmm9 ,[rsp+3*16]
        MOVDQA  xmm10,[rsp+4*16]
        MOVDQA  xmm11,[rsp+5*16]
        MOVDQA  xmm12,[rsp+6*16]
        MOVDQA  xmm13,[rsp+7*16]
        MOVDQA  xmm14,[rsp+8*16]
        MOVDQA  xmm15,[rsp+9*16]
%endif
        mov     rdi,[rsp+STACK_FOR_XMM+0*8]
        mov     rsi,[rsp+STACK_FOR_XMM+1*8]
        mov     r12,[rsp+STACK_FOR_XMM+2*8]
        mov     r14,[rsp+STACK_FOR_XMM+3*8]
        mov     r15,[rsp+STACK_FOR_XMM+4*8]
        add     rsp,STACK_ADJ
        ret
;----------------------------------------------------------------------------------------------
; hex_decode_sse2: decode 64 hex characters per round into 32 output bytes.
;
; In:   parameter 1 = output buffer, parameter 2 = input buffer (hex characters),
;       parameter 3 = number of input characters
;       (Windows rcx,rdx,r8 / Linux rdi,rsi,rdx)
; Out:  rax = 2 * number of output bytes written = number of input characters consumed
;
; The requested length is rounded up to full rounds of 64 characters. The input
; of the next round is loaded while the current round is decoded.
; While decoding, three check values are accumulated in xmm12/xmm13/xmm14.
; NOTE(review): this routine does not evaluate them before returning.
;
; Register usage:
; xmm15 ; CONST all bytes 9
; xmm14 ; running MAX over all input bytes ORed with 0x20 (initialized to 'f')
; xmm13 ; running MIN over all non-digit input bytes (initialized to 'a')
; xmm12 ; ORed result of "classified as digit and low nibble > 9" (initialized to zero)
; xmm11 ; input line 3
; xmm10 ; input line 2
; xmm9  ; input line 1
; xmm8  ; input line 0
; xmm7  ; CONST BITMASK_LOWER_HALF (0x0F in every byte)
; xmm6  ; scratch: 0x10 bytes, all ones, 0x00FF word mask
; xmm5  ; line 3: lower cased copy, then "is digit" mask, then result scratch
; xmm4  ; line 2: lower cased copy, then "is digit" mask, then result
; xmm3  ; scratch
; xmm2  ; scratch; holds 0x20 in every byte at the loop top
; xmm1  ; line 1: lower cased copy, then "is digit" mask, then result scratch
; xmm0  ; line 0: lower cased copy, then "is digit" mask, then result
%define NINP_BYTES_PER_ROUND 4*16
%define NINP_BITSHIFT 6
hex_decode_sse2:
        sub     rsp,STACK_ADJ
        mov     [rsp+STACK_FOR_XMM+0*8],rdi
        mov     [rsp+STACK_FOR_XMM+1*8],rsi
        mov     [rsp+STACK_FOR_XMM+2*8],r12
        mov     [rsp+STACK_FOR_XMM+3*8],r14
        mov     [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
        ; xmm6-xmm15 are callee-saved on Windows
        MOVDQA  [rsp     ],xmm6
        MOVDQA  [rsp+1*16],xmm7
        MOVDQA  [rsp+2*16],xmm8
        MOVDQA  [rsp+3*16],xmm9
        MOVDQA  [rsp+4*16],xmm10
        MOVDQA  [rsp+5*16],xmm11
        MOVDQA  [rsp+6*16],xmm12
        MOVDQA  [rsp+7*16],xmm13
        MOVDQA  [rsp+8*16],xmm14
        MOVDQA  [rsp+9*16],xmm15
        mov     rdi,rcx                         ; parameter 1 output buffer
        mov     rsi,rdx                         ; parameter 2 input buffer
        mov     rdx,r8                          ; parameter 3 number of elements
%endif
;; load the input of the first round
        MOVDQU  xmm8,[rsi]
        MOVDQU  xmm9,[rsi+1*16]
        MOVDQU  xmm10,[rsi+2*16]
        MOVDQU  xmm11,[rsi+3*16]
;; initialize constants
        mov     r15,[BITMASK_BIG_TO_LITTLE_ASCII] ; 8 bytes of 0x20, kept in r15 to rebuild xmm2 each round
        MOVDQA  xmm7,[BITMASK_LOWER_HALF]
        MOVDQA  xmm15,[ALL_BYTES_9]
        MOVDQA  xmm14,[ASCII_LETTER_LITTLE_F]
        MOVDQA  xmm13,[ASCII_LETTER_LITTLE_A]
        PXOR    xmm12,xmm12                     ; all zero
        MOVQ    xmm2,r15                        ; 0b0010_0000
;; do page overshoot checks
;; due to end condition handling not done here, we only process full rounds
        mov     rax,NINP_BYTES_PER_ROUND
        add     rdx,NINP_BYTES_PER_ROUND-1
        shr     rdx,NINP_BITSHIFT
        shl     rdx,NINP_BITSHIFT               ; rdx number of bytes read in normal loop equiv to xxx full loops
        mov     r9,rdx                          ; number of elements to process (full rounds)
        add     r9,rsi                          ; r9 last valid pointer +1 of requested input buffer
        mov     r10,rsi                         ; r10 saved start of input buffer
        mov     r12,r9                          ; r12 save of end of input buffer+1
        lea     rcx,[rsi+rdx-1]                 ; rcx address of last byte requested to read
        mov     r11,r9
; DISABLED for NO OVERSHOOT
; add r11,rax ; r11 address of last byte of prefetched data
        shr     rcx,NSHIFT_ADDRESS_TO_PAGE      ; rcx page number of last byte requested input
        shr     r11,NSHIFT_ADDRESS_TO_PAGE      ; r11 page number of byte after prefetched data
        cmp     rcx,r11
        je      .LSAME_PAGE_IN_PREFETCH
        sub     rdx,rax                         ; don't overshoot in prefetch reading: do one round less
.LSAME_PAGE_IN_PREFETCH:
        add     rdx,rsi                         ; rdx last valid pointer+1 for normal loop
; due to prefetch add one round to end checks
        add     rdx,rax
        add     r9,rax
        mov     r11,rdi                         ; r11 saved start of output buffer
        mov     rcx,NINP_BYTES_PER_ROUND>>1     ; increment of output buffer for each round
;; start preprocessing before loop
        PUNPCKLQDQ xmm2,xmm2                    ; all bytes 0b0010_0000
; PUNPCKLQDQ xmm7,xmm7 ; all bytes 0b0000_1111
        MOVDQA  xmm0,xmm2
        MOVDQA  xmm1,xmm2
        MOVDQA  xmm4,xmm2
        MOVDQA  xmm5,xmm2
        add     rsi,rax                         ; rsi runs one round ahead (next chunk to load)
        align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
.LHEXDECODE_LOOP:
        ; on entry: xmm0,xmm1,xmm2,xmm4,xmm5 = 0x20 in every byte
        MOVDQA  xmm6,xmm2
        PSRAD   xmm6,1                          ; all bytes 0b0001_0000
        POR     xmm0,xmm8                       ; line 0 all letters set to little ASCII a-f
        POR     xmm1,xmm9
        POR     xmm4,xmm10
        POR     xmm5,xmm11
        PMAXUB  xmm14,xmm0
        PMAXUB  xmm14,xmm1
        PMAXUB  xmm14,xmm4
        PMAXUB  xmm14,xmm5
;max check finished
        POR     xmm0,xmm6                       ; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx)
        POR     xmm1,xmm6
        POR     xmm4,xmm6
        POR     xmm5,xmm6
        PCMPEQD xmm6,xmm6                       ; all ONE
        PCMPEQB xmm0,xmm8                       ; set to all ONE when ASCII digit (forced bits 0bxx11_xxxx equal to orig value)
        PCMPEQB xmm1,xmm9
        PCMPEQB xmm4,xmm10
        PCMPEQB xmm5,xmm11
;start min check line0+1
        MOVDQA  xmm2,xmm0                       ; copy all one when digit
        MOVDQA  xmm3,xmm1
        PANDN   xmm2,xmm6                       ; set to all one for values NOT digits
        PANDN   xmm3,xmm6
        PAND    xmm2,xmm8                       ; set to orig value when NOT ASCII digit
        PAND    xmm3,xmm9
        POR     xmm2,xmm0                       ; set all zero bytes (digits) to all one
        POR     xmm3,xmm1
        PMINUB  xmm13,xmm2
        PMINUB  xmm13,xmm3
;start min check line2+3
        MOVDQA  xmm2,xmm4                       ; copy all one when digit
        MOVDQA  xmm3,xmm5
        PANDN   xmm2,xmm6                       ; set to all one for values NOT digits
        PANDN   xmm3,xmm6
        PAND    xmm2,xmm10                      ; set to orig value when NOT ASCII digit
        PAND    xmm3,xmm11
        POR     xmm2,xmm4                       ; set all zero bytes (digits) to all one
        POR     xmm3,xmm5
        PMINUB  xmm13,xmm2
        PMINUB  xmm13,xmm3
; start legal digit check line0+1
        MOVDQA  xmm2,xmm0                       ; copy all one when digit
        MOVDQA  xmm3,xmm1
        PAND    xmm2,xmm8                       ; set to orig value when ASCII digit
        PAND    xmm3,xmm9
        PAND    xmm2,xmm7                       ; set to lower nibble value when ASCII digit
        PAND    xmm3,xmm7                       ; (was a second PAND xmm2,xmm7: line 1 stayed unmasked)
        PCMPGTB xmm2,xmm15                      ; set to all ONE when ASCII digit and value > 9
        PCMPGTB xmm3,xmm15
        POR     xmm12,xmm2                      ; accumulate illegal chars like ASCII digit and value > 9
        POR     xmm12,xmm3
; legal digit check line2+3
        MOVDQA  xmm2,xmm4                       ; copy all one when digit (these two copies were missing,
        MOVDQA  xmm3,xmm5                       ;  so the check ran on the compare results of line 0+1)
        PAND    xmm2,xmm10                      ; set to orig value when ASCII digit
        PAND    xmm3,xmm11
        PAND    xmm2,xmm7                       ; set to lower nibble value when ASCII digit
        PAND    xmm3,xmm7
        PCMPGTB xmm2,xmm15                      ; set to all ONE when ASCII digit and value > 9
        PCMPGTB xmm3,xmm15
        POR     xmm12,xmm2
        POR     xmm12,xmm3
;-- all checks accumulated; now convert line 0+1
        PCMPEQD xmm6,xmm6                       ; all ONE
        PSRLW   xmm6,8                          ; every word 0x00FF
        MOVDQA  xmm2,xmm7                       ; all bytes 0b0000_1111
        MOVDQA  xmm3,xmm7
        PAND    xmm2,xmm8                       ; all byte values only lower half (nibble) line 0+1
        MOVDQU  xmm8,[rsi+0*16]                 ; line 0 consumed: load line 0 of the next round
        PAND    xmm3,xmm9
        MOVDQU  xmm9,[rsi+1*16]                 ; line 1 of the next round
        PANDN   xmm0,xmm15                      ; put 9 to every element not DIGIT
        PANDN   xmm1,xmm15
        PADDB   xmm2,xmm0                       ; add 9 to every nibble not DIGIT ('a' -> 1+9 = 10)
        PADDB   xmm3,xmm1
        ; combine per word: first char = high nibble, second char = low nibble
        MOVDQA  xmm0,xmm2
        PSRLW   xmm0,8                          ; value of the second char to the lower byte
        PSLLW   xmm2,4                          ; value of the first char to bits 4..7
        MOVDQA  xmm1,xmm3
        PSRLW   xmm1,8
        PSLLW   xmm3,4
        POR     xmm0,xmm2
        POR     xmm1,xmm3
        PAND    xmm0,xmm6                       ; line 0: keep the lower byte of every word
        PAND    xmm1,xmm6                       ; line 1
        PACKUSWB xmm0,xmm1                      ; 2 * 8 words -> 16 result bytes
; line 0 and 1 processed
        MOVDQA  xmm2,xmm7                       ; all bytes 0b0000_1111
        MOVDQA  xmm3,xmm7
        PAND    xmm2,xmm10                      ; all byte values only lower half (nibble) line 2+3
        MOVDQU  xmm10,[rsi+2*16]                ; line 2 of the next round
        PAND    xmm3,xmm11
        MOVDQU  xmm11,[rsi+3*16]                ; line 3 of the next round
        PANDN   xmm4,xmm15                      ; put 9 to every element not DIGIT
        PANDN   xmm5,xmm15
        PADDB   xmm2,xmm4                       ; add 9 to every nibble not DIGIT
        PADDB   xmm3,xmm5
        add     rsi,rax                         ; advance the read-ahead pointer (rax = 0 in the extra round)
        MOVDQU  [rdi+0*16],xmm0                 ; store result of line 0+1
        MOVDQA  xmm4,xmm2
        PSRLW   xmm4,8                          ; value of the second char to the lower byte
        PSLLW   xmm2,4                          ; value of the first char to bits 4..7
        MOVDQA  xmm5,xmm3
        PSRLW   xmm5,8
        PSLLW   xmm3,4
        POR     xmm4,xmm2
        POR     xmm5,xmm3
        ; rebuild the 0x20 constants for the next round
        MOVQ    xmm2,r15
        PUNPCKLQDQ xmm2,xmm2                    ; all bytes 0b0010_0000
        MOVDQA  xmm0,xmm2
        MOVDQA  xmm1,xmm2
; MOVQ xmm7,rcx ;
        PAND    xmm4,xmm6                       ; line 2: keep the lower byte of every word
        PAND    xmm5,xmm6                       ; line 3
        PACKUSWB xmm4,xmm5                      ; 2 * 8 words -> 16 result bytes
; MOVDQA xmm1,xmm11
        MOVDQU  [rdi+1*16],xmm4                 ; store result of line 2+3
        MOVDQA  xmm4,xmm2
        MOVDQA  xmm5,xmm2
; PUNPCKLQDQ xmm7,xmm7 ; all bytes 0b0000_1111
        add     rdi,rcx                         ; add the number of processed output bytes
        cmp     rsi,rdx                         ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
        jl      .LHEXDECODE_LOOP
; end of normal loop reached
; we can do one more round when original count has been reduced by one round
        cmp     rax,0                           ; rax == 0: the extra round has just been done
        je      .LFINISH
        cmp     rdx,r9                          ; input buffer length was not reduced when equal
        je      .LFINISH
        sub     rsi,rax                         ; for prefetching the last round, load the last round again
        sub     rdx,rax                         ; adapt end condition for last round also
        xor     rax,rax                         ; rsi is not advanced any more in the extra round
        jmp     .LHEXDECODE_LOOP
.LFINISH:
        mov     rax,rdi
        sub     rax,r11                         ; rax = number of output bytes
        add     rax,rax                         ; rax = number of valid processed input bytes = return value
%ifdef __WIN__
        MOVDQA  xmm6 ,[rsp     ]
        MOVDQA  xmm7 ,[rsp+1*16]
        MOVDQA  xmm8 ,[rsp+2*16]
        MOVDQA  xmm9 ,[rsp+3*16]
        MOVDQA  xmm10,[rsp+4*16]
        MOVDQA  xmm11,[rsp+5*16]
        MOVDQA  xmm12,[rsp+6*16]
        MOVDQA  xmm13,[rsp+7*16]
        MOVDQA  xmm14,[rsp+8*16]
        MOVDQA  xmm15,[rsp+9*16]
%endif
        mov     rdi,[rsp+STACK_FOR_XMM+0*8]
        mov     rsi,[rsp+STACK_FOR_XMM+1*8]
        mov     r12,[rsp+STACK_FOR_XMM+2*8]
        mov     r14,[rsp+STACK_FOR_XMM+3*8]
        mov     r15,[rsp+STACK_FOR_XMM+4*8]
        add     rsp,STACK_ADJ
        ret
;----------------------------------------------------------------------------------------------
; hex_decode_avx2 -- decode ASCII hex characters to bytes, 4 x 32 input bytes per round (AVX2)
;
; Arguments (non-Windows build): rdi = output buffer, rsi = input buffer, rdx = element count
; Arguments (__WIN__ build)    : rcx, rdx, r8 carry the same values and are copied to rdi/rsi/rdx
; Returns                      : rax = 2 * (number of output bytes written)
;
; Notes
;  * The element count is rounded UP to a multiple of NINP_BYTES_PER_ROUND and every round
;    preloads the input of the following round, so the input buffer is read beyond the
;    requested length (subject to the page check below).
;  * The validation accumulators ymm12/ymm13/ymm14 are maintained in the loop but are not
;    evaluated by this function.
;
; Register roles inside the loop
; ymm15 ; CONST ALL_BYTES_9
; ymm14 ; running MAX of the lower-cased input bytes (initialised from ASCII_LETTER_LITTLE_F)
; ymm13 ; running MIN of the bytes that are not in the digit class (init ASCII_LETTER_LITTLE_A)
; ymm12 ; ORed flags: byte in the digit class whose low nibble is > 9 (init all zero)
; ymm11 ; Input line 3 (preloaded for the next round while the current one is processed)
; ymm10 ; Input line 2
; ymm9  ; Input line 1
; ymm8  ; Input line 0
; ymm7  ; CONST BITMASK_LOWER_HALF
; ymm6  ; temp: OR-mask for the digit test, then all ONE, then 0x00FF in every word
; ymm5  ; line 3: lower-cased bytes -> digit mask -> decoded words
; ymm4  ; line 2: lower-cased bytes -> digit mask -> decoded words / packed output line 2+3
; ymm3  ; temp
; ymm2  ; BITMASK_BIG_TO_LITTLE_ASCII at loop entry (reloaded every round), temp in between
; ymm1  ; line 1: lower-cased bytes -> digit mask -> decoded words
; ymm0  ; line 0: lower-cased bytes -> digit mask -> decoded words / packed output line 0+1
%define NINP_BYTES_PER_ROUND 4*32
%define NINP_BITSHIFT 7
hex_decode_avx2:
        sub     rsp,STACK_ADJ
        mov     [rsp+STACK_FOR_XMM+0*8],rdi
        mov     [rsp+STACK_FOR_XMM+1*8],rsi
        mov     [rsp+STACK_FOR_XMM+2*8],r12
        mov     [rsp+STACK_FOR_XMM+3*8],r14
        mov     [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
        VMOVDQA [rsp     ],xmm6
        VMOVDQA [rsp+1*16],xmm7
        VMOVDQA [rsp+2*16],xmm8
        VMOVDQA [rsp+3*16],xmm9
        VMOVDQA [rsp+4*16],xmm10
        VMOVDQA [rsp+5*16],xmm11
        VMOVDQA [rsp+6*16],xmm12
        VMOVDQA [rsp+7*16],xmm13
        VMOVDQA [rsp+8*16],xmm14
        VMOVDQA [rsp+9*16],xmm15
        mov     rdi,rcx                         ; parameter 1 output buffer
        mov     rsi,rdx                         ; parameter 2 input buffer
        mov     rdx,r8                          ; parameter 3 number of elements
%endif
;; preload the four input lines of the first round
        VMOVDQU ymm8,[rsi+0*32]
        VMOVDQU ymm9,[rsi+1*32]
        VMOVDQU ymm10,[rsi+2*32]
        VMOVDQU ymm11,[rsi+3*32]
;; initialize constants and accumulators
        VMOVDQA ymm15,[ALL_BYTES_9]
        VMOVDQA ymm14,[ASCII_LETTER_LITTLE_F]   ; start value of the running max
        VMOVDQA ymm13,[ASCII_LETTER_LITTLE_A]   ; start value of the running min
        VMOVDQA ymm7,[BITMASK_LOWER_HALF]       ; 0b0000_1111
        VPXOR   ymm12,ymm12                     ; all zero
        VMOVDQA ymm2,[BITMASK_BIG_TO_LITTLE_ASCII] ; 0b0010_0000
;; do page overshoot checks
;; end condition handling is not done here, only full rounds are processed
        mov     rax,NINP_BYTES_PER_ROUND
        add     rdx,NINP_BYTES_PER_ROUND-1
        shr     rdx,NINP_BITSHIFT
        shl     rdx,NINP_BITSHIFT               ; rdx = count rounded up to full rounds
        mov     r9,rdx
        add     r9,rsi                          ; r9 = end pointer (+1) of the rounded input
        mov     r10,rsi                         ; r10 = saved start of input buffer
        mov     r12,r9                          ; r12 = saved end of input buffer +1
        lea     rcx,[rsi+rdx-1]                 ; rcx = address of last byte read by full rounds
        mov     r11,r9
; DISABLED for NO OVERSHOOT
; add r11,rax ; r11 address of last byte of prefetched data
        shr     rcx,NSHIFT_ADDRESS_TO_PAGE      ; page number of last byte
        shr     r11,NSHIFT_ADDRESS_TO_PAGE      ; page number of first byte after the input
        cmp     rcx,r11
        je      .LSAME_PAGE_IN_PREFETCH
        sub     rdx,rax                         ; preload would touch the next page: one round less
.LSAME_PAGE_IN_PREFETCH:
        add     rdx,rsi                         ; rdx = end pointer for the normal loop
; the preload runs one round ahead, so shift both end markers by one round
        add     rdx,rax
        add     r9,rax
        mov     r11,rdi                         ; r11 = saved start of output buffer
        mov     rcx,NINP_BYTES_PER_ROUND>>1     ; output bytes produced per round
;; start preprocessing before loop
        add     rsi,rax                         ; rsi -> input of the round to preload next
        align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
.LHEXDECODE_LOOP:
        VMOVDQA ymm6,ymm2
        VPSRAD  ymm6,1                          ; letter-case bit shifted right by one (0b0001_0000 per byte)
        VPOR    ymm0,ymm2,ymm8                  ; line 0 with the lower-case bit forced
        VPOR    ymm1,ymm2,ymm9
        VPOR    ymm4,ymm2,ymm10
        VPOR    ymm5,ymm2,ymm11
        VPMAXUB ymm14,ymm0                      ; accumulate max over the lower-cased bytes
        VPMAXUB ymm14,ymm1
        VPMAXUB ymm14,ymm4
        VPMAXUB ymm14,ymm5
;max check finished
        VPOR    ymm0,ymm6                       ; force the digit bits (byte OR 0bxx11_xxxx)
        VPOR    ymm1,ymm6
        VPOR    ymm4,ymm6
        VPOR    ymm5,ymm6
        VPCMPEQD ymm6,ymm6                      ; all ONE
        VPCMPEQB ymm0,ymm8                      ; all ONE where forcing the bits changed nothing = digit class
        VPCMPEQB ymm1,ymm9
        VPCMPEQB ymm4,ymm10
        VPCMPEQB ymm5,ymm11
;start min check line0+1
        VPANDN  ymm2,ymm0,ymm6                  ; all ONE where NOT digit class
        VPANDN  ymm3,ymm1,ymm6
        VPAND   ymm2,ymm8                       ; original byte where NOT digit class, else zero
        VPAND   ymm3,ymm9
        VPOR    ymm2,ymm0                       ; digit positions become 0xFF so they never win the min
        VPOR    ymm3,ymm1
        VPMINUB ymm13,ymm2
        VPMINUB ymm13,ymm3
;start min check line2+3
        VPANDN  ymm2,ymm4,ymm6
        VPANDN  ymm3,ymm5,ymm6
        VPAND   ymm2,ymm10
        VPAND   ymm3,ymm11
        VPOR    ymm2,ymm4
        VPOR    ymm3,ymm5
        VPMINUB ymm13,ymm2
        VPMINUB ymm13,ymm3
; start legal digit check line0+1
        VPAND   ymm2,ymm0,ymm8                  ; original byte where digit class, else zero
        VPAND   ymm3,ymm1,ymm9
        VPAND   ymm2,ymm7                       ; keep the low nibble only
        VPAND   ymm3,ymm7                       ; (was a second AND on ymm2: line 1 stayed unmasked)
        VPCMPGTB ymm2,ymm15                     ; all ONE where digit class and nibble > 9
        VPCMPGTB ymm3,ymm15
        VPOR    ymm12,ymm2                      ; accumulate illegal characters
        VPOR    ymm12,ymm3
; legal digit check line2+3
        VPAND   ymm2,ymm4,ymm10                 ; use the digit masks of line 2/3 (was ANDing the
        VPAND   ymm3,ymm5,ymm11                 ;  previous compare result instead of ymm4/ymm5)
        VPAND   ymm2,ymm7
        VPAND   ymm3,ymm7
        VPCMPGTB ymm2,ymm15
        VPCMPGTB ymm3,ymm15
        VPOR    ymm12,ymm2
        VPOR    ymm12,ymm3
; all (max, min and >9) checks finished
;-- decode line 0 and 1
        VPCMPEQD ymm6,ymm6                      ; all ONE
        VPSRLW  ymm6,8                          ; 0x00FF in every word
        VPAND   ymm2,ymm7,ymm8                  ; low nibble of every character, line 0
        VMOVDQU ymm8,[rsi+0*32]                 ; preload line 0 of the next round
        VPAND   ymm3,ymm7,ymm9                  ; low nibble of every character, line 1
        VMOVDQU ymm9,[rsi+1*32]                 ; preload line 1 of the next round
        VPANDN  ymm0,ymm15                      ; 9 for every character NOT in the digit class
        VPANDN  ymm1,ymm15
        VPADDB  ymm2,ymm0                       ; letters: nibble + 9
        VPADDB  ymm3,ymm1
        VPSRLW  ymm0,ymm2,8                     ; second character of each pair -> low byte of the word
        VPSLLW  ymm2,4                          ; first character of each pair -> high nibble of low byte
        VPSRLW  ymm1,ymm3,8
        VPSLLW  ymm3,4
        VPOR    ymm0,ymm2                       ; low byte of each word = decoded byte
        VPOR    ymm1,ymm3
        VPAND   ymm0,ymm6                       ; clear the high byte of each word, line 0
        VPAND   ymm1,ymm6                       ; line 1
        VPACKUSWB ymm0,ymm1                     ; words -> bytes, packed per 128-bit lane
        VPERMQ  ymm0,ymm0,0xD8                  ; qwords 0,2,1,3: undo the lane interleave of the pack
; line 0 and 1 processed
        VPAND   ymm2,ymm7,ymm10                 ; low nibble of every character, line 2
        VMOVDQU ymm10,[rsi+2*32]                ; preload line 2 of the next round
        VPAND   ymm3,ymm7,ymm11                 ; low nibble of every character, line 3
        VMOVDQU ymm11,[rsi+3*32]                ; preload line 3 of the next round
        VPANDN  ymm4,ymm15                      ; 9 for every character NOT in the digit class
        VPANDN  ymm5,ymm15
        VPADDB  ymm2,ymm4                       ; letters: nibble + 9
        VPADDB  ymm3,ymm5
        add     rsi,rax                         ; advance input (rax is 0 in the extra last round)
        VMOVDQU [rdi+0*32],ymm0                 ; store decoded bytes of line 0+1
        VPSRLW  ymm4,ymm2,8
        VPSLLW  ymm2,4
        VPSRLW  ymm5,ymm3,8
        VPSLLW  ymm3,4
        VPOR    ymm4,ymm2
        VPOR    ymm5,ymm3
        VMOVDQA ymm2,[BITMASK_BIG_TO_LITTLE_ASCII] ; restore the loop-entry constant in ymm2
        VPAND   ymm4,ymm6                       ; line 2
        VPAND   ymm5,ymm6                       ; line 3
        VPACKUSWB ymm4,ymm5                     ; words -> bytes, packed per 128-bit lane
        VPERMQ  ymm4,ymm4,0xD8                  ; qwords 0,2,1,3: undo the lane interleave of the pack
        VMOVDQU [rdi+1*32],ymm4                 ; store decoded bytes of line 2+3
        add     rdi,rcx                         ; advance output
        cmp     rsi,rdx                         ; next preload pointer against loop end
        jb      .LHEXDECODE_LOOP                ; pointers are unsigned
; end of normal loop reached
; one more round is possible when the count was reduced by one round above
        test    rax,rax
        jz      .LFINISH                        ; rax == 0: the extra round has already been done
        cmp     rdx,r9                          ; equal: the loop length was not reduced
        je      .LFINISH
        sub     rsi,rax                         ; preload the last round again instead of reading further
        sub     rdx,rax                         ; adapt the end condition for the last round
        xor     rax,rax                         ; rax = 0 marks the extra round
        jmp     .LHEXDECODE_LOOP
.LFINISH:
        mov     rax,rdi
        sub     rax,r11                         ; rax = number of output bytes
        add     rax,rax                         ; rax = number of processed input bytes = return value
%ifdef __WIN__
        VMOVDQA xmm6 ,[rsp     ]
        VMOVDQA xmm7 ,[rsp+1*16]
        VMOVDQA xmm8 ,[rsp+2*16]
        VMOVDQA xmm9 ,[rsp+3*16]
        VMOVDQA xmm10,[rsp+4*16]
        VMOVDQA xmm11,[rsp+5*16]
        VMOVDQA xmm12,[rsp+6*16]
        VMOVDQA xmm13,[rsp+7*16]
        VMOVDQA xmm14,[rsp+8*16]
        VMOVDQA xmm15,[rsp+9*16]
%endif
        mov     rdi,[rsp+STACK_FOR_XMM+0*8]
        mov     rsi,[rsp+STACK_FOR_XMM+1*8]
        mov     r12,[rsp+STACK_FOR_XMM+2*8]
        mov     r14,[rsp+STACK_FOR_XMM+3*8]
        mov     r15,[rsp+STACK_FOR_XMM+4*8]
        add     rsp,STACK_ADJ
        vzeroupper                              ; clear upper ymm halves before returning to non-VEX code
        ret
;----------------------------------------------------------------------------------------------
; hex_decode_avx512bw -- decode ASCII hex characters to bytes, 4 x 64 input bytes per round
;
; Arguments (non-Windows build): rdi = output buffer, rsi = input buffer, rdx = element count
; Arguments (__WIN__ build)    : rcx, rdx, r8 carry the same values and are copied to rdi/rsi/rdx
; Returns                      : rax = 2 * (number of output bytes written)
;
; Notes
;  * The element count is rounded UP to a multiple of NINP_BYTES_PER_ROUND and every round
;    preloads the input of the following round, so the input buffer is read beyond the
;    requested length (subject to the page check below).
;  * k7 and zmm12-zmm15 are computed for input validation but are not evaluated here;
;    the check against little 'a' is still missing.
;
; Register roles inside the loop
; k7    ; accumulated compare flags "max byte <= little f", initially all ONE
; k6    ; initialised to all ONE, not used otherwise in this function
; k5    ; CONST BITMASK_ZERO_ONE (selects the low byte of every word)
; k4    ; digit flags QL3
; k3    ; digit flags QL2
; k2    ; digit flags QL1
; k1    ; digit flags QL0
; zmm31 ; CONST ALL_BYTES_9
; zmm30 ; CONST BITMASK_LOWER_HALF
; zmm29 ; CONST ASCII_LETTER_LITTLE_F
; zmm28 ; CONST ASCII_LETTER_LITTLE_A (loaded, not used in the loop yet)
; zmm27 ; CONST VPERM_DECODE_OFFSETS
; zmm26 ; CONST BITMASK_BIG_TO_LITTLE_ASCII
; zmm25 ; CONST BITMASK_SELECT_DIGIT
; zmm23 ; Preload QL3
; zmm22 ; Preload QL2
; zmm21 ; Preload QL1
; zmm20 ; Preload QL0
; zmm19 ; Source Load QL3
; zmm18 ; Source Load QL2
; zmm17 ; Source Load QL1
; zmm16 ; Source Load QL0
; zmm15 ; QL3 with lower-case bit set (not consumed yet)
; zmm14 ; QL2 with lower-case bit set (not consumed yet)
; zmm13 ; QL1 with lower-case bit set (not consumed yet)
; zmm12 ; QL0 with lower-case bit set (not consumed yet)
; zmm11 ; QL3 masked for digit test
; zmm10 ; QL2 masked for digit test
; zmm9  ; QL1 masked for digit test
; zmm8  ; QL0 masked for digit test
; zmm7  ; nibble value QL3
; zmm6  ; nibble value QL2
; zmm5  ; nibble value QL1
; zmm4  ; nibble value QL0
; zmm3  ; decoded QL3 / ordered output QL2+QL3
; zmm2  ; decoded QL2 / packed QL2+QL3
; zmm1  ; max QL2,QL3 / decoded QL1 / ordered output QL0+QL1
; zmm0  ; max of all lines / decoded QL0 / packed QL0+QL1
%define NINP_BYTES_PER_ROUND 4*64
%define NINP_BITSHIFT 8
hex_decode_avx512bw:
        sub     rsp,STACK_ADJ
        mov     [rsp+STACK_FOR_XMM+0*8],rdi
        mov     [rsp+STACK_FOR_XMM+1*8],rsi
        mov     [rsp+STACK_FOR_XMM+2*8],r12
        mov     [rsp+STACK_FOR_XMM+3*8],r14
        mov     [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
        VMOVDQA [rsp     ],xmm6
        VMOVDQA [rsp+1*16],xmm7
        VMOVDQA [rsp+2*16],xmm8
        VMOVDQA [rsp+3*16],xmm9
        VMOVDQA [rsp+4*16],xmm10
        VMOVDQA [rsp+5*16],xmm11
        VMOVDQA [rsp+6*16],xmm12
        VMOVDQA [rsp+7*16],xmm13
        VMOVDQA [rsp+8*16],xmm14
        VMOVDQA [rsp+9*16],xmm15
        mov     rdi,rcx                         ; parameter 1 output buffer
        mov     rsi,rdx                         ; parameter 2 input buffer
        mov     rdx,r8                          ; parameter 3 number of elements
%endif
;; preload the four input lines of the first round
        VMOVDQU64 zmm20,[rsi+0*64]              ; QL0
        VMOVDQU64 zmm21,[rsi+1*64]              ; QL1
        VMOVDQU64 zmm22,[rsi+2*64]              ; QL2
        VMOVDQU64 zmm23,[rsi+3*64]              ; QL3
;; initialize constants
        KXNORQ  k7,k7,k7                        ; all one
        VPBROADCASTQ zmm31,[ALL_BYTES_9]
        VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF]
        KXNORQ  k6,k6,k6                        ; all one
        VPBROADCASTQ zmm29,[ASCII_LETTER_LITTLE_F]
        VPBROADCASTQ zmm28,[ASCII_LETTER_LITTLE_A]
        KMOVQ   k5,[BITMASK_ZERO_ONE]
        VMOVDQA64 zmm27,[VPERM_DECODE_OFFSETS]
        VPBROADCASTQ zmm26,[BITMASK_BIG_TO_LITTLE_ASCII]
        VPBROADCASTQ zmm25,[BITMASK_SELECT_DIGIT]
;; do page overshoot checks
;; end condition handling is not done here, only full rounds are processed
        mov     rax,NINP_BYTES_PER_ROUND
        add     rdx,NINP_BYTES_PER_ROUND-1
        shr     rdx,NINP_BITSHIFT
        shl     rdx,NINP_BITSHIFT               ; rdx = count rounded up to full rounds
        mov     r9,rdx
        add     r9,rsi                          ; r9 = end pointer (+1) of the rounded input
        mov     r10,rsi                         ; r10 = saved start of input buffer
        mov     r12,r9                          ; r12 = saved end of input buffer +1
        lea     rcx,[rsi+rdx-1]                 ; rcx = address of last byte read by full rounds
        mov     r11,r9
; DISABLED for NO OVERSHOOT
; add r11,rax ; r11 address of last byte of prefetched data
        shr     rcx,NSHIFT_ADDRESS_TO_PAGE      ; page number of last byte
        shr     r11,NSHIFT_ADDRESS_TO_PAGE      ; page number of first byte after the input
        cmp     rcx,r11
        je      .LSAME_PAGE_IN_PREFETCH
        sub     rdx,rax                         ; preload would touch the next page: one round less
.LSAME_PAGE_IN_PREFETCH:
        add     rdx,rsi                         ; rdx = end pointer for the normal loop
; the preload runs one round ahead, so shift both end markers by one round
        add     rdx,rax
        add     r9,rax
        mov     r11,rdi                         ; r11 = saved start of output buffer
        mov     rcx,NINP_BYTES_PER_ROUND>>1     ; output bytes produced per round
;; start preprocessing before loop
; VPUNPCKHBW zmm1,zmm16,zmm31 ; QL0 p____5 l1 QQ0 [Lin0_LeftH] [00 HL_0 00 HL_1 ...]
; VPUNPCKLBW zmm3,zmm16,zmm31 ; QL0 p____5 l1 QQ0 [Lin0_RghtH] [00 HL_0 00 HL_1 ...]
        add     rsi,rax                         ; rsi -> input of the round to preload next
        align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
; Vector Port info AVX512
; ----------------------------------------
; VPShift p0 l1
; VPMax/Min p0 l1
; VPMUL p0 l5 ; with 2FMA-Units p05 (SKX,CLX etc.)
; VPMOVB2M p0 l3
; VPSUBUSB /SSB p0 l1
; VPALIGNR p5 l1 ;Shift of n*8 bits!
; VPERM p5 l3
; VPERMI2x 1*p05+2*p5 l7 ; (l9 with flags)
; VPCompare p5 l3-l4
; VP Pack/Unpack p5 l1(SKX) l3(TGL)
; VPSHUF p5 l1
.LHEXDECODE_LOOP:
        VMOVDQA64 zmm16,zmm20                   ; QL0 copy preload to load
        VMOVDQA64 zmm17,zmm21                   ; QL1 copy preload to load
        VPANDQ  zmm8,zmm25,zmm20                ; QL0 apply digit-select mask
        VPMAXUB zmm0,zmm20,zmm21                ; QL0,QL1 max of both lines
        VMOVDQA64 zmm18,zmm22                   ; QL2 copy preload to load
        VMOVDQA64 zmm19,zmm23                   ; QL3 copy preload to load
        VPCMPEQB k1,zmm8,zmm20                  ; QL0 is digit: masking changed nothing
        VPANDQ  zmm9,zmm25,zmm21                ; QL1 apply digit-select mask
        VMOVDQU64 zmm20,[rsi+0*64]              ; preload QL0 of the next round
        VMOVDQU64 zmm21,[rsi+1*64]              ; preload QL1 of the next round
        VPANDQ  zmm10,zmm25,zmm18               ; QL2 apply digit-select mask
        VPCMPEQB k2,zmm9,zmm17                  ; QL1 is digit
        VMOVDQU64 zmm22,[rsi+2*64]              ; preload QL2 of the next round
        VMOVDQU64 zmm23,[rsi+3*64]              ; preload QL3 of the next round
        VPANDQ  zmm11,zmm25,zmm19               ; QL3 apply digit-select mask
        VPCMPEQB k3,zmm10,zmm18                 ; QL2 is digit
        VPMAXUB zmm1,zmm18,zmm19                ; QL2,QL3 max of both lines
        VPCMPEQB k4,zmm11,zmm19                 ; QL3 is digit
        add     rsi,rax                         ; advance input (rax is 0 in the extra last round)
        VPORQ   zmm12,zmm26,zmm16               ; QL0 set lower-case bit
        VPANDQ  zmm4,zmm30,zmm16                ; QL0 low nibble of every character
        VPORQ   zmm13,zmm26,zmm17               ; QL1 set lower-case bit
        VPANDQ  zmm5,zmm30,zmm17                ; QL1 low nibble of every character
        VPMAXUB zmm0,zmm0,zmm1                  ; max of all 4 lines
        VPADDB  zmm4,zmm4,zmm31                 ; QL0 add 9 (letters)
        VPORQ   zmm14,zmm26,zmm18               ; QL2 set lower-case bit
        VPANDQ  zmm6,zmm30,zmm18                ; QL2 low nibble of every character
        VPANDQ  zmm7,zmm30,zmm19                ; QL3 low nibble of every character
        VPCMPUB k7{k7},zmm0,zmm29,2             ; keep flag while max <= little f (unsigned; operands
                                                ;  were swapped, which tested f <= max instead)
        VPADDB  zmm5,zmm5,zmm31                 ; QL1 add 9
        VPORQ   zmm15,zmm26,zmm19               ; QL3 set lower-case bit
        VPADDB  zmm6,zmm6,zmm31                 ; QL2 add 9
        VPADDB  zmm7,zmm7,zmm31                 ; QL3 add 9
        VPSUBB  zmm4{k1},zmm4,zmm31             ; QL0 take the 9 back for digits
        VPSUBB  zmm5{k2},zmm5,zmm31             ; QL1 take the 9 back for digits
        VPSUBB  zmm6{k3},zmm6,zmm31             ; QL2 take the 9 back for digits
        VPSUBB  zmm7{k4},zmm7,zmm31             ; QL3 take the 9 back for digits
;
        VPSRLW  zmm0,zmm4,8                     ; QL0 second character of each pair -> low byte
        VPSLLW  zmm4,zmm4,4                     ; QL0 first character -> high nibble (shift 4, was 2)
        VPADDB  zmm0{k5}{z},zmm0,zmm4           ; QL0 decoded byte in the low byte of each word
        VPSRLW  zmm1,zmm5,8                     ; QL1 second character of each pair -> low byte
        VPSLLW  zmm5,zmm5,4                     ; QL1 first character -> high nibble
        VPADDB  zmm1{k5}{z},zmm1,zmm5           ; QL1 decoded byte in the low byte of each word
        VPACKUSWB zmm0,zmm0,zmm1                ; QL0,QL1 words -> bytes, packed per 128-bit lane
        VPERMQ  zmm1,zmm27,zmm0                 ; QL0,QL1 bytes in the right order
;
        VPSRLW  zmm2,zmm6,8                     ; QL2 second character of each pair -> low byte
        VPSLLW  zmm6,zmm6,4                     ; QL2 first character -> high nibble
        VPADDB  zmm2{k5}{z},zmm2,zmm6           ; QL2 decoded byte in the low byte of each word
        VPSRLW  zmm3,zmm7,8                     ; QL3 second character of each pair -> low byte
        VPSLLW  zmm7,zmm7,4                     ; QL3 first character -> high nibble
        VPADDB  zmm3{k5}{z},zmm3,zmm7           ; QL3 decoded byte in the low byte of each word
        VPACKUSWB zmm2,zmm2,zmm3                ; QL2,QL3 words -> bytes, packed per 128-bit lane
        VPERMQ  zmm3,zmm27,zmm2                 ; QL2,QL3 bytes in the right order
; -------- MISSING check for little a
        VMOVDQU64 [rdi+0*64],zmm1               ; store decoded QL0+QL1 (unaligned-safe store)
        VMOVDQU64 [rdi+1*64],zmm3               ; store decoded QL2+QL3 (unaligned-safe store)
        add     rdi,rcx                         ; advance output
        cmp     rsi,rdx                         ; next preload pointer against loop end
        jb      .LHEXDECODE_LOOP                ; pointers are unsigned
; end of normal loop reached
; one more round is possible when the count was reduced by one round above
        test    rax,rax
        jz      .LFINISH                        ; rax == 0: the extra round has already been done
        cmp     rdx,r9                          ; equal: the loop length was not reduced
        je      .LFINISH
        sub     rsi,rax                         ; preload the last round again instead of reading further
        sub     rdx,rax                         ; adapt the end condition for the last round
        xor     rax,rax                         ; rax = 0 marks the extra round
        jmp     .LHEXDECODE_LOOP
.LFINISH:
        mov     rax,rdi
        sub     rax,r11                         ; rax = number of output bytes
        add     rax,rax                         ; rax = number of processed input bytes = return value
%ifdef __WIN__
        VMOVDQA xmm6 ,[rsp     ]
        VMOVDQA xmm7 ,[rsp+1*16]
        VMOVDQA xmm8 ,[rsp+2*16]
        VMOVDQA xmm9 ,[rsp+3*16]
        VMOVDQA xmm10,[rsp+4*16]
        VMOVDQA xmm11,[rsp+5*16]
        VMOVDQA xmm12,[rsp+6*16]
        VMOVDQA xmm13,[rsp+7*16]
        VMOVDQA xmm14,[rsp+8*16]
        VMOVDQA xmm15,[rsp+9*16]
%endif
        mov     rdi,[rsp+STACK_FOR_XMM+0*8]
        mov     rsi,[rsp+STACK_FOR_XMM+1*8]
        mov     r12,[rsp+STACK_FOR_XMM+2*8]
        mov     r14,[rsp+STACK_FOR_XMM+3*8]
        mov     r15,[rsp+STACK_FOR_XMM+4*8]
        add     rsp,STACK_ADJ
        vzeroupper                              ; clear upper vector halves before returning to non-VEX code
        ret
;----------------------------------------------------------------------------------------------
; hex_encode_avx2 -- encode bytes as ASCII hex characters, 8 x 32 input bytes per round (AVX2)
;
; Arguments (non-Windows build): rdi = output buffer, rsi = input buffer, rdx = element count
; Arguments (__WIN__ build)    : rcx, rdx, r8 carry the same values and are copied to rdi/rsi/rdx
; Returns                      : rax = min(requested end, processed end) - input start,
;                                i.e. the number of valid processed input bytes
;
; The loop is software-pipelined: one round handles group AAA (QL0-QL3) and group BBB
; (QL4-QL7); while one group is shuffled and stored, the other group is reloaded for the
; following round. Every input line is loaded through VPERMQ with VPERM_AVX2_OFFS before
; it is unpacked.
;
; ymm15 ; Source Load QL7
; ymm14 ; Source Load QL6
; ymm13 ; Source Load QL5
; ymm12 ; Source Load QL4
; ymm11 ; Source Load QL3
; ymm10 ; Source Load QL2
; ymm9  ; Source Load QL1
; ymm8  ; Source Load QL0
; ymm7  ; CONST ENCODE_SHUFFLE_TO_HEX
; ymm6  ; CONST BITMASK_LOWER_HALF
; ymm5  ; Shift temp for High nibble 1
; ymm4  ; Shift temp for High nibble 0
; ymm3  ; Temp3
; ymm2  ; Temp2
; ymm1  ; Temp1
; ymm0  ; Temp0
%define NINP_BYTES_PER_ROUND 8*32
%define NINP_BITSHIFT 8
hex_encode_avx2:
        sub     rsp,STACK_ADJ
        mov     [rsp+STACK_FOR_XMM+0*8],rdi
        mov     [rsp+STACK_FOR_XMM+1*8],rsi
        mov     [rsp+STACK_FOR_XMM+2*8],r12
        mov     [rsp+STACK_FOR_XMM+3*8],r14
        mov     [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
        VMOVDQA [rsp     ],xmm6
        VMOVDQA [rsp+1*16],xmm7
        VMOVDQA [rsp+2*16],xmm8
        VMOVDQA [rsp+3*16],xmm9
        VMOVDQA [rsp+4*16],xmm10
        VMOVDQA [rsp+5*16],xmm11
        VMOVDQA [rsp+6*16],xmm12
        VMOVDQA [rsp+7*16],xmm13
        VMOVDQA [rsp+8*16],xmm14
        VMOVDQA [rsp+9*16],xmm15
        mov     rdi,rcx                         ; parameter 1 output buffer
        mov     rsi,rdx                         ; parameter 2 input buffer
        mov     rdx,r8                          ; parameter 3 number of elements
%endif
;; Loading QL0-QL3 (AAA), prefetching QL4-QL7 (BBB)
        VPERMQ  ymm8, [rsi+0*32],VPERM_AVX2_OFFS ; AAA QL0
        VPERMQ  ymm9, [rsi+1*32],VPERM_AVX2_OFFS ; AAA QL1
        VPERMQ  ymm10,[rsi+2*32],VPERM_AVX2_OFFS ; AAA QL2
        VPERMQ  ymm11,[rsi+3*32],VPERM_AVX2_OFFS ; AAA QL3
        VPERMQ  ymm12,[rsi+4*32],VPERM_AVX2_OFFS ; BBB QL4
        VPERMQ  ymm13,[rsi+5*32],VPERM_AVX2_OFFS ; BBB QL5
        VPERMQ  ymm14,[rsi+6*32],VPERM_AVX2_OFFS ; BBB QL6
        VPERMQ  ymm15,[rsi+7*32],VPERM_AVX2_OFFS ; BBB QL7
;; initialize constants
        VMOVDQA ymm7,[ENCODE_SHUFFLE_TO_HEX]    ; lookup table nibble -> hex character
        VMOVDQA ymm6,[BITMASK_LOWER_HALF]       ; low-nibble mask
;; do page overshoot checks
        mov     rax,NINP_BYTES_PER_ROUND
        mov     r9,rdx                          ; exact requested number of elements
        add     r9,rsi                          ; r9 = end pointer (+1) of requested input
        mov     r10,rsi                         ; r10 = saved start of input buffer
        mov     r12,r9                          ; r12 = saved end of input buffer +1
        lea     rcx,[rsi+rdx-1]                 ; rcx = address of last requested byte
        lea     r8,[rdx+NINP_BYTES_PER_ROUND-1]
        shr     r8,NINP_BITSHIFT                ; number of rounds
        shl     r8,NINP_BITSHIFT                ; count rounded up to full rounds
        add     r8,rsi                          ; r8 = end pointer (+1) of the complete rounds
        add     r8,NINP_BYTES_PER_ROUND-1       ; r8 = last byte read by the loop including overshoot
        mov     r11,r8
; DISABLED for NO OVERSHOOT
; add r11,rax ; r11 address of last byte of prefetched data
        shr     rcx,NSHIFT_ADDRESS_TO_PAGE      ; page number of last requested byte
        shr     r8,NSHIFT_ADDRESS_TO_PAGE       ; page number of last byte read in a normal round
        cmp     rcx,r8                          ; stay on same page
        je      .LSAME_PAGE_IN_ROUND
        sub     rdx,rax                         ; don't overshoot in reading: do one round less
.LSAME_PAGE_IN_ROUND:
        shr     r11,NSHIFT_ADDRESS_TO_PAGE      ; same address as r8 while the add above is disabled
        cmp     rcx,r11
        je      .LSAME_PAGE_IN_PREFETCH
        sub     rdx,rax                         ; don't overshoot in prefetch reading: one round less
.LSAME_PAGE_IN_PREFETCH:
        add     rdx,rsi                         ; rdx = end pointer for the normal loop
; the preload runs one round ahead, so shift both end markers by one round
        add     rdx,rax
        add     r9,rax
        mov     r11,rdi                         ; r11 = saved start of output buffer
        mov     rcx,NINP_BYTES_PER_ROUND<<1     ; output bytes produced per round
;; start preprocessing before loop
        VPSRLQ  ymm4,ymm8,4                     ; AAA QL0 high nibbles moved to the low nibble position
        VPSRLQ  ymm5,ymm9,4                     ; AAA QL1 high nibbles moved to the low nibble position
        VPUNPCKLBW ymm0,ymm4,ymm8               ; AAA RL00 interleave (high-nibble byte, original byte)
        VPUNPCKHBW ymm1,ymm4,ymm8               ; AAA RL01
        add     rsi,rax                         ; rsi -> input of the round to preload next
        align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
.LHEXENCODE_LOOP:
;; process unpacked AAA in YMM0-YMM3 and YMM8-YMM11, unpack BBB to YMM0-YMM1, preload AAA to YMM8-YMM11
;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
        VPUNPCKLBW ymm2,ymm5,ymm9               ; AAA RL02 interleave
        VPSRLQ  ymm4,ymm10,4                    ; AAA QL2 shift high nibbles down
        VPUNPCKHBW ymm3,ymm5,ymm9               ; AAA RL03 interleave
        VPAND   ymm0,ymm0,ymm6                  ; AAA RL00 mask lower nibble
        VPSRLQ  ymm5,ymm11,4                    ; AAA QL3 shift high nibbles down
        VPUNPCKLBW ymm8,ymm4,ymm10              ; AAA RL04 interleave
        VPAND   ymm1,ymm1,ymm6                  ; AAA RL01 mask lower nibble
        VPUNPCKHBW ymm9,ymm4,ymm10              ; AAA RL05 interleave
        VPAND   ymm2,ymm2,ymm6                  ; AAA RL02 mask lower nibble
        VPUNPCKLBW ymm10,ymm5,ymm11             ; AAA RL06 interleave
        VPAND   ymm3,ymm3,ymm6                  ; AAA RL03 mask lower nibble
        VPUNPCKHBW ymm11,ymm5,ymm11             ; AAA RL07 interleave
        VPSHUFB ymm0,ymm7,ymm0                  ; AAA RL00 nibble -> hex character
        VPAND   ymm8,ymm8,ymm6                  ; AAA RL04 mask lower nibble
        VPSHUFB ymm1,ymm7,ymm1                  ; AAA RL01 nibble -> hex character
        VPAND   ymm9,ymm9,ymm6                  ; AAA RL05 mask lower nibble
        VPSHUFB ymm2,ymm7,ymm2                  ; AAA RL02 nibble -> hex character
        VMOVDQU [rdi+0*32],ymm0                 ; AAA RL00 store
        VPAND   ymm10,ymm10,ymm6                ; AAA RL06 mask lower nibble
        VPSHUFB ymm3,ymm7,ymm3                  ; AAA RL03 nibble -> hex character
        VMOVDQU [rdi+1*32],ymm1                 ; AAA RL01 store
        VPAND   ymm11,ymm11,ymm6                ; AAA RL07 mask lower nibble
        VPSHUFB ymm8,ymm7,ymm8                  ; AAA RL04 nibble -> hex character
        VPSRLQ  ymm4,ymm12,4                    ; BBB QL4 shift high nibbles down
        VMOVDQU [rdi+2*32],ymm2                 ; AAA RL02 store
        VPSHUFB ymm9,ymm7,ymm9                  ; AAA RL05 nibble -> hex character
        VPSRLQ  ymm5,ymm13,4                    ; BBB QL5 shift high nibbles down
        VMOVDQU [rdi+3*32],ymm3                 ; AAA RL03 store
        VPSHUFB ymm10,ymm7,ymm10                ; AAA RL06 nibble -> hex character
        VMOVDQU [rdi+4*32],ymm8                 ; AAA RL04 store
        VPERMQ  ymm8, [rsi+0*32],VPERM_AVX2_OFFS ; AAA preload QL0 of the next round
        VMOVDQU [rdi+5*32],ymm9                 ; AAA RL05 store
        VPERMQ  ymm9, [rsi+1*32],VPERM_AVX2_OFFS ; AAA preload QL1 of the next round
        VPSHUFB ymm11,ymm7,ymm11                ; AAA RL07 nibble -> hex character
        VMOVDQU [rdi+6*32],ymm10                ; AAA RL06 store
        VPUNPCKLBW ymm0,ymm4,ymm12              ; BBB RL08 interleave
        VPERMQ  ymm10,[rsi+2*32],VPERM_AVX2_OFFS ; AAA preload QL2 of the next round
        VMOVDQU [rdi+7*32],ymm11                ; AAA RL07 store
        VPERMQ  ymm11,[rsi+3*32],VPERM_AVX2_OFFS ; AAA preload QL3 of the next round
        VPUNPCKHBW ymm1,ymm4,ymm12              ; BBB RL09 interleave
;; process unpacked BBB in YMM0-YMM3 and YMM12-YMM15, unpack AAA to YMM0-YMM1, preload BBB to YMM12-YMM15
;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
        VPUNPCKLBW ymm2,ymm5,ymm13              ; BBB RL10 interleave
        VPSRLQ  ymm4,ymm14,4                    ; BBB QL6 shift high nibbles down
        VPUNPCKHBW ymm3,ymm5,ymm13              ; BBB RL11 interleave
        VPAND   ymm0,ymm0,ymm6                  ; BBB RL08 mask lower nibble
        VPSRLQ  ymm5,ymm15,4                    ; BBB QL7 shift high nibbles down
        VPUNPCKLBW ymm12,ymm4,ymm14             ; BBB RL12 interleave
        VPAND   ymm1,ymm1,ymm6                  ; BBB RL09 mask lower nibble
        VPUNPCKHBW ymm13,ymm4,ymm14             ; BBB RL13 interleave
        VPAND   ymm2,ymm2,ymm6                  ; BBB RL10 mask lower nibble
        VPUNPCKLBW ymm14,ymm5,ymm15             ; BBB RL14 interleave
        VPAND   ymm3,ymm3,ymm6                  ; BBB RL11 mask lower nibble
        VPUNPCKHBW ymm15,ymm5,ymm15             ; BBB RL15 interleave
        VPSHUFB ymm0,ymm7,ymm0                  ; BBB RL08 nibble -> hex character
        VPAND   ymm12,ymm12,ymm6                ; BBB RL12 mask lower nibble
        VPSHUFB ymm1,ymm7,ymm1                  ; BBB RL09 nibble -> hex character
        VPAND   ymm13,ymm13,ymm6                ; BBB RL13 mask lower nibble
        VPSHUFB ymm2,ymm7,ymm2                  ; BBB RL10 nibble -> hex character
        VMOVDQU [rdi+8*32],ymm0                 ; BBB RL08 store
        VPAND   ymm14,ymm14,ymm6                ; BBB RL14 mask lower nibble
        VPSHUFB ymm3,ymm7,ymm3                  ; BBB RL11 nibble -> hex character
        VMOVDQU [rdi+9*32],ymm1                 ; BBB RL09 store
        VPAND   ymm15,ymm15,ymm6                ; BBB RL15 mask lower nibble
        VPSHUFB ymm12,ymm7,ymm12                ; BBB RL12 nibble -> hex character
        VPSRLQ  ymm4,ymm8,4                     ; AAA QL0 (next round) shift high nibbles down
        VMOVDQU [rdi+10*32],ymm2                ; BBB RL10 store
        VPSHUFB ymm13,ymm7,ymm13                ; BBB RL13 nibble -> hex character
        VPSRLQ  ymm5,ymm9,4                     ; AAA QL1 (next round) shift high nibbles down
        VMOVDQU [rdi+11*32],ymm3                ; BBB RL11 store
        VPSHUFB ymm14,ymm7,ymm14                ; BBB RL14 nibble -> hex character
        VMOVDQU [rdi+12*32],ymm12               ; BBB RL12 store
        VPERMQ  ymm12,[rsi+4*32],VPERM_AVX2_OFFS ; BBB preload QL4 of the next round
        VMOVDQU [rdi+13*32],ymm13               ; BBB RL13 store
        VPERMQ  ymm13,[rsi+5*32],VPERM_AVX2_OFFS ; BBB preload QL5 of the next round
        VPSHUFB ymm15,ymm7,ymm15                ; BBB RL15 nibble -> hex character
        VMOVDQU [rdi+14*32],ymm14               ; BBB RL14 store
        VPUNPCKLBW ymm0,ymm4,ymm8               ; AAA RL00 (next round) interleave
        VPERMQ  ymm14,[rsi+6*32],VPERM_AVX2_OFFS ; BBB preload QL6 of the next round
        VMOVDQU [rdi+15*32],ymm15               ; BBB RL15 store
        VPERMQ  ymm15,[rsi+7*32],VPERM_AVX2_OFFS ; BBB preload QL7 of the next round
        add     rsi,rax                         ; advance input (rax is 0 in the extra last round)
        VPUNPCKHBW ymm1,ymm4,ymm8               ; AAA RL01 (next round) interleave
        add     rdi,rcx                         ; advance output
        cmp     rsi,rdx                         ; next preload pointer against loop end
        jb      .LHEXENCODE_LOOP                ; pointers are unsigned
; end of normal loop reached
; one more round is possible when the count was reduced by one round above
        test    rax,rax
        jz      .LFINISH_EXTRA                  ; rax == 0: the extra round has already been done
        cmp     rdx,r9                          ; equal: the loop length was not reduced
        je      .LFINISH_NORMAL
        sub     rsi,rax                         ; preload the last round again instead of reading further
        sub     rdx,rax                         ; adapt the end condition for the last round
        xor     rax,rax                         ; rax = 0 marks the extra round
        jmp     .LHEXENCODE_LOOP
.LFINISH_EXTRA:
        add     rsi,NINP_BYTES_PER_ROUND        ; add the extra round to get processed bytes
        jmp     .LFINISH
.LFINISH_NORMAL:
        sub     rsi,NINP_BYTES_PER_ROUND        ; sub the added prefetch round to get processed bytes
.LFINISH:
; r12 = address of requested input bytes+1
; rsi = address of processed input bytes+1
; rax = min(r12, rsi)
;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
;; sub r9,rax
        mov     rax,r12
        cmp     rsi,r12
        jae     .LCALC_PROCESSED_BYTES          ; unsigned pointer compare
        mov     rax,rsi                         ; rax = address of last valid input byte+1
.LCALC_PROCESSED_BYTES:
        sub     rax,r10                         ; minus start of input buffer
                                                ; rax = number of valid processed input bytes = return value
        cmp     rsi,rdx
        je      .LNO_ZERO_OUT
; preparation for zeroing one register width behind the valid output; the store itself
; is currently disabled, so r15/ymm0 are computed without effect
        mov     r15,rax                         ; number of processed elements
        shl     r15,1                           ; number of output bytes
        add     r15,r11                         ; pointer to first byte after the valid output
        VPXOR   ymm0,ymm0,ymm0                  ; all zero
;ZERO VMOVDQU [r15],ymm0 ; zero out one register width after last output
.LNO_ZERO_OUT:
%ifdef __WIN__
        VMOVDQA xmm6 ,[rsp     ]
        VMOVDQA xmm7 ,[rsp+1*16]
        VMOVDQA xmm8 ,[rsp+2*16]
        VMOVDQA xmm9 ,[rsp+3*16]
        VMOVDQA xmm10,[rsp+4*16]
        VMOVDQA xmm11,[rsp+5*16]
        VMOVDQA xmm12,[rsp+6*16]
        VMOVDQA xmm13,[rsp+7*16]
        VMOVDQA xmm14,[rsp+8*16]
        VMOVDQA xmm15,[rsp+9*16]
%endif
        mov     rdi,[rsp+STACK_FOR_XMM+0*8]
        mov     rsi,[rsp+STACK_FOR_XMM+1*8]
        mov     r12,[rsp+STACK_FOR_XMM+2*8]
        mov     r14,[rsp+STACK_FOR_XMM+3*8]
        mov     r15,[rsp+STACK_FOR_XMM+4*8]
        add     rsp,STACK_ADJ
        vzeroupper                              ; clear upper ymm halves before returning to non-VEX code
        ret
;----------------------------------------------------------------------------------------------
; xmm15 ; Source Load QL7
; xmm14 ; Source Load QL6
; xmm13 ; Source Load QL5
; xmm12 ; Source Load QL4
; xmm11 ; Source Load QL3
; xmm10 ; Source Load QL2
; xmm9 ; Source Load QL1
; xmm8 ; Source Load QL0
; xmm7 ; CONST ENCODE_SHUFFLE_TO_HEX
; xmm6 ; CONST BITMASK_NIBBLE_3_IN_WORD
; xmm5 ; Shift temp for High nibble 1
; xmm4 ; Shift temp for High nibble 0
; xmm3 ; Temp3
; xmm2 ; Temp2
; xmm1 ; Temp1
; xmm0 ; Temp0
%define NINP_BYTES_PER_ROUND 8*16
%define NINP_BITSHIFT 7
hex_encode_ssse3:
sub rsp,STACK_ADJ
mov [rsp+STACK_FOR_XMM+0*8],rdi
mov [rsp+STACK_FOR_XMM+1*8],rsi
mov [rsp+STACK_FOR_XMM+2*8],r12
mov [rsp+STACK_FOR_XMM+3*8],r14
mov [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
MOVDQA [rsp ],xmm6
MOVDQA [rsp+1*16],xmm7
MOVDQA [rsp+2*16],xmm8
MOVDQA [rsp+3*16],xmm9
MOVDQA [rsp+4*16],xmm10
MOVDQA [rsp+5*16],xmm11
MOVDQA [rsp+6*16],xmm12
MOVDQA [rsp+7*16],xmm13
MOVDQA [rsp+8*16],xmm14
MOVDQA [rsp+9*16],xmm15
mov rdi,rcx ; parameter 1 output buffer
mov rsi,rdx ; parameter 2 input buffer
mov rdx,r8 ; parameter 3 number of elements
%endif
;; Loading QL0-QL3, prefetching QL4-QL7
MOVDQU xmm8, [rsi+0*16] ; AAA p_____5 p1____5 l3+ QL0
MOVDQU xmm9, [rsi+1*16] ; AAA p_____5 p1____5 l3+ QL1
MOVDQU xmm10,[rsi+2*16] ; AAA p_____5 p1____5 l3+ QL2
MOVDQU xmm11,[rsi+3*16] ; AAA p_____5 p1____5 l3+ QL3
MOVDQU xmm12,[rsi+4*16] ; BBB p_____5 p1____5 l3+ QL4
MOVDQU xmm13,[rsi+5*16] ; BBB p_____5 p1____5 l3+ QL5
MOVDQU xmm14,[rsi+6*16] ; BBB p_____5 p1____5 l3+ QL6
MOVDQU xmm15,[rsi+7*16] ; BBB p_____5 p1____5 l3+ QL7
;; initialize constants
MOVDQA xmm7,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3
MOVDQA xmm6,[BITMASK_LOWER_HALF] ; p_23__ l3
;; do page overshoot checks
mov rax,NINP_BYTES_PER_ROUND
mov r9,rdx ; exact requested number of elements to process
add r9,rsi ; r9 last valid pointer +1 of requested input buffer
mov r10,rsi ; r10 saved start of input buffer
mov r12,r9 ; r12 save of end of input buffer+1
lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
lea r8,[rdx+NINP_BYTES_PER_ROUND-1]
shr r8,NINP_BITSHIFT ; number of loops
shl r8,NINP_BITSHIFT
add r8,rsi ; r8 address of last byte+1 read in complete loops
add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot
mov r11,r8
; DISABLED for NO OVERSHOOT
; add r11,rax ; r11 address of last byte of prefetched data
shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte read after normal round
cmp rcx,r8 ; stay on same page
je .LSAME_PAGE_IN_ROUND
sub rdx,rax ; don't overshoot in reading: do one round less
.LSAME_PAGE_IN_ROUND:
shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
cmp rcx,r11
je .LSAME_PAGE_IN_PREFETCH
sub rdx,rax ; don't overshoot in prefetch reading: do one round less
.LSAME_PAGE_IN_PREFETCH:
add rdx,rsi ; rdx last valid pointer+1 for normal loop
; due to prefetch add one round to end checks
add rdx,rax
add r9,rax
mov r11,rdi ; r11 saved start of output buffer
mov rcx,NINP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round
;; start preprocessing before loop
MOVDQA xmm4,xmm8
PSRLQ xmm4,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
MOVDQA xmm5,xmm9
PSRLQ xmm5,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
MOVDQA xmm0,xmm4
PUNPCKLBW xmm0,xmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
MOVDQA xmm1,xmm4
PUNPCKHBW xmm1,xmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
add rsi,rax ; add the number of processed array elements
align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
.LHEXENCODE_LOOP:
;; process unpacked AAA in XMM0-XMM4 and XMM8-XMM11, UNPCK BBB to XMM0-XMM1, PreLoad AAA to XMM8-XMM11
;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
MOVDQA xmm2,xmm5
PUNPCKLBW xmm2,xmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
MOVDQA xmm4,xmm10
PSRLQ xmm4,4 ; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte
MOVDQA xmm3,xmm5
PUNPCKHBW xmm3,xmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
PAND xmm0,xmm6 ; AAA RL00 mask lower nibble
MOVDQA xmm5,xmm11
PSRLQ xmm5,4 ; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte
MOVDQA xmm8,xmm4
PUNPCKLBW xmm8,xmm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
PAND xmm1,xmm6 ; AAA RL01 mask lower nibble
MOVDQA xmm9,xmm4
PUNPCKHBW xmm9,xmm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
PAND xmm2,xmm6 ; AAA RL02 mask lower nibble
MOVDQA xmm10,xmm5
PUNPCKLBW xmm10,xmm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
PAND xmm3,xmm6 ; AAA RL03 mask lower nibble
MOVDQA xmm4,xmm5
PUNPCKHBW xmm4,xmm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
MOVDQA xmm11,xmm4
MOVDQA xmm4,xmm7
PSHUFB xmm4,xmm0 ; AAA RL00 shuffle_to_hex_digits
PAND xmm8,xmm6 ; AAA RL04 mask lower nibble
MOVDQA xmm5,xmm7
PSHUFB xmm5,xmm1 ; AAA RL01 shuffle_to_hex_digits
PAND xmm9,xmm6 ; AAA RL05 mask lower nibble
MOVDQA xmm0,xmm7
PSHUFB xmm0,xmm2 ; AAA RL02 shuffle_to_hex_digits
MOVDQU [rdi+0*16],xmm4 ; AAA RL00 Store Hexdump
PAND xmm10,xmm6 ; AAA RL06 mask lower nibble
MOVDQA xmm1,xmm7
PSHUFB xmm1,xmm3 ; AAA RL03 shuffle_to_hex_digits
MOVDQU [rdi+1*16],xmm5 ; AAA RL01 Store Hexdump
PAND xmm11,xmm6 ; AAA RL07 mask lower nibble
MOVDQA xmm2,xmm7
PSHUFB xmm2,xmm8 ; AAA RL04 shuffle_to_hex_digits
MOVDQA xmm4,xmm12
PSRLQ xmm4,4 ; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte
MOVDQU [rdi+2*16],xmm0 ; AAA RL02 Store Hexdump
MOVDQA xmm3,xmm7
PSHUFB xmm3,xmm9 ; AAA RL05 shuffle_to_hex_digits
MOVDQA xmm5,xmm13
PSRLQ xmm5,4 ; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte
MOVDQU [rdi+3*16],xmm1 ; AAA RL03 Store Hexdump
MOVDQA xmm0,xmm7
PSHUFB xmm0,xmm10 ; AAA RL06 shuffle_to_hex_digits
MOVDQU [rdi+4*16],xmm2 ; AAA RL04 Store Hexdump
MOVDQU xmm8, [rsi+0*16] ; AAA p_____5 p1____5 l3+ QL0
MOVDQU [rdi+5*16],xmm3 ; AAA RL05 Store Hexdump
MOVDQU xmm9, [rsi+1*16] ; AAA p_____5 p1____5 l3+ QL1
MOVDQA xmm1,xmm7
PSHUFB xmm1,xmm11 ; AAA RL07 shuffle_to_hex_digits
MOVDQU [rdi+6*16],xmm0 ; AAA RL06 Store Hexdump
MOVDQA xmm0,xmm4
PUNPCKLBW xmm0,xmm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
MOVDQU xmm10,[rsi+2*16] ; AAA p_____5 p1____5 l3+ QL2
MOVDQU [rdi+7*16],xmm1 ; AAA RL07 Store Hexdump
MOVDQU xmm11,[rsi+3*16] ; AAA p_____5 p1____5 l3+ QL3
MOVDQA xmm1,xmm4
PUNPCKHBW xmm1,xmm12 ; AAA RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
;; process unpacked BBB in XMM0-XMM4 and XMM9-XMM15, UNPCK AAA to XMM0-XMM1, PreLoad BBB to XMM12-XMM15
;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
MOVDQA xmm2,xmm5
PUNPCKLBW xmm2,xmm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
MOVDQA xmm4,xmm14
PSRLQ xmm4,4 ; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte
MOVDQA xmm3,xmm5
PUNPCKHBW xmm3,xmm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
PAND xmm0,xmm6 ; BBB RL08 mask lower nibble
MOVDQA xmm5,xmm15
PSRLQ xmm5,4 ; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte
MOVDQA xmm12,xmm4
PUNPCKLBW xmm12,xmm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
PAND xmm1,xmm6 ; BBB RL09 mask lower nibble
MOVDQA xmm13,xmm4
PUNPCKHBW xmm13,xmm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
PAND xmm2,xmm6 ; BBB RL10 mask lower nibble
MOVDQA xmm14,xmm5
PUNPCKLBW xmm14,xmm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
PAND xmm3,xmm6 ; BBB RL11 mask lower nibble
MOVDQA xmm4,xmm5
PUNPCKHBW xmm4,xmm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
MOVDQA xmm15,xmm4
MOVDQA xmm4,xmm7
PSHUFB xmm4,xmm0 ; BBB RL08 shuffle_to_hex_digits
PAND xmm12,xmm6 ; BBB RL12 mask lower nibble
MOVDQA xmm5,xmm7
PSHUFB xmm5,xmm1 ; BBB RL09 shuffle_to_hex_digits
PAND xmm13,xmm6 ; BBB RL13 mask lower nibble
MOVDQA xmm0,xmm7
PSHUFB xmm0,xmm2 ; BBB RL10 shuffle_to_hex_digits
MOVDQU [rdi+8*16],xmm4 ; BBB RL08 Store Hexdump
PAND xmm14,xmm6 ; BBB RL14 mask lower nibble
MOVDQA xmm1,xmm7
PSHUFB xmm1,xmm3 ; BBB RL11 shuffle_to_hex_digits
MOVDQU [rdi+9*16],xmm5 ; BBB RL09 Store Hexdump
PAND xmm15,xmm6 ; BBB RL15 mask lower nibble
MOVDQA xmm2,xmm7
PSHUFB xmm2,xmm12 ; BBB RL12 shuffle_to_hex_digits
MOVDQA xmm4,xmm8
PSRLQ xmm4,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
MOVDQU [rdi+10*16],xmm0 ; BBB RL10 Store Hexdump
MOVDQA xmm3,xmm7
PSHUFB xmm3,xmm13 ; BBB RL13 shuffle_to_hex_digits
MOVDQA xmm5,xmm9
PSRLQ xmm5,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
MOVDQU [rdi+11*16],xmm1 ; BBB RL11 Store Hexdump
MOVDQA xmm0,xmm7
PSHUFB xmm0,xmm14 ; BBB RL14 shuffle_to_hex_digits
MOVDQU [rdi+12*16],xmm2 ; BBB RL12 Store Hexdump
MOVDQU xmm12, [rsi+4*16] ; BBB p_____5 p1____5 l3+ QL0
MOVDQU [rdi+13*16],xmm3 ; BBB RL13 Store Hexdump
MOVDQU xmm13, [rsi+5*16] ; BBB p_____5 p1____5 l3+ QL1
MOVDQA xmm1,xmm7
PSHUFB xmm1,xmm15 ; BBB RL15 shuffle_to_hex_digits
MOVDQU [rdi+14*16],xmm0 ; BBB RL14 Store Hexdump
MOVDQA xmm0,xmm4
PUNPCKLBW xmm0,xmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
MOVDQU xmm14,[rsi+6*16] ; BBB p_____5 p1____5 l3+ QL2
MOVDQU [rdi+15*16],xmm1 ; BBB RL15 Store Hexdump
MOVDQU xmm15,[rsi+7*16] ; BBB p_____5 p1____5 l3+ QL3
add rsi,rax ; add the number of processed array elements
MOVDQA xmm1,xmm4
PUNPCKHBW xmm1,xmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
add rdi,rcx ; add the number of processed output bytes
cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
jl .LHEXENCODE_LOOP
; end of normal loop reached
; we can do one more round when original count has been reduced by one round
cmp rax,0
je .LFINISH_EXTRA
cmp rdx,r9 ; input buffer length was not reduced when equal
je .LFINISH_NORMAL
sub rsi,rax ; for prefetching the last round, load the last round again
sub rdx,rax ; adopt and condition for last round also
xor rax,rax
jmp .LHEXENCODE_LOOP
.LFINISH_EXTRA:
add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes
jmp .LFINISH
.LFINISH_NORMAL:
sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes
.LFINISH:
; r9 = address of requested input bytes+1
; rsi = address of processed input bytes+1
; now get the minimum of rdx,rsi to rax
;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
;; sub r9,rax
mov rax,r12
cmp rsi,r12 ; get min from r12 (address of requested input) and rsi (address of done input)
jge .LCALC_PROCESSED_BYTES
mov rax,rsi ; rax=address of last valid input byte+1
.LCALC_PROCESSED_BYTES:
sub rax,r10 ; sub the input buffer start address
; rax = number of valid processed input bytes = return value
cmp rsi,rdx ; compare rdx (address of requested input) and rsi (address of done input)
je .LNO_ZERO_OUT
mov r15,rax ; number of elements to process
shl r15,1 ; number of output bytes
add r15,r11 ; pointer to next byte after full valid output buffer
PXOR xmm0,xmm0 ; all zero
;ZERO MOVDQU [r15],xmm0 ; zero out one register width after last output
.LNO_ZERO_OUT:
%ifdef __WIN__
MOVDQA xmm6 ,[rsp ]
MOVDQA xmm7 ,[rsp+1*16]
MOVDQA xmm8 ,[rsp+2*16]
MOVDQA xmm9 ,[rsp+3*16]
MOVDQA xmm10,[rsp+4*16]
MOVDQA xmm11,[rsp+5*16]
MOVDQA xmm12,[rsp+6*16]
MOVDQA xmm13,[rsp+7*16]
MOVDQA xmm14,[rsp+8*16]
MOVDQA xmm15,[rsp+9*16]
%endif
mov rdi,[rsp+STACK_FOR_XMM+0*8]
mov rsi,[rsp+STACK_FOR_XMM+1*8]
mov r12,[rsp+STACK_FOR_XMM+2*8]
mov r14,[rsp+STACK_FOR_XMM+3*8]
mov r15,[rsp+STACK_FOR_XMM+4*8]
add rsp,STACK_ADJ
ret
;----------------------------------------------------------------------------------------------
; hex_encode_avx512bw
;
; Purpose:
;   Converts input bytes to two hex-digit bytes each, NINP_BYTES_PER_ROUND (1024)
;   input bytes per loop round, using AVX-512 (ZMM) registers.
;
; Parameters (SysV AMD64; with __WIN__ defined the Win64 registers in brackets
; are copied into the SysV registers in the prologue):
;   rdi [rcx] = output buffer
;   rsi [rdx] = input buffer
;   rdx [r8 ] = number of input bytes requested
; Returns:
;   rax = number of input bytes counted as processed (never more than requested)
;
; Memory access caveats (follow directly from the code below):
;   - The prologue always loads one full round (1024 bytes) from the input, and
;     every loop round loads the following round ahead of time, so the routine
;     reads past short inputs.
;   - Every loop round stores 2048 output bytes; the loop body runs at least once.
;   - Output stores are unaligned stores (VMOVDQU64): the output buffer needs no
;     particular alignment.
;   NOTE(review): the page-overshoot heuristic (r8/r11 below) is kept exactly as
;   found; it only shortens the loop, it cannot prevent the reads/writes of the
;   first round. Confirm the intended buffer padding contract with the callers.
;
; General purpose registers inside the routine:
;   rax  ; input increment per half round (NHALF_INP_BYTES_PER_ROUND); 0 during the extra round
;   rcx  ; output increment per half round
;   rdx  ; loop end pointer (possibly reduced by the page checks) + one round
;   rsi  ; input pointer of the data to be loaded next (one round ahead of the data being stored)
;   rdi  ; output pointer
;   r9   ; unreduced loop end pointer + one round (to detect a reduced rdx)
;   r10  ; start of input buffer
;   r11  ; start of output buffer
;   r12  ; end of requested input + 1
;   r15  ; scratch (end of valid output; only used by the disabled zero-out store)
;
; Mask / vector registers:
;   k7          ; loaded from BITMASK_ONE_ZERO (not referenced by the loop below)
;   zmm31       ; CONST ENCODE_SHUFFLE_TO_HEX (VPSHUFB lookup table)
;   zmm30       ; CONST BITMASK_LOWER_HALF broadcast per qword (AND mask applied before the lookup)
;   zmm29       ; CONST VPERM_ENCODE_OFFSETS (qword permutation indices applied to every source load)
;   zmm28       ; loaded from ENCODE_SHUFFLE_TO_HIGH_LOW (not referenced by the loop below)
;   zmm16-zmm23 ; source loads QL8-QLF (groups CCC and DDD); reused as DDD result rows
;   zmm8-zmm15  ; source loads QL0-QL7 (groups AAA and BBB); reused as BBB result rows
;   zmm0-zmm7   ; shifted copies / result rows of the group currently being converted
;
; Clobbers: rax, rcx, rdx, r8-r11, flags, k7, zmm0-zmm5, zmm16-zmm31
;           (plus zmm6-zmm15 when __WIN__ is not defined).
%define NHALF_INP_BYTES_PER_ROUND 8*64
%define NINP_BYTES_PER_ROUND 2*NHALF_INP_BYTES_PER_ROUND
%define NINP_BITSHIFT 10
hex_encode_avx512bw:
    sub         rsp,STACK_ADJ
    mov         [rsp+STACK_FOR_XMM+0*8],rdi
    mov         [rsp+STACK_FOR_XMM+1*8],rsi
    mov         [rsp+STACK_FOR_XMM+2*8],r12
    mov         [rsp+STACK_FOR_XMM+3*8],r14
    mov         [rsp+STACK_FOR_XMM+4*8],r15
%ifdef __WIN__
    VMOVDQA     [rsp     ],xmm6             ; xmm6-xmm15 are callee-saved on Win64
    VMOVDQA     [rsp+1*16],xmm7
    VMOVDQA     [rsp+2*16],xmm8
    VMOVDQA     [rsp+3*16],xmm9
    VMOVDQA     [rsp+4*16],xmm10
    VMOVDQA     [rsp+5*16],xmm11
    VMOVDQA     [rsp+6*16],xmm12
    VMOVDQA     [rsp+7*16],xmm13
    VMOVDQA     [rsp+8*16],xmm14
    VMOVDQA     [rsp+9*16],xmm15
    mov         rdi,rcx                     ; parameter 1 output buffer
    mov         rsi,rdx                     ; parameter 2 input buffer
    mov         rdx,r8                      ; parameter 3 number of elements
%endif
    VMOVDQA64   zmm29,[VPERM_ENCODE_OFFSETS]
;; preload the complete first round (16 x 64 bytes).
;; rsi is deliberately NOT advanced here: the bookkeeping below needs the true
;; start address of the input buffer, and rax is not initialized yet.
    VPERMQ      zmm8 ,zmm29,[rsi+0*64]      ; AAA QL0 RL00,RL01
    VPERMQ      zmm9 ,zmm29,[rsi+1*64]      ; AAA QL1 RL02,RL03
    VPERMQ      zmm10,zmm29,[rsi+2*64]      ; AAA QL2 RL04,RL05
    VPERMQ      zmm11,zmm29,[rsi+3*64]      ; AAA QL3 RL06,RL07
    VPERMQ      zmm12,zmm29,[rsi+4*64]      ; BBB QL4 RL08,RL09
    VPERMQ      zmm13,zmm29,[rsi+5*64]      ; BBB QL5 RL10,RL11
    VPERMQ      zmm14,zmm29,[rsi+6*64]      ; BBB QL6 RL12,RL13
    VPERMQ      zmm15,zmm29,[rsi+7*64]      ; BBB QL7 RL14,RL15
    VPERMQ      zmm16,zmm29,[rsi+8*64]      ; CCC QL8 RL16,RL17
    VPERMQ      zmm17,zmm29,[rsi+9*64]      ; CCC QL9 RL18,RL19
    VPERMQ      zmm18,zmm29,[rsi+10*64]     ; CCC QLA RL20,RL21
    VPERMQ      zmm19,zmm29,[rsi+11*64]     ; CCC QLB RL22,RL23
    VPERMQ      zmm20,zmm29,[rsi+12*64]     ; DDD QLC RL24,RL25
    VPERMQ      zmm21,zmm29,[rsi+13*64]     ; DDD QLD RL26,RL27
    VPERMQ      zmm22,zmm29,[rsi+14*64]     ; DDD QLE RL28,RL29
    VPERMQ      zmm23,zmm29,[rsi+15*64]     ; DDD QLF RL30,RL31
;; initialize constants
    KMOVQ       k7,[BITMASK_ONE_ZERO]
    VMOVDQA64   zmm31,[ENCODE_SHUFFLE_TO_HEX]
    VMOVDQA64   zmm1,zmm31                  ; NOTE(review): zmm1 is overwritten before the loop; looks like a leftover
    VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF]
    VMOVDQA64   zmm28,[ENCODE_SHUFFLE_TO_HIGH_LOW]
;; do page overshoot checks
    mov         rax,NHALF_INP_BYTES_PER_ROUND
    mov         r9,rdx                      ; exact requested number of elements to process
    add         r9,rsi                      ; r9 last valid pointer +1 of requested input buffer
    mov         r10,rsi                     ; r10 saved start of input buffer
    mov         r12,r9                      ; r12 save of end of input buffer+1
    lea         rcx,[rsi+rdx-1]             ; rcx address of last byte requested to read
    lea         r8,[rdx+NINP_BYTES_PER_ROUND-1]
    shr         r8,NINP_BITSHIFT            ; number of loops
    shl         r8,NINP_BITSHIFT            ; requested length rounded up to whole rounds
    add         r8,rsi                      ; r8 address of last byte+1 read in complete loops
    add         r8,NINP_BYTES_PER_ROUND-1   ; r8 address of last byte read in normal loop with overshoot
    mov         r11,r8
; DISABLED for NO OVERSHOOT
; add r11,rax
    sub         r11,rax                     ; r11 = r8 minus half a round
                                            ; NOTE(review): confirm which address this second check is meant to cover
    shr         rcx,NSHIFT_ADDRESS_TO_PAGE  ; rcx page number of last requested byte
    shr         r8,NSHIFT_ADDRESS_TO_PAGE   ; r8 page number of last byte read with overshoot
    cmp         rcx,r8                      ; stay on same page
    je          .LSAME_PAGE_IN_ROUND
    sub         rdx,rax                     ; don't overshoot in reading: do one round less
    sub         rdx,rax                     ; (rax is half a round, so subtract it twice)
.LSAME_PAGE_IN_ROUND:
    shr         r11,NSHIFT_ADDRESS_TO_PAGE  ; r11 page number
    cmp         rcx,r11
    je          .LSAME_PAGE_IN_PREFETCH
    sub         rdx,rax                     ; don't overshoot in prefetch reading: do one round less
    sub         rdx,rax                     ; (rax is half a round, so subtract it twice)
.LSAME_PAGE_IN_PREFETCH:
    add         rdx,rsi                     ; rdx last valid pointer+1 for normal loop
; rsi runs one full round ahead of the data being converted (preload), so the
; end markers are moved by one full round as well
    add         rdx,NINP_BYTES_PER_ROUND
    add         r9,NINP_BYTES_PER_ROUND
    mov         r11,rdi                     ; r11 saved start of output buffer
    mov         rcx,NHALF_INP_BYTES_PER_ROUND<<1 ; increment of output buffer for each half round
;; start preprocessing before loop
    VPSRLQ      zmm2,zmm8,4                 ; AAA RL00+RL01 QL0 shift high nibbles down
    VPSRLQ      zmm3,zmm9,4                 ; AAA RL02+RL03 QL1 shift high nibbles down
    VPUNPCKLBW  zmm0,zmm2,zmm8              ; AAA RL00 interleave shifted/original bytes (low qword per lane)
    VPUNPCKHBW  zmm1,zmm2,zmm8              ; AAA RL01 interleave shifted/original bytes (high qword per lane)
    VPSRLQ      zmm6,zmm10,4                ; AAA RL04+RL05 QL2 shift high nibbles down
    VPSRLQ      zmm7,zmm11,4                ; AAA RL06+RL07 QL3 shift high nibbles down
    VPUNPCKLBW  zmm2,zmm3,zmm9              ; AAA RL02 interleave low
    VPUNPCKHBW  zmm3,zmm3,zmm9              ; AAA RL03 interleave high
    add         rsi,NINP_BYTES_PER_ROUND    ; the whole first round is preloaded: point to the next round
    align 32
; ;IACA START_MARKER
; mov ebx, 111
; db 0x64, 0x67, 0x90
; Vector Port info AVX512
; ----------------------------------------
; VPShift p0 l1
; VPMax/Min p0 l1
; VPMUL p0 l5 ; with 2FMA-Units p05 (SKX,CLX etc.)
; VPMOVB2M p0 l3
; VPSUBUSB /SSB p0 l1
; VPALIGNR p5 l1 ;Shift of n*8 bits!
; VPERM p5 l3
; VPERMI2x 1*p05+2*p5 l7 ; (l9 with flags)
; VPCompare p5 l3-l4
; VP Pack/Unpack p5 l1(SKX) l3(TGL)
; VPSHUF p5 l1
.LHEXENCODE_LOOP:
;; AAA+BBB
; convert AAA (QL0-QL3 = RL00-RL07) in zmm0-zmm7 and BBB (QL4-QL7 = RL08-RL15)
; in zmm8-zmm15 (zmm2+zmm3 as temporaries), then preload AAA+BBB of the next
; round and start unpacking CCC
    VPUNPCKLBW  zmm4,zmm6,zmm10             ; AAA RL04 interleave low
    VPANDQ      zmm0,zmm0,zmm30             ; AAA RL00 mask
    VPUNPCKHBW  zmm5,zmm6,zmm10             ; AAA RL05 interleave high
    VPSRLQ      zmm10,zmm12,4               ; BBB RL08+RL09 QL4 shift high nibbles down
    VPUNPCKLBW  zmm8,zmm10,zmm12            ; BBB RL08 interleave low
    VPANDQ      zmm1,zmm1,zmm30             ; AAA RL01 mask
    VPUNPCKLBW  zmm6,zmm7,zmm11             ; AAA RL06 interleave low
    VPANDQ      zmm2,zmm2,zmm30             ; AAA RL02 mask
    VPUNPCKHBW  zmm7,zmm7,zmm11             ; AAA RL07 interleave high
    VPSRLQ      zmm11,zmm13,4               ; BBB RL10+RL11 QL5 shift high nibbles down
    VPANDQ      zmm8,zmm8,zmm30             ; BBB RL08 mask
    VPUNPCKHBW  zmm9,zmm10,zmm12            ; BBB RL09 interleave high
    VPANDQ      zmm3,zmm3,zmm30             ; AAA RL03 mask
    VPUNPCKLBW  zmm10,zmm11,zmm13           ; BBB RL10 interleave low
    VPANDQ      zmm9,zmm9,zmm30             ; BBB RL09 mask
    VPSHUFB     zmm0,zmm31,zmm0             ; AAA RL00 shuffle_to_hex_digits
    VPANDQ      zmm4,zmm4,zmm30             ; AAA RL04 mask
    VPUNPCKHBW  zmm11,zmm11,zmm13           ; BBB RL11 interleave high
    VPANDQ      zmm10,zmm10,zmm30           ; BBB RL10 mask
    VPSHUFB     zmm1,zmm31,zmm1             ; AAA RL01 shuffle_to_hex_digits
    VPANDQ      zmm5,zmm5,zmm30             ; AAA RL05 mask
    VPSHUFB     zmm8,zmm31,zmm8             ; BBB RL08 shuffle_to_hex_digits
    VPANDQ      zmm11,zmm11,zmm30           ; BBB RL11 mask
    VPSHUFB     zmm2,zmm31,zmm2             ; AAA RL02 shuffle_to_hex_digits
    VMOVDQU64   [rdi+0*64],zmm0             ; AAA RL00 Store Hexdump
    VMOVDQU64   [rdi+1*64],zmm1             ; AAA RL01 Store Hexdump
    VPANDQ      zmm6,zmm6,zmm30             ; AAA RL06 mask
    VPSHUFB     zmm9,zmm31,zmm9             ; BBB RL09 shuffle_to_hex_digits
    VPSHUFB     zmm3,zmm31,zmm3             ; AAA RL03 shuffle_to_hex_digits
    VPANDQ      zmm7,zmm7,zmm30             ; AAA RL07 mask
    VMOVDQU64   [rdi+2*64],zmm2             ; AAA RL02 Store Hexdump
    VPSRLQ      zmm2,zmm14,4                ; BBB RL12+RL13 QL6 shift high nibbles down
    VPSHUFB     zmm10,zmm31,zmm10           ; BBB RL10 shuffle_to_hex_digits
    VMOVDQU64   [rdi+3*64],zmm3             ; AAA RL03 Store Hexdump
    VPSRLQ      zmm3,zmm15,4                ; BBB RL14+RL15 QL7 shift high nibbles down
    VPUNPCKLBW  zmm12,zmm2,zmm14            ; BBB RL12 interleave low
    VPSHUFB     zmm4,zmm31,zmm4             ; AAA RL04 shuffle_to_hex_digits
    VMOVDQU64   [rdi+4*64],zmm4             ; AAA RL04 Store Hexdump
    VPSHUFB     zmm11,zmm31,zmm11           ; BBB RL11 shuffle_to_hex_digits
    VPUNPCKHBW  zmm13,zmm2,zmm14            ; BBB RL13 interleave high
    VPSHUFB     zmm5,zmm31,zmm5             ; AAA RL05 shuffle_to_hex_digits
    VPANDQ      zmm12,zmm12,zmm30           ; BBB RL12 mask
    VPUNPCKLBW  zmm14,zmm3,zmm15            ; BBB RL14 interleave low
    VMOVDQU64   [rdi+5*64],zmm5             ; AAA RL05 Store Hexdump
    VPSHUFB     zmm6,zmm31,zmm6             ; AAA RL06 shuffle_to_hex_digits
    VPANDQ      zmm13,zmm13,zmm30           ; BBB RL13 mask
    VPUNPCKHBW  zmm15,zmm3,zmm15            ; BBB RL15 interleave high
    VPSHUFB     zmm7,zmm31,zmm7             ; AAA RL07 shuffle_to_hex_digits
    VPANDQ      zmm14,zmm14,zmm30           ; BBB RL14 mask
    VMOVDQU64   [rdi+6*64],zmm6             ; AAA RL06 Store Hexdump
    VMOVDQU64   [rdi+7*64],zmm7             ; AAA RL07 Store Hexdump
    VPSHUFB     zmm12,zmm31,zmm12           ; BBB RL12 shuffle_to_hex_digits
    VPANDQ      zmm15,zmm15,zmm30           ; BBB RL15 mask
;
    VMOVDQU64   [rdi+8*64],zmm8             ; BBB RL08 Store Hexdump
    VPERMQ      zmm8 ,zmm29,[rsi+0*64]      ; AAA preload QL0 RL00,RL01 (next round)
    VMOVDQU64   [rdi+9*64],zmm9             ; BBB RL09 Store Hexdump
    VPERMQ      zmm9 ,zmm29,[rsi+1*64]      ; AAA preload QL1 RL02,RL03
    VPSHUFB     zmm13,zmm31,zmm13           ; BBB RL13 shuffle_to_hex_digits
    VMOVDQU64   [rdi+10*64],zmm10           ; BBB RL10 Store Hexdump
    VPERMQ      zmm10,zmm29,[rsi+2*64]      ; AAA preload QL2 RL04,RL05
    VPSRLQ      zmm2,zmm16,4                ; CCC RL16+RL17 QL8 shift high nibbles down
    VMOVDQU64   [rdi+11*64],zmm11           ; BBB RL11 Store Hexdump
    VPERMQ      zmm11,zmm29,[rsi+3*64]      ; AAA preload QL3 RL06,RL07
    VPSHUFB     zmm14,zmm31,zmm14           ; BBB RL14 shuffle_to_hex_digits
    VPSRLQ      zmm3,zmm17,4                ; CCC RL18+RL19 QL9 shift high nibbles down
    VPSHUFB     zmm15,zmm31,zmm15           ; BBB RL15 shuffle_to_hex_digits
    VPUNPCKLBW  zmm0,zmm2,zmm16             ; CCC RL16 interleave low
    VPSRLQ      zmm6,zmm18,4                ; CCC RL20+RL21 QLA shift high nibbles down
    VMOVDQU64   [rdi+12*64],zmm12           ; BBB RL12 Store Hexdump
    VPERMQ      zmm12,zmm29,[rsi+4*64]      ; BBB preload QL4 RL08,RL09
    VPUNPCKHBW  zmm1,zmm2,zmm16             ; CCC RL17 interleave high
    VPSRLQ      zmm7,zmm19,4                ; CCC RL22+RL23 QLB shift high nibbles down
    VMOVDQU64   [rdi+13*64],zmm13           ; BBB RL13 Store Hexdump
    VPERMQ      zmm13,zmm29,[rsi+5*64]      ; BBB preload QL5 RL10,RL11
    VMOVDQU64   [rdi+14*64],zmm14           ; BBB RL14 Store Hexdump
    VPERMQ      zmm14,zmm29,[rsi+6*64]      ; BBB preload QL6 RL12,RL13
    VPUNPCKLBW  zmm2,zmm3,zmm17             ; CCC RL18 interleave low
    VMOVDQU64   [rdi+15*64],zmm15           ; BBB RL15 Store Hexdump
    add         rdi,rcx                     ; output of the first half round is complete
    VPERMQ      zmm15,zmm29,[rsi+7*64]      ; BBB preload QL7 RL14,RL15
    VPUNPCKHBW  zmm3,zmm3,zmm17             ; CCC RL19 interleave high
;; CCC+DDD
; convert CCC (QL8-QLB = RL16-RL23) in zmm0-zmm7 and DDD (QLC-QLF = RL24-RL31)
; in zmm16-zmm23 (zmm2+zmm3 as temporaries), then preload CCC+DDD of the next
; round and start unpacking AAA
    add         rsi,rax                     ; advance input by half a round (rax = 0 in the extra round)
    VPUNPCKLBW  zmm4,zmm6,zmm18             ; CCC RL20 interleave low
    VPANDQ      zmm0,zmm0,zmm30             ; CCC RL16 mask
    VPUNPCKHBW  zmm5,zmm6,zmm18             ; CCC RL21 interleave high
    VPSRLQ      zmm18,zmm20,4               ; DDD RL24+RL25 QLC shift high nibbles down
    VPUNPCKLBW  zmm16,zmm18,zmm20           ; DDD RL24 interleave low
    VPANDQ      zmm1,zmm1,zmm30             ; CCC RL17 mask
    VPUNPCKLBW  zmm6,zmm7,zmm19             ; CCC RL22 interleave low
    VPANDQ      zmm2,zmm2,zmm30             ; CCC RL18 mask
    VPUNPCKHBW  zmm7,zmm7,zmm19             ; CCC RL23 interleave high
    VPSRLQ      zmm19,zmm21,4               ; DDD RL26+RL27 QLD shift high nibbles down
    VPANDQ      zmm16,zmm16,zmm30           ; DDD RL24 mask
    VPUNPCKHBW  zmm17,zmm18,zmm20           ; DDD RL25 interleave high
    VPANDQ      zmm3,zmm3,zmm30             ; CCC RL19 mask
    VPUNPCKLBW  zmm18,zmm19,zmm21           ; DDD RL26 interleave low
    VPANDQ      zmm17,zmm17,zmm30           ; DDD RL25 mask
    VPSHUFB     zmm0,zmm31,zmm0             ; CCC RL16 shuffle_to_hex_digits
    VPANDQ      zmm4,zmm4,zmm30             ; CCC RL20 mask
    VPUNPCKHBW  zmm19,zmm19,zmm21           ; DDD RL27 interleave high
    VPANDQ      zmm18,zmm18,zmm30           ; DDD RL26 mask
    VPSHUFB     zmm1,zmm31,zmm1             ; CCC RL17 shuffle_to_hex_digits
    VPANDQ      zmm5,zmm5,zmm30             ; CCC RL21 mask
    VPSHUFB     zmm16,zmm31,zmm16           ; DDD RL24 shuffle_to_hex_digits
    VPANDQ      zmm19,zmm19,zmm30           ; DDD RL27 mask
    VPSHUFB     zmm2,zmm31,zmm2             ; CCC RL18 shuffle_to_hex_digits
    VMOVDQU64   [rdi+0*64],zmm0             ; CCC RL16 Store Hexdump
    VMOVDQU64   [rdi+1*64],zmm1             ; CCC RL17 Store Hexdump
    VPANDQ      zmm6,zmm6,zmm30             ; CCC RL22 mask
    VPSHUFB     zmm17,zmm31,zmm17           ; DDD RL25 shuffle_to_hex_digits
    VPSHUFB     zmm3,zmm31,zmm3             ; CCC RL19 shuffle_to_hex_digits
    VPANDQ      zmm7,zmm7,zmm30             ; CCC RL23 mask
    VMOVDQU64   [rdi+2*64],zmm2             ; CCC RL18 Store Hexdump
    VPSRLQ      zmm2,zmm22,4                ; DDD RL28+RL29 QLE shift high nibbles down
    VPSHUFB     zmm18,zmm31,zmm18           ; DDD RL26 shuffle_to_hex_digits
    VMOVDQU64   [rdi+3*64],zmm3             ; CCC RL19 Store Hexdump
    VPSRLQ      zmm3,zmm23,4                ; DDD RL30+RL31 QLF shift high nibbles down
    VPUNPCKLBW  zmm20,zmm2,zmm22            ; DDD RL28 interleave low
    VPSHUFB     zmm4,zmm31,zmm4             ; CCC RL20 shuffle_to_hex_digits
    VMOVDQU64   [rdi+4*64],zmm4             ; CCC RL20 Store Hexdump
    VPSHUFB     zmm19,zmm31,zmm19           ; DDD RL27 shuffle_to_hex_digits
    VPUNPCKHBW  zmm21,zmm2,zmm22            ; DDD RL29 interleave high
    VPSHUFB     zmm5,zmm31,zmm5             ; CCC RL21 shuffle_to_hex_digits
    VPANDQ      zmm20,zmm20,zmm30           ; DDD RL28 mask
    VPUNPCKLBW  zmm22,zmm3,zmm23            ; DDD RL30 interleave low
    VMOVDQU64   [rdi+5*64],zmm5             ; CCC RL21 Store Hexdump
    VPSHUFB     zmm6,zmm31,zmm6             ; CCC RL22 shuffle_to_hex_digits
    VPANDQ      zmm21,zmm21,zmm30           ; DDD RL29 mask
    VPUNPCKHBW  zmm23,zmm3,zmm23            ; DDD RL31 interleave high
    VPSHUFB     zmm7,zmm31,zmm7             ; CCC RL23 shuffle_to_hex_digits
    VPANDQ      zmm22,zmm22,zmm30           ; DDD RL30 mask
    VMOVDQU64   [rdi+6*64],zmm6             ; CCC RL22 Store Hexdump
    VMOVDQU64   [rdi+7*64],zmm7             ; CCC RL23 Store Hexdump
    VPSHUFB     zmm20,zmm31,zmm20           ; DDD RL28 shuffle_to_hex_digits
    VPANDQ      zmm23,zmm23,zmm30           ; DDD RL31 mask
;
    VMOVDQU64   [rdi+8*64],zmm16            ; DDD RL24 Store Hexdump
    VPERMQ      zmm16,zmm29,[rsi+0*64]      ; CCC preload QL8 RL16,RL17 (next round)
    VMOVDQU64   [rdi+9*64],zmm17            ; DDD RL25 Store Hexdump
    VPERMQ      zmm17,zmm29,[rsi+1*64]      ; CCC preload QL9 RL18,RL19
    VPSHUFB     zmm21,zmm31,zmm21           ; DDD RL29 shuffle_to_hex_digits
    VMOVDQU64   [rdi+10*64],zmm18           ; DDD RL26 Store Hexdump
    VPERMQ      zmm18,zmm29,[rsi+2*64]      ; CCC preload QLA RL20,RL21
    VPSRLQ      zmm2,zmm8,4                 ; AAA RL00+RL01 QL0 shift high nibbles down
    VMOVDQU64   [rdi+11*64],zmm19           ; DDD RL27 Store Hexdump
    VPERMQ      zmm19,zmm29,[rsi+3*64]      ; CCC preload QLB RL22,RL23
    VPSHUFB     zmm22,zmm31,zmm22           ; DDD RL30 shuffle_to_hex_digits
    VPSRLQ      zmm3,zmm9,4                 ; AAA RL02+RL03 QL1 shift high nibbles down
    VPSHUFB     zmm23,zmm31,zmm23           ; DDD RL31 shuffle_to_hex_digits
    VPUNPCKLBW  zmm0,zmm2,zmm8              ; AAA RL00 interleave low
    VPSRLQ      zmm6,zmm10,4                ; AAA RL04+RL05 QL2 shift high nibbles down
    VMOVDQU64   [rdi+12*64],zmm20           ; DDD RL28 Store Hexdump
    VPERMQ      zmm20,zmm29,[rsi+4*64]      ; DDD preload QLC RL24,RL25
    VPUNPCKHBW  zmm1,zmm2,zmm8              ; AAA RL01 interleave high
    VPSRLQ      zmm7,zmm11,4                ; AAA RL06+RL07 QL3 shift high nibbles down
    VMOVDQU64   [rdi+13*64],zmm21           ; DDD RL29 Store Hexdump
    VPERMQ      zmm21,zmm29,[rsi+5*64]      ; DDD preload QLD RL26,RL27
    VMOVDQU64   [rdi+14*64],zmm22           ; DDD RL30 Store Hexdump
    VPERMQ      zmm22,zmm29,[rsi+6*64]      ; DDD preload QLE RL28,RL29
    VPUNPCKLBW  zmm2,zmm3,zmm9              ; AAA RL02 interleave low
    VMOVDQU64   [rdi+15*64],zmm23           ; DDD RL31 Store Hexdump
    VPERMQ      zmm23,zmm29,[rsi+7*64]      ; DDD preload QLF RL30,RL31
    add         rsi,rax                     ; advance input by half a round (rax = 0 in the extra round)
    VPUNPCKHBW  zmm3,zmm3,zmm9              ; AAA RL03 interleave high
    add         rdi,rcx                     ; output of the second half round is complete
    cmp         rsi,rdx                     ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
    jl          .LHEXENCODE_LOOP
; end of normal loop reached
; we can do one more round when original count has been reduced by one round
    cmp         rax,0                       ; rax == 0: the extra round has just been executed
    je          .LFINISH_EXTRA
    cmp         rdx,r9                      ; input buffer length was not reduced when equal
    je          .LFINISH_NORMAL
    add         rax,rax                     ; rax is only half the bytes of input round, so double it
    sub         rsi,rax                     ; extra round: reload the last round again instead of reading ahead
    sub         rdx,rax                     ; adapt the end condition for the extra round as well
    mov         rax,0                       ; input pointer must not advance during the extra round
    jmp         .LHEXENCODE_LOOP
.LFINISH_EXTRA:
    add         rsi,NINP_BYTES_PER_ROUND    ; add the extra round to get processed bytes
    jmp         .LFINISH
.LFINISH_NORMAL:
    sub         rsi,NINP_BYTES_PER_ROUND    ; sub the added prefetch round to get processed bytes
.LFINISH:
; r12 = address of requested input bytes+1
; rsi = address of processed input bytes+1
; rax = min(r12, rsi) - r10
    mov         rax,r12
    cmp         rsi,r12                     ; get min from r12 (address of requested input) and rsi (address of done input)
    jge         .LCALC_PROCESSED_BYTES
    mov         rax,rsi                     ; rax=address of last valid input byte+1
.LCALC_PROCESSED_BYTES:
    sub         rax,r10                     ; sub the input buffer start address
; rax = number of valid processed input bytes = return value
    cmp         rsi,rdx
    je          .LNO_ZERO_OUT
; the following values only feed the zero-out store, which is currently disabled
    mov         r15,rax                     ; number of processed input bytes
    shl         r15,1                       ; number of output bytes
    add         r15,r11                     ; pointer to next byte after full valid output buffer
    VPXORQ      zmm0,zmm0,zmm0              ; all zero
;ZERO VMOVDQU64 [r15],zmm0 ; zero out one register width after last output
.LNO_ZERO_OUT:
    vzeroupper                              ; clear upper halves of zmm0-zmm15 before returning to SSE/AVX-128 code
%ifdef __WIN__
    VMOVDQA     xmm6 ,[rsp     ]
    VMOVDQA     xmm7 ,[rsp+1*16]
    VMOVDQA     xmm8 ,[rsp+2*16]
    VMOVDQA     xmm9 ,[rsp+3*16]
    VMOVDQA     xmm10,[rsp+4*16]
    VMOVDQA     xmm11,[rsp+5*16]
    VMOVDQA     xmm12,[rsp+6*16]
    VMOVDQA     xmm13,[rsp+7*16]
    VMOVDQA     xmm14,[rsp+8*16]
    VMOVDQA     xmm15,[rsp+9*16]
%endif
    mov         rdi,[rsp+STACK_FOR_XMM+0*8]
    mov         rsi,[rsp+STACK_FOR_XMM+1*8]
    mov         r12,[rsp+STACK_FOR_XMM+2*8]
    mov         r14,[rsp+STACK_FOR_XMM+3*8]
    mov         r15,[rsp+STACK_FOR_XMM+4*8]
    add         rsp,STACK_ADJ
    ret
;----------------------------------------------------------------------------------------------
%endif
readme_hex_bench.txt 0000664 0001750 0001750 00000003051 14163616227 015606 0 ustar buschmann buschmann
README for using hex_bench
1. download/install nasm
https://www.nasm.us/
2. download golink on windows or use other linker
http://www.godevtool.com/
3. unzip folder of hex_bench
4.a adjust the path and select a different hex_encode implementation if you are not on an AVX-512 machine
4.b adjust the path to the nasm exe in hex_bench.asm on windows
5. build the exe (see below)
6. run the benchmark and stop the time manually:
- it will run without I/O for roughly 90 to 300 seconds
- it encodes more than a terabyte of data in total:
; 1 million times = 1356 GB on windows
; 1 million times = 1718 GB on linux
BUILD commands:
%ifdef ASSEMBLE_COMMAND_LINES_ON_WINDOWS
:: commands to build on Windows (nasm and golink in the path)
nasm -f WIN64 -g hex_bench.asm -l hex_bench.lis
nasm -f WIN64 -g hex_x86_64.asm -l hex_x86_64.lis
nasm -f WIN64 -g HEX_BENCH_DATA_1300KB.asm
golink /console hex_bench.obj hex_x86_64.obj HEX_BENCH_DATA_1300KB.obj
%endif
%ifdef ASSEMBLE_COMMAND_LINES_ON_LINUX
# commands to build on LINUX without gdb symbols
nasm -f elf64 -g hex_bench.asm -l hex_bench.lis
nasm -f elf64 -g hex_x86_64.asm -l hex_x86_64.lis
nasm -f elf64 -g HEX_BENCH_DATA_1300KB.asm
ld -o hex_bench hex_bench.o hex_x86_64.o HEX_BENCH_DATA_1300KB.o
# commands to build on LINUX with gdb symbols
nasm -f elf64 hex_bench.asm -g -F stabs -l hex_bench.lis
nasm -f elf64 hex_x86_64.asm -g -F stabs -l hex_x86_64.lis
nasm -f elf64 HEX_BENCH_DATA_1300KB.asm -g -F stabs
gcc -o hex_bench hex_bench.o hex_x86_64.o HEX_BENCH_DATA_1300KB.o -g
%endif