0001_hex_encode.patch
application/octet-stream
Filename: 0001_hex_encode.patch
Type: application/octet-stream
Part: 0
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: unified
| File | + | − |
|---|---|---|
| src/backend/utils/adt/cpu_capabilities_x86_64.asm | 630 | 0 |
| src/backend/utils/adt/encode.c | 88 | 8 |
| src/backend/utils/adt/hex_x86_64.asm | 2915 | 0 |
| src/backend/utils/adt/Makefile | 2 | 0 |
| src/backend/utils/adt/varlena.c | 1 | 1 |
| src/include/utils/builtins.h | 3 | 0 |
| src/Makefile.global.in | 8 | 0 |
diff --git a/postgresql-15devel_orig/src/Makefile.global.in b/postgresql-15devel/src/Makefile.global.in
index 05c54b2..ea5c785 100644
--- a/postgresql-15devel_orig/src/Makefile.global.in
+++ b/postgresql-15devel/src/Makefile.global.in
@@ -270,6 +270,10 @@ LLVM_CPPFLAGS = @LLVM_CPPFLAGS@
LLVM_CFLAGS = @LLVM_CFLAGS@
LLVM_CXXFLAGS = @LLVM_CXXFLAGS@
+# TODO should be adapted to configure
+NASM = nasm
+NASMFLAGS = elf64
+
# Kind-of compilers
BISON = @BISON@
@@ -782,6 +786,10 @@ endif
%.bz2: %
$(BZIP2) -c $< >$@
+%.o: %.asm
+ $(NASM) -f $(NASMFLAGS) -g -o $@ $<
+
+
# Direct builds of foo.c -> foo are disabled to avoid generating
# *.dSYM junk on Macs. All builds should normally go through the
# foo.c -> foo.o -> foo steps. This also ensures that dependency
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/Makefile b/postgresql-15devel/src/backend/utils/adt/Makefile
index 41b486b..fa74e69 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/Makefile
+++ b/postgresql-15devel/src/backend/utils/adt/Makefile
@@ -25,6 +25,7 @@ OBJS = \
bool.o \
cash.o \
char.o \
+ cpu_capabilities_x86_64.o \
cryptohashfuncs.o \
date.o \
datetime.o \
@@ -42,6 +43,7 @@ OBJS = \
geo_ops.o \
geo_selfuncs.o \
geo_spgist.o \
+ hex_x86_64.o \
inet_cidr_ntop.o \
inet_net_pton.o \
int.o \
diff --git a/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm b/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm
new file mode 100644
index 0000000..bcb7db3
--- /dev/null
+++ b/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm
@@ -0,0 +1,630 @@
+%ifdef __NASM_MAJOR__
+%ifdef COMPILE_C_STYLE_COMMENTS
+/*-------------------------------------------------------------------------
+ *
+ * cpu_capabilities_x86_64.asm
+ * Assembler routines for fetching the cpu_capabilities in a convenient int64
+ * and selecting the maximum possible implementation for all valid algorithms
+ *
+ * Copyright (c) 2021-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/cpu_capabilities_x86_64.asm
+ *
+ *-------------------------------------------------------------------------
+ */
+%endif
+
+
+
+
+
+
+; cpu_capabilities_x86_64.asm
+; Assembler routines for fetching the CPU capabilities (get_instr_info) and for selecting
+; the highest usable implementation of every algorithm (apply_cpu_capabilities) on Intel X64
+
+
+; nasm -f WIN64 -g cpu_capabilities_x86_64.asm -l cpu_capabilities_x86_64.lis
+
+; golink /console hexdump.obj cpu_capabilities_x86_64.obj hex_x86_64.obj base64_x86_64.obj /files
+
+; Linux register order: %rdi, %rsi, %rdx, %rcx, %r8 and %r9
+; Windows register order: rcx, rdx, r8, r9
+
+; Windows non volatile registers: rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
+; Linux non volatile registers: rbx,rbp, rsp, r12,r13,r14,r15
+
+; nasm -f elf64 -g cpu_capabilities_x86_64.asm -l cpu_capabilities_x86_64_elf64.lis
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define __WIN__ 1
+%elifidn __OUTPUT_FORMAT__, elf64
+%define __ELF__ 1
+%endif
+
+
+global apply_cpu_capabilities
+
+global get_instr_info
+
+;;global get_highest_impl_id
+
+
+
+default rel
+
+section .rdata align=64
+
+; these are the id defines for the different algorithms implemented or planned
+; every implementation knows its own IMPL_ID, which should never change
+;
+
+%define ALGORITHM_ID_HEX_ENCODE 0
+%define ALGORITHM_ID_HEX_DECODE 1
+%define ALGORITHM_ID_BASE64_ENCODE 2
+%define ALGORITHM_ID_BASE64_DECODE 3
+%define ALGORITHM_ID_CECKSUM 4
+%define ALGORITHM_ID_CECKSUM_COPY 5
+
+
+
+
+%define CPU_IS_ARCH_X86_64 1
+%define CPU_HAS_SSE2 8
+%define CPU_HAS_SSE3 9
+%define CPU_HAS_SSSE3 10
+%define CPU_HAS_SSE4_1 11
+%define CPU_HAS_SSE4_2 12
+%define CPU_HAS_AVX 13
+%define CPU_HAS_F16C 14
+%define CPU_HAS_AVX2 15
+%define CPU_HAS_AVX512_F 16
+%define CPU_HAS_AVX512_VL 17
+%define CPU_HAS_AVX512_DQ 18
+%define CPU_HAS_AVX512_BW 19
+%define CPU_HAS_AVX512_IFMA 20
+%define CPU_HAS_AVX512_VBMI 21
+%define CPU_HAS_AVX512_VBMI2 22
+%define CPU_HAS_AVX512_VNNI 23
+%define CPU_HAS_AVX512_BITALG 24
+%define CPU_HAS_AVX512_VPOPCNTDQ 25
+%define CPU_HAS_AVX512_VP2INTERSECT 26
+%define CPU_HAS_AVX512_FP16 27
+%define CPU_HAS_AMX_TILE 28
+%define CPU_HAS_AMX_BF16 29
+%define CPU_HAS_AMX_INT8 31
+
+
+; Requirement tables: one qword bitmask per implementation id (row index = impl_id).
+; apply_cpu_capabilities accepts a row when every bit set in the row is also set in
+; the detected capabilities. Row 0 (all zero) therefore always matches.
+REQUIREMENTS_ARR:
+HEX_ENC_CPU_REQUIREMENTS_ARR:
+ dq 0
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_SSE2)
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_SSSE3)
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX2)
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX512_BW)
+
+
+
+HEX_DEC_CPU_REQUIREMENTS_ARR:
+ dq 0
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_SSE2)
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX2)
+ dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX512_BW)
+
+
+; Byte offset (relative to REQUIREMENTS_ARR) of each algorithm's requirement table,
+; indexed by ALGORITHM_ID.
+CPU_REQUIREMENTS_OFFS_ARR_X86_64:
+ dq HEX_ENC_CPU_REQUIREMENTS_ARR - REQUIREMENTS_ARR ; ALGORITHM_ID_HEX_ENCODE 0
+ dq HEX_DEC_CPU_REQUIREMENTS_ARR - REQUIREMENTS_ARR ; ALGORITHM_ID_HEX_DECODE 1
+ dq 0 ; ALGORITHM_ID_BASE64_ENCODE 2
+ dq 0 ; ALGORITHM_ID_BASE64_DECODE 3
+ dq 0 ; ALGORITHM_ID_CECKSUM 4
+ dq 0 ; ALGORITHM_ID_CECKSUM_COPY 5
+
+ times 60 dq 0
+ dq -1
+
+
+
+
+; Highest implemented impl_id per ALGORITHM_ID. apply_cpu_capabilities starts its
+; requirement search at this row; 0 = algorithm not implemented (skipped),
+; the first negative entry ends the scan over the algorithms.
+VALID_IMPL_ID_ARR_X86_64:
+; TODO due to error reduced to 3 from 4 !!!
+; (row 4 of HEX_ENC_CPU_REQUIREMENTS_ARR, the AVX512_BW entry, is therefore never selected)
+ dq 3 ; ALGORITHM_ID_HEX_ENCODE 0
+ dq 3 ; ALGORITHM_ID_HEX_DECODE 1
+ dq 0 ; ALGORITHM_ID_BASE64_ENCODE 2
+ dq 0 ; ALGORITHM_ID_BASE64_DECODE 3
+ dq 0 ; ALGORITHM_ID_CECKSUM 4
+ dq 0 ; ALGORITHM_ID_CECKSUM_COPY 5
+
+ times 60 dq -1
+
+
+
+
+
+
+section .text align=32
+
+
+%use smartalign
+
+ ALIGNMODE p6
+
+
+%ifdef USE_LOCAL_IMPL_ID_ARR_LOOKUP
+;----------------------------------------------------------------------------------------------
+
+; get_highest_impl_id accepts the ALGORITHM_ID of the requested algorithm
+; and returns the entry of VALID_IMPL_ID_ARR_X86_64 for it, i.e. the highest impl_id
+; implemented for this architecture (the table is static; the capabilities of the
+; running CPU are NOT consulted here).
+; A return value of zero indicates no valid implementation.
+; The ALGORITHM_ID is used as an index without a range check.
+
+get_highest_impl_id:
+
+ sub rsp,0x28
+
+%ifdef __WIN__
+; mov rcx,rcx ; WIN parameter 1 ALGORITHM_ID
+%else
+ mov rcx,rdi ; LINUX parameter 1 ALGORITHM_ID
+%endif
+
+ ; "default rel" only applies to addresses without base/index register, so the
+ ; table address is loaded RIP-relative first; an indexed [8*rcx+symbol] operand
+ ; would need an absolute 32 bit relocation and break position independent links
+ lea rax,[VALID_IMPL_ID_ARR_X86_64]
+ mov rax,[rax+8*rcx]
+
+ add rsp,0x28
+
+ ret
+
+%endif
+
+
+;----------------------------------------------------------------------------------------------
+
+; apply_cpu_capabilities fetches the cpu capabilities and compares them to the cpu_requirements
+; of all algorithms.
+; It uses a local array of algorithm_impl_id, which contains the maximum impl_id for
+; each specific algorithm. A zero indicates the algorithm has not been implemented yet,
+; a -1 terminates the list.
+; According to the cpu capabilities a public impl_id_array (passed by address parameter)
+; is filled with the maximum supported impl_ids of the algorithms.
+; This public array is initialized to all zero, so the support of a specific algorithm must
+; be enabled by this routine.
+; On other CPU architectures (only 1 architecture is active in any program) similar routines
+; can fill the public impl_id_arr according to their implementations.
+; For reference the cpu_capabilities are stored to a public int8 (8 byte integer) variable
+; passed by address. There is a unique value for every interesting CPU architecture.
+;
+; To mask one or more algorithms (bug hunting, debugging), a mask can be passed as third parameter.
+; Every Algorithm_ID has the corresponding bit in the mask (bit 0..x corresponds to algorithm 0..x).
+; When the bit for a certain algorithm is set, the public impl_id_arr is not set and left at its
+; default of zero.
+
+%define STACK_ADJ 0x28+4*8
+
+; apply_cpu_capabilities: for every algorithm listed in VALID_IMPL_ID_ARR_X86_64 store the
+; highest impl_id whose requirement bitmask is fully contained in the cpu capabilities
+; into the caller's impl_id array. Nothing is loaded into rax before the ret.
+apply_cpu_capabilities:
+ ; parameter 1 address cpu_capabilities (int8)
+ ; parameter 2 address valid_impl_arr (array of int8)
+ ; parameter 3 bitmask algorithm disable (int8 by value)
+
+
+ sub rsp,STACK_ADJ
+
+ ; save the registers which are overwritten below
+ mov [rsp+0*8],rdi
+ mov [rsp+1*8],rsi
+ mov [rsp+2*8],r9
+ mov [rsp+3*8],r15
+
+%ifdef __WIN__
+ ; bring the Windows parameters into the Linux parameter registers
+ mov rdi,rcx ; parameter 1 address cpu_capabilities
+
+ mov rsi,rdx ; parameter 2 address valid_impl_arr
+
+ mov rdx,r8 ; parameter 3 bitmask algorithm disable
+
+%endif
+
+ mov r9,[rdi] ; load current cpu_capabilities
+ cmp r9,0
+ jns .capabilities_set ; non-negative value: capabilities already determined
+
+ call get_instr_info ; returns the capabilities in rax
+ mov [rdi],rax ; publish them through parameter 1
+ mov r9,rax ; r9 = current capabilities
+
+.capabilities_set:
+
+ mov r8,1 ; r8 bitmask for current algorithm
+ lea rdi,[VALID_IMPL_ID_ARR_X86_64] ; rdi address of local impl_id_arr
+ xor r15,r15 ; r15 current index into algorithm arrays (scaled by 8 on use)
+
+.loop_algorithm:
+ mov rcx,[rdi+8*r15] ; rcx = max index for current algorithm
+ cmp rcx,0
+ jl .end_loop_algorithm ; current index = -1 -> goto end
+ je .skip_algorithm ; current index = 0 -> not implemented, leave public entry untouched
+
+ lea rax,[CPU_REQUIREMENTS_OFFS_ARR_X86_64] ; table of start offsets of the requirement arrays
+ mov rax,[rax+8*r15] ; start offset of requirement_arr of current algorithm
+ lea r10,[REQUIREMENTS_ARR] ; r10 pointer to requirement_arr for current algorithm
+ add r10,rax
+
+.check_requirements:
+ mov rax,r9 ; rax temp for current capabilities
+ and rax,[8*rcx+r10] ; keep only the bits required by impl_id rcx
+ cmp rax,[8*rcx+r10] ; all required bits available?
+ je .max_index_found
+ sub rcx,1 ; try the next lower impl_id
+ jnz .check_requirements ; rcx = 0 falls through: no implementation usable
+
+.max_index_found:
+ mov rax,r8 ; rax temp for bitmask of current algorithm
+ test rax,rdx ; algorithm disabled by parameter 3?
+ jnz .skip_algorithm
+ mov [rsi+8*r15],rcx ; store the selected impl_id in the public array
+
+
+.skip_algorithm:
+ add r8,r8 ; shift bitmask of current algorithm one position to the left
+ add r15,1
+ jmp .loop_algorithm
+
+.end_loop_algorithm:
+
+.return:
+
+ ; restore the saved registers
+ mov rdi,[rsp+0*8]
+ mov rsi,[rsp+1*8]
+ mov r9 ,[rsp+2*8]
+ mov r15,[rsp+3*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+;----------------------------------------------------------------------------------------------
+
+ ; CPUID Input EAX=01h
+ ; Feature Information Returned in the ECX Register (according to Intel Instruction Manual)
+ ;ECX bit
+ ;-> 0 SSE3 Streaming SIMD Extensions 3
+ ; 1 PCLMULQDQ
+ ; 2 DTES64 64-bit DS Area.
+ ; 3 MONITOR MONITOR/MWAIT.
+ ; 4 DS-CPL CPL Qualified Debug Store.
+ ; 5 VMX Virtual Machine Extensions.
+ ; 6 SMX Safer Mode Extensions.
+ ; 7 EIST Enhanced Intel SpeedStep® technology.
+ ; 8 TM2 Thermal Monitor 2.
+ ;-> 9 SSSE3
+ ; 10 CNXT-ID L1 Context ID.
+ ; 11 SDBG
+ ; 12 FMA
+ ; 13 CMPXCHG16B
+ ; 14 xTPR Update Control
+ ; 15 PDCM Perfmon and Debug Capability.
+ ; 16 Reserved
+ ; 17 PCID Process-context identifiers.
+ ; 18 DCA
+ ;-> 19 SSE4_1
+ ;-> 20 SSE4_2
+ ; 21 x2APIC
+ ; 22 MOVBE
+ ; 23 POPCNT
+ ; 24 TSC-Deadline
+ ; 25 AESNI
+ ; 26 XSAVE
+ ; 27 OSXSAVE
+ ;-> 28 AVX
+ ;-> 29 F16C
+ ; 30 RDRAND
+ ; 31 Not Used
+
+ ; CPUID Input EAX=01h
+ ; Feature Information Returned in the EDX Register (according to Intel Instruction Manual)
+ ; EDX bit
+ ; 0 FPU Floating Point Unit On-Chip.
+ ; 1 VME Virtual 8086 Mode Enhancements.
+ ; 2 DE Debugging Extensions.
+ ; 3 PSE Page Size Extension.
+ ; 4 TSC Time Stamp Counter.
+ ; 5 MSR Model Specific Registers RDMSR and WRMSR Instructions.
+ ; 6 PAE Physical Address Extension.
+ ; 7 MCE Machine Check Exception.
+ ; 8 CX8 CMPXCHG8B Instruction.
+ ; 9 APIC APIC On-Chip.
+ ; 10 Reserved
+ ; 11 SEP SYSENTER and SYSEXIT Instructions.
+ ; 12 MTRR Memory Type Range Registers
+ ; 13 PGE Page Global Bit.
+ ; 14 MCA Machine Check Architecture.
+ ; 15 CMOV Conditional Move Instructions.
+ ; 16 PAT Page Attribute Table.
+ ; 17 PSE-36 36-Bit Page Size Extension.
+ ; 18 PSN Processor Serial Number.
+ ; 19 CLFSH CLFLUSH Instruction.
+ ; 20 Reserved
+ ; 21 DS Debug Store.
+ ; 22 ACPI Thermal Monitor and Software Controlled Clock Facilities.
+ ; 23 MMX Intel MMX Technology.
+ ; 24 FXSR FXSAVE and FXRSTOR Instructions.
+ ; 25 SSE SSE.
+ ;-> 26 SSE2 SSE2.
+ ; 27 SS Self Snoop.
+ ; 28 HTT Max APIC IDs reserved field is Valid.
+ ; 29 TM Thermal Monitor.
+ ; 30 Reserved
+ ; 31 PBE Pending Break Enable.
+ ;
+
+ ; CPUID Input EAX=07H
+ ; Feature Information returned in the EAX-EDX Registers (according to Intel Instruction Set extension Manual)
+
+; EBX bits
+ ; EBX Bit00: FSGSBASE. Supports RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE if 1.
+ ; EBX Bit01: IA32_TSC_ADJUST MSR is supported if 1.
+ ; EBX Bit02: SGX
+ ; EBX Bit03: BMI1
+ ; EBX Bit04: HLE
+ ;-> EBX Bit05: Intel® AVX2
+ ; EBX Bit06: FDP_EXCPTN_ONLY. x87 FPU Data Pointer updated only on x87 exceptions if 1.
+ ; EBX Bit07: SMEP. Supports Supervisor Mode Execution Protection if 1.
+ ; EBX Bit08: BMI2
+ ; EBX Bit09: Supports Enhanced REP MOVSB/STOSB if 1.
+ ; EBX Bit10: INVPCID
+ ; EBX Bit11: RTM
+ ; EBX Bit12: RDT-M. Supports Intel® Resource Director Technology (Intel® RDT) Monitoring capability if 1.
+ ; EBX Bit13: Deprecates FPU CS and FPU DS values if 1.
+ ; EBX Bit14: Intel® Memory Protection Extensions
+ ; EBX Bit15: RDT-A. Supports Intel® Resource Director Technology (Intel® RDT) Allocation capability if 1.
+ ;-> EBX Bit16: AVX512F
+ ;-> EBX Bit17: AVX512DQ
+ ; EBX Bit18: RDSEED
+ ; EBX Bit19: ADX
+ ; EBX Bit20: SMAP
+ ;-> EBX Bit21: AVX512_IFMA
+ ; EBX Bit22: Reserved
+ ; EBX Bit23: CLFLUSHOPT
+ ; EBX Bit24: CLWB
+ ; EBX Bit25: Intel Processor Trace
+ ; EBX Bit26: AVX512PF (Intel® Xeon Phi™ only.)
+ ; EBX Bit27: AVX512ER (Intel® Xeon Phi™ only.)
+ ; EBX Bit28: AVX512CD
+ ; EBX Bit29: SHA
+ ;-> EBX Bit30: AVX512BW
+ ;-> EBX Bit31: AVX512VL
+
+
+; ECX bits
+ ; ECX Bit00: PREFETCHWT1 (Intel® Xeon Phi™ only.)
+ ;-> ECX Bit01: AVX512_VBMI
+ ; ECX Bit02: UMIP. Supports user-mode instruction prevention if 1.
+ ; ECX Bit03: PKU. Supports protection keys for user-mode pages if 1.
+ ; ECX Bit04: OSPKE. If 1, OS has set CR4.PKE to enable protection keys (and the RDPKRU/WRPKRU instructions).
+ ; ECX Bit05: WAITPKG
+ ;-> ECX Bit06: AVX512_VBMI2
+ ; ECX Bit07: CET_SS. Supports CET shadow stack features if 1.
+ ; ECX Bit08: GFNI
+ ; ECX Bit09: VAES
+ ; ECX Bit10: VPCLMULQDQ
+ ;-> ECX Bit11: AVX512_VNNI
+ ;-> ECX Bit12: AVX512_BITALG
+ ; ECX Bit13: TME_EN.
+ ;-> ECX Bit14: AVX512_VPOPCNTDQ
+ ; ECX Bit15: Reserved
+ ; ECX Bit16: LA57. Supports 57-bit linear addresses and five-level paging if 1.
+ ; ECX Bits 21-17: The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode.
+ ; ECX Bit22: RDPID and IA32_TSC_AUX are available if 1.
+ ; ECX Bit23: KL. Supports Key Locker if 1.
+ ; ECX Bit24: Reserved
+ ; ECX Bit25: CLDEMOTE. Supports cache line demote if 1.
+ ; ECX Bit26: Reserved
+ ; ECX Bit27: MOVDIRI. Supports MOVDIRI if 1.
+ ; ECX Bit28: MOVDIR64B. Supports MOVDIR64B if 1.
+ ; ECX Bit29: ENQCMD: Supports Enqueue Stores if 1.
+ ; ECX Bit30: SGX_LC. Supports SGX Launch Configuration if 1.
+ ; ECX Bit31: PKS. Supports protection keys for supervisor-mode pages if 1.
+
+; EDX bits
+ ; EDX Bits 01-00: Reserved
+ ; EDX Bit02: AVX512_4VNNIW (Intel® Xeon Phi™ only.)
+ ; EDX Bit03: AVX512_4FMAPS (Intel® Xeon Phi™ only.)
+ ; EDX Bit04: Fast Short REP MOV
+ ; EDX Bit05: UINTR. If 1, the processor supports user interrupts.
+ ; EDX Bits 07-06: Reserved
+ ;-> EDX Bit08: AVX512_VP2INTERSECT
+ ; EDX Bit09: Reserved
+ ; EDX Bit10: MD_CLEAR supported.
+ ; EDX Bits 13-11: Reserved
+ ; EDX Bit14: SERIALIZE
+ ; EDX Bit15: Hybrid. If 1, the processor is identified as a hybrid part.
+ ; EDX Bit16: TSXLDTRK. If 1, the processor supports Intel TSX suspend load address tracking.
+ ; EDX Bit17: Reserved
+ ; EDX Bit18: PCONFIG
+ ; EDX Bit19: Reserved
+ ; EDX Bit20: CET_IBT. Supports CET indirect branch tracking features if 1.
+ ; EDX Bit21: Reserved
+ ;-> EDX Bit22: AMX-BF16. If 1, the processor supports tile computational operations on bfloat16 numbers.
+ ;-> EDX Bit23: AVX512_FP16
+ ;-> EDX Bit24: AMX-TILE. If 1, the processor supports tile architecture
+ ;-> EDX Bit25: AMX-INT8. If 1, the processor supports tile computational operations on 8-bit integers.
+ ; EDX Bit26: Enumerates support for indirect branch restricted speculation (IBRS) and the indirect branch predictor barrier (IBPB).
+ ; EDX Bit27: Enumerates support for single thread indirect branch predictors (STIBP).
+ ; EDX Bit29: Enumerates support for the IA32_ARCH_CAPABILITIES MSR.
+ ; EDX Bit30: Enumerates support for the IA32_CORE_CAPABILITIES MSR.
+ ; EDX Bit31: Enumerates support for Speculative Store Bypass Disable (SSBD).
+
+
+
+%define STACK_ADJ 0x28+6*8
+
+; XCR0 state component bits (read with XGETBV, ECX=0) which the operating system must
+; have enabled before the corresponding instruction sets may be executed:
+; bit 1 SSE state + bit 2 AVX (YMM) state
+%define XCR0_YMM_STATE 0x00000006
+; bit 5 opmask + bit 6 ZMM_Hi256 + bit 7 Hi16_ZMM
+%define XCR0_ZMM_STATE 0x000000E0
+; bit 17 TILECFG + bit 18 TILEDATA
+%define XCR0_AMX_STATE 0x00060000
+
+; capability bits which are only usable with the matching XCR0 state enabled
+%define CAPS_NEED_YMM_STATE ((1<<CPU_HAS_AVX)|(1<<CPU_HAS_F16C)|(1<<CPU_HAS_AVX2))
+; CPU_HAS_AVX512_F .. CPU_HAS_AVX512_FP16 are numbered contiguously
+%define CAPS_NEED_ZMM_STATE ((1<<(CPU_HAS_AVX512_FP16+1))-(1<<CPU_HAS_AVX512_F))
+%define CAPS_NEED_AMX_STATE ((1<<CPU_HAS_AMX_TILE)|(1<<CPU_HAS_AMX_BF16)|(1<<CPU_HAS_AMX_INT8))
+
+; get_instr_info takes no parameters and returns in rax a bitmask with
+; CPU_IS_ARCH_X86_64 plus one bit (1<<CPU_HAS_xxx) for every detected feature.
+; Features of CPUID leaf 7 are only read when the CPU implements that leaf, and
+; AVX / AVX512 / AMX features are only reported when the operating system has enabled
+; the register state they need (OSXSAVE set and the XCR0 bits above).
+; All registers used besides rax are saved and restored.
+
+get_instr_info:
+
+ sub rsp,STACK_ADJ
+
+ mov [rsp+0*8],rbx
+ mov [rsp+1*8],rcx
+ mov [rsp+2*8],rdx
+ mov [rsp+3*8],r8
+ mov [rsp+4*8],r9
+ mov [rsp+5*8],r15
+
+
+ ; NOTE: the upper bits 32-63 of the corresponding 64bit register are zeroed on 32bit movs!
+ ; so it is easy to adapt the scheme to more CPU-features occupying the upper 32 bits
+ mov r9d,CPU_IS_ARCH_X86_64
+
+;LEAF_00H
+ xor eax,eax
+ cpuid
+ mov r15d,eax ; r15d = highest basic CPUID leaf implemented
+
+;LEAF_01H
+ mov eax,0x01
+ cpuid
+
+;ECX
+ mov r8d,ecx
+ and r8d,1<<0
+ shl r8d,CPU_HAS_SSE3-0
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<9
+ shl r8d,CPU_HAS_SSSE3-9
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<19
+ shr r8d,19-CPU_HAS_SSE4_1
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<20
+ shr r8d,20-CPU_HAS_SSE4_2
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<28
+ shr r8d,28-CPU_HAS_AVX
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<29
+ shr r8d,29-CPU_HAS_F16C
+ or r9d,r8d
+
+
+;EDX
+ mov r8d,edx
+ and r8d,1<<26
+ shr r8d,26-CPU_HAS_SSE2
+ or r9d,r8d
+
+
+;LEAF_07H
+ cmp r15d,0x07
+ jb .check_os_state ; leaf 7 not implemented: its registers would not hold feature bits
+
+ mov eax,0x07
+ mov ecx,0
+ cpuid
+
+;EBX
+ mov r8d,ebx
+ and r8d,1<<5
+ shl r8d,CPU_HAS_AVX2-5
+ or r9d,r8d
+
+ mov r8d,ebx
+ and r8d,1<<16
+ shl r8d,CPU_HAS_AVX512_F-16
+ or r9d,r8d
+
+ mov r8d,ebx
+ and r8d,1<<17
+ shl r8d,CPU_HAS_AVX512_DQ-17
+ or r9d,r8d
+
+ mov r8d,ebx
+ and r8d,1<<21
+ shr r8d,21-CPU_HAS_AVX512_IFMA
+ or r9d,r8d
+
+ mov r8d,ebx
+ and r8d,1<<30
+ shr r8d,30-CPU_HAS_AVX512_BW
+ or r9d,r8d
+
+ mov r8d,ebx
+ and r8d,1<<31
+ shr r8d,31-CPU_HAS_AVX512_VL
+ or r9d,r8d
+
+;ECX
+ mov r8d,ecx
+ and r8d,1<<1
+ shl r8d,CPU_HAS_AVX512_VBMI-1
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<6
+ shl r8d,CPU_HAS_AVX512_VBMI2-6
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<11
+ shl r8d,CPU_HAS_AVX512_VNNI-11
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<12
+ shl r8d,CPU_HAS_AVX512_BITALG-12
+ or r9d,r8d
+
+ mov r8d,ecx
+ and r8d,1<<14
+ shl r8d,CPU_HAS_AVX512_VPOPCNTDQ-14
+ or r9d,r8d
+
+;EDX
+ mov r8d,edx
+ and r8d,1<<8
+ shl r8d,CPU_HAS_AVX512_VP2INTERSECT-8
+ or r9d,r8d
+
+ mov r8d,edx
+ and r8d,1<<22
+ shl r8d,CPU_HAS_AMX_BF16-22
+ or r9d,r8d
+
+ mov r8d,edx
+ and r8d,1<<23
+ shl r8d,CPU_HAS_AVX512_FP16-23
+ or r9d,r8d
+
+ mov r8d,edx
+ and r8d,1<<24
+ shl r8d,CPU_HAS_AMX_TILE-24
+ or r9d,r8d
+
+ mov r8d,edx
+ and r8,1<<25
+ shl r8,CPU_HAS_AMX_INT8-25
+ or r9,r8
+
+; example for CPU_HAS_property_GT_31
+; mov r8d,edx
+; and r8,1<<26
+; shl r8,CPU_HAS_PROPERTY_GT_31-26
+; or r9,r8
+
+
+;OS_STATE
+; A CPUID feature bit only says that the CPU implements the instructions. Executing
+; them faults unless the operating system has enabled the register state in XCR0.
+.check_os_state:
+ mov eax,0x01
+ cpuid
+ xor eax,eax ; XCR0 is treated as 0 when the OS does not use XSAVE
+ test ecx,1<<27 ; CPUID.1:ECX.OSXSAVE
+ jz .xcr0_known
+ xor ecx,ecx ; select XCR0
+ xgetbv ; edx:eax = XCR0
+.xcr0_known:
+ mov r15d,eax ; r15d = XCR0[31:0] (max leaf is no longer needed)
+
+ mov r8d,r15d
+ and r8d,XCR0_YMM_STATE
+ cmp r8d,XCR0_YMM_STATE
+ je .ymm_state_enabled
+ ; without YMM state neither the AVX nor the AVX512 family is usable
+ mov r8d,CAPS_NEED_YMM_STATE|CAPS_NEED_ZMM_STATE
+ not r8 ; 64 bit inverse keeps all capability bits above bit 31
+ and r9,r8
+.ymm_state_enabled:
+
+ mov r8d,r15d
+ and r8d,XCR0_ZMM_STATE
+ cmp r8d,XCR0_ZMM_STATE
+ je .zmm_state_enabled
+ mov r8d,CAPS_NEED_ZMM_STATE
+ not r8
+ and r9,r8
+.zmm_state_enabled:
+
+ ; NOTE(review): an OS may demand an additional per-process permission for AMX;
+ ; only the XCR0 enable bits are checked here
+ mov r8d,r15d
+ and r8d,XCR0_AMX_STATE
+ cmp r8d,XCR0_AMX_STATE
+ je .amx_state_enabled
+ mov r8d,CAPS_NEED_AMX_STATE
+ not r8
+ and r9,r8
+.amx_state_enabled:
+
+
+ mov rax,r9
+
+ mov rbx,[rsp+0*8]
+ mov rcx,[rsp+1*8]
+ mov rdx,[rsp+2*8]
+ mov r8 ,[rsp+3*8]
+ mov r9 ,[rsp+4*8]
+ mov r15,[rsp+5*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+;----------------------------------------------------------------------------------------------
+%endif
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/encode.c b/postgresql-15devel/src/backend/utils/adt/encode.c
index 6dd93f9..7c37989 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/encode.c
+++ b/postgresql-15devel/src/backend/utils/adt/encode.c
@@ -19,6 +19,7 @@
#include "utils/builtins.h"
#include "utils/memutils.h"
+#define ALGORITHM_ID_HEX_ENCODE 0
/*
* Encoding conversion API.
@@ -39,6 +40,28 @@ struct pg_encoding
static const struct pg_encoding *pg_find_encoding(const char *name);
+
+ /* TODO BEGIN of block which should be moved to global initialization */
+
+/*
+ * CPU capability state, filled lazily by hex_enc_len() on its first call.
+ *
+ * cpu_capabilities_unmasked: value reported by apply_cpu_capabilities();
+ *		negative means "not determined yet".
+ * cpu_capabilities: cpu_capabilities_unmasked & cpu_capabilities_mask;
+ *		negative means the initialization has not run yet.
+ * cpu_capabilities_mask: -1 keeps every reported capability bit.
+ * algorithm_disable_mask: bit n set leaves the entry of algorithm id n in
+ *		valid_impl_id_arr at zero.
+ * valid_impl_id_arr: per algorithm id the implementation id selected by
+ *		apply_cpu_capabilities(); zero means no fast implementation.
+ */
+static int64 cpu_capabilities_unmasked = -1;
+static int64 cpu_capabilities = -1;
+static int64 cpu_capabilities_mask = -1;
+static int64 algorithm_disable_mask = 0;
+
+static int64 valid_impl_id_arr[64];
+
+/*
+ * Implemented in cpu_capabilities_x86_64.asm.
+ * NOTE(review): the assembler routine does not load a result into rax before
+ * returning, so the size_t result looks meaningless - confirm and consider void.
+ */
+extern size_t apply_cpu_capabilities (int64 *capabilities, int64 *impl_id_arr, int64 mask);
+
+
+/*
+ * Implemented in hex_x86_64.asm: number of bytes to add to the result buffer
+ * allocation for a request of srclen bytes, given the selected impl_id.
+ */
+extern size_t get_hex_encode_alloc_addon (size_t srclen, int64 impl_id);
+
+extern size_t get_hex_decode_alloc_addon (size_t srclen, int64 impl_id);
+
+/*
+ * Implemented in hex_x86_64.asm: dispatcher to the encode implementations.
+ * Returns zero when no implementation applies; hex_encode() treats the result
+ * as the number of source bytes that have already been encoded.
+ */
+extern size_t hex_encode_fast (char *dst, const char *src, size_t srclen, int64 impl_id);
+
+ /* END init */
+
+
/*
* SQL functions.
*/
@@ -161,14 +184,46 @@ uint64
hex_encode(const char *src, size_t len, char *dst)
{
const char *end = src + len;
+ /* number of source bytes already encoded by the assembler fast path */
+ size_t n_done = 0;
+ /*
+ * The fast path is only asked to encode len - len_reduce bytes; the rest
+ * is encoded by the scalar loop below.
+ * NOTE(review): presumably this keeps the vector routines' overshoot inside
+ * the caller's buffers - confirm against hex_x86_64.asm.
+ */
+ const size_t len_reduce = 256;
- while (src < end)
+ /* TODO BEGIN of block which should be moved to global initialization */
+
+ /*
+ * Check the CPU capabilities only once.
+ * we call it through hex_enc_len in case this has not been called before
+ */
+ if (cpu_capabilities < 0)
{
- *dst++ = hextbl[(*src >> 4) & 0xF];
- *dst++ = hextbl[*src & 0xF];
- src++;
+ /* called for its initialization side effect only */
+ (void) hex_enc_len(src, len);
+ }
+
+ /* END init */
+
+#if defined(__x86_64__) || defined(_M_AMD64)
+ if (len >= 512)
+ {
+ n_done = hex_encode_fast(dst,src,len-len_reduce,valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
}
- return (uint64) len * 2;
+#endif
+
+ /* scalar encoding of everything the fast path has left over */
+ if (n_done < len)
+ {
+ src += n_done;
+ dst += n_done<<1;
+ while (src < end)
+ {
+ *dst++ = hextbl[(*src >> 4) & 0xF];
+ *dst++ = hextbl[*src & 0xF];
+ src++;
+ }
+ }
+
+ return (uint64) len << 1;
}
static inline char
@@ -223,13 +278,38 @@ hex_decode(const char *src, size_t len, char *dst)
return p - dst;
}
-static uint64
+uint64
hex_enc_len(const char *src, size_t srclen)
{
- return (uint64) srclen << 1;
+ /*
+ * Returns the buffer size needed to hex-encode srclen bytes: two output
+ * bytes per input byte, plus (on x86-64) the allocation addon requested by
+ * the selected assembler implementation. src is not examined.
+ */
+
+ /* TODO BEGIN of block which should be moved to global initialization */
+
+ /*
+ * Check the CPU capabilities only once.
+ * When cpu_capabilities is not set (is < 0) we call the architecture-
+ * dependent instruction information.
+ * An architecture supported for ASM/SIMD acceleration returns a positive
+ * value, for all other (not yet) supported architectures we set it to 0.
+ */
+#if defined(__x86_64__) || defined(_M_AMD64)
+ if (cpu_capabilities < 0)
+ {
+ apply_cpu_capabilities(&cpu_capabilities_unmasked, valid_impl_id_arr, algorithm_disable_mask);
+ cpu_capabilities = cpu_capabilities_unmasked&cpu_capabilities_mask;
+ /* int64 must be printed with INT64_FORMAT; debug level keeps clients quiet */
+ elog(DEBUG1,"post_apply cpu_capabilities = " INT64_FORMAT,cpu_capabilities);
+ elog(DEBUG1,"post_apply valid_impl_id_0 = " INT64_FORMAT,valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
+
+ }
+
+ /* END init */
+
+ return (uint64) (srclen << 1) +
+ get_hex_encode_alloc_addon(srclen,valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
+#else
+ cpu_capabilities = 0;
+
+ /* END init */
+
+ /* the assembler helpers only exist on x86-64: no allocation addon here */
+ return (uint64) srclen << 1;
+#endif
}
-static uint64
+uint64
hex_dec_len(const char *src, size_t srclen)
{
return (uint64) srclen >> 1;
diff --git a/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm b/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm
new file mode 100644
index 0000000..c2fd0c6
--- /dev/null
+++ b/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm
@@ -0,0 +1,2915 @@
+%ifdef __NASM_MAJOR__
+%ifdef COMPILE_C_STYLE_COMMENTS
+/*-------------------------------------------------------------------------
+ *
+ * hex_x86_64.asm
+ * Assembler routines for converting a buffer to hex (hex_encode_xxx)
+ * and restore the binary from hex code (hex_decode_xxx) on Intel X64
+ *
+ * Copyright (c) 2021-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/hex_x86_64.asm
+ *
+ *-------------------------------------------------------------------------
+ */
+%endif
+
+
+
+
+
+
+; hex_x86_64.asm
+; Assembler routines for converting a buffer to hex (hex_encode_xxx)
+; and restore the binary from hex_code (hex_decode_xxx) on Intel X64
+
+
+; nasm -f WIN64 -g hex_x86_64.asm -l hex_x86_64.lis
+
+; golink /console hexdump.obj hex_x86_64.obj base64_x86_64.obj /files
+
+; Linux register order: %rdi, %rsi, %rdx, %rcx, %r8 and %r9
+; Windows register order: rcx, rdx, r8, r9
+
+; Windows non volatile registers: rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
+; Linux non volatile registers: rbx,rbp, rsp, r12,r13,r14,r15
+
+; nasm -f elf64 -g hex_x86_64.asm -l hex_x86_64_elf64.lis
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define __WIN__ 1
+%elifidn __OUTPUT_FORMAT__, elf64
+%define __ELF__ 1
+%endif
+
+%define NSHIFT_ADDRESS_TO_PAGE 12
+
+%define N_BYTES_PER_SSE2 16
+%define N_BYTES_PER_AVX2 32
+%define N_BYTES_PER_AVX512 64
+
+global get_hex_encode_alloc_addon
+global get_hex_decode_alloc_addon
+
+global hex_encode_fast
+
+global hex_encode_sse2
+global hex_encode_ssse3
+global hex_encode_avx2
+global hex_encode_avx512bw
+
+global hex_decode_sse2
+global hex_decode_avx2
+global hex_decode_avx512bw
+
+
+default rel
+
+section .rdata align=64
+
+; values loaded with VMOVDQA64 in AVX512, so 64 bytes needed
+
+%define VPERM_AVX2_OFFS 0b11_01_10_00
+
+VPERM_ENCODE_OFFSETS dq 0,4,1,5,2,6,3,7
+VPERM_DECODE_OFFSETS dq 0,2,4,6,1,3,5,7
+
+ENCODE_SHUFFLE_TO_HEX times 4 db '0123456789abcdef'
+
+ENCODE_SHUFFLE_TO_HIGH_LOW times 4 db 8,0,9,1, 10,2,11,3, 12,4,13,5, 14,6,15,7
+
+
+; from here on values used with VPBROADCASTQ in AVX512 / VMOVDQA in AVX2, so only 16/32 bytes needed
+
+;BITMASK_UPPER_HALF times 32 db 0b1111_0000
+BITMASK_LOWER_HALF times 32 db 0b0000_1111
+
+BITMASK_NIBBLE_3_IN_WORD times 16 dw 0x0F00
+
+BITMASK_LITTLE_TO_BIG_ASCII times 32 db 0b1101_1111
+BITMASK_BIG_TO_LITTLE_ASCII times 32 db 0b0010_0000
+
+BITMASK_ZERO_ONE times 32 db 0b0101_0101
+
+BITMASK_ONE_ZERO times 32 db 0b1010_1010
+
+BITMASK_SELECT_DIGIT times 32 db 0b0011_1111
+
+ALL_BYTES_9 times 32 db 9
+
+ASCII_LITTLE_A_ADD:
+ALL_BYTES_39 times 32 db 39
+
+ASCII_0_OFFSET:
+ALL_BYTES_48 times 32 db 48
+
+;ASCII_DIGIT_9 times 32 db 48+9
+
+ASCII_LETTER_LITTLE_A times 32 db 'a'
+ASCII_LETTER_LITTLE_F times 32 db 'f'
+
+HEX_ENCODE_ARRAYS:
+HEX_ENC_MIN_SRC_LEN_ARR:
+ dq 0
+ dq 128
+ dq 512
+ dq 512
+ dq 1024
+
+HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR:
+ dq 0
+ dq 64
+ dq 128
+ dq 128
+ dq 256
+
+HEX_ENC_IMPL_ROUTINE_ARR:
+ dq 0
+ dq hex_encode_sse2
+ dq hex_encode_ssse3
+ dq hex_encode_avx2
+ dq hex_encode_avx512bw
+
+
+HEX_DECODE_ARRAYS:
+HEX_DEC_MIN_SRC_LEN_ARR:
+ dq 0
+ dq 128
+ dq 512
+ dq 1024
+
+HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR:
+ dq 0
+ dq 64
+ dq 128
+ dq 256
+
+HEX_DEC_IMPL_ROUTINE_ARR:
+ dq 0
+ dq hex_decode_sse2
+ dq hex_decode_avx2
+ dq hex_decode_avx512bw
+
+
+
+section .text align=32
+
+
+%use smartalign
+
+ ALIGNMODE p6
+
+%ifdef __WIN__
+%define STACK_FOR_XMM 10*16
+%else
+%define STACK_FOR_XMM 0
+%endif
+
+;----------------------------------------------------------------------------------------------
+
+; get_hex_encode_alloc_addon returns the tail-handling-required allocation addon
+; according to the request length and the maximum valid impl_id
+; it looks for the correct values in the hex_enc_tables indexed by impl_id
+
+get_hex_encode_alloc_addon:
+
+ sub rsp,0x28
+
+%ifndef __WIN__
+ ; not Windows: move the arguments into the registers Windows passes them in
+ mov rcx,rdi ; rcx = requested source len
+ mov rdx,rsi ; rdx = maximum valid impl_id
+%endif
+
+ lea r8,[HEX_ENC_MIN_SRC_LEN_ARR]
+.try_impl:
+ cmp rcx,[r8+8*rdx] ; request long enough for implementation rdx?
+ jge .impl_selected
+ dec rdx ; fall back to the next lower impl_id
+ jnz .try_impl ; impl_id 0 ends the search
+.impl_selected:
+ lea r8,[HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR]
+ mov rax,[r8+8*rdx] ; rax = allocation addon of the selected impl_id
+
+ add rsp,0x28
+
+ ret
+
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; get_hex_decode_alloc_addon returns the tail-handling-required allocation addon
+; according to the request length and the maximum valid impl_id
+; It looks for the correct values in the hex_dec tables indexed by impl_id
+
+get_hex_decode_alloc_addon:
+
+ sub rsp,0x28
+
+%ifndef __WIN__
+ ; not Windows: move the arguments into the registers Windows passes them in
+ mov rcx,rdi ; rcx = requested source len
+ mov rdx,rsi ; rdx = maximum valid impl_id
+%endif
+
+ lea r8,[HEX_DEC_MIN_SRC_LEN_ARR]
+.try_impl:
+ cmp rcx,[r8+8*rdx] ; request long enough for implementation rdx?
+ jge .impl_selected
+ dec rdx ; fall back to the next lower impl_id
+ jnz .try_impl ; impl_id 0 ends the search
+.impl_selected:
+ lea r8,[HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR]
+ mov rax,[r8+8*rdx] ; rax = allocation addon of the selected impl_id
+
+ add rsp,0x28
+
+ ret
+
+
+
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; hex_encode_fast is the dispatcher routine according to the cpu capabilities and
+; the length of the encode request.
+;
+; Parameter 4 (moved to r15) is the maximum valid impl_id fulfilling the cpu requirements
+; (determined at program initialization time outside this routine)
+; The index into the HEX_ENCODE_ARRAYS is set to the maximum supported requirements.
+; When r15 == 0 no fast encode is supported and a zero length is returned.
+
+%define STACK_ADJ 0x28+2*8
+
+; parameter 1 output buffer, parameter 2 input buffer, parameter 3 number of elements,
+; parameter 4 maximum valid impl_id.
+; Parameters 1-3 stay in their registers and are consumed by the selected implementation.
+; Returns zero in rax when no implementation applies, otherwise whatever the selected
+; implementation leaves in rax.
+
+hex_encode_fast:
+
+ sub rsp,STACK_ADJ
+
+ ; The lowest 4*8 bytes of this frame are the home (shadow) space which the Windows
+ ; calling convention hands to the routine called below, so the saved registers
+ ; are kept above that area.
+ mov [rsp+4*8],r9
+ mov [rsp+5*8],r15
+
+
+ ; r15 = checked highest valid index
+%ifdef __WIN__
+ mov rax,r8 ; WIN parameter 3 number of elements
+ mov r15,r9 ; WIN parameter 4 maximum valid impl_id
+%else
+ mov rax,rdx ; LINUX parameter 3 number of elements
+ mov r15,rcx ; LINUX parameter 4 maximum valid impl_id
+%endif
+
+ lea r10,[HEX_ENC_MIN_SRC_LEN_ARR]
+
+.check_length:
+ cmp rax,[r10+8*r15] ; request long enough for implementation r15?
+ jge .max_length_found
+ sub r15,1 ; try the next lower impl_id
+ jnz .check_length
+
+.max_length_found:
+ xor rax,rax ; default result: nothing encoded
+ cmp r15,0
+ jz .return
+
+ lea r10,[HEX_ENC_IMPL_ROUTINE_ARR]
+ call [r10+8*r15] ; run the selected implementation
+
+.return:
+ mov r9,[rsp+4*8]
+ mov r15,[rsp+5*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+
+
+%define STACK_ADJ 0x28+6*8+STACK_FOR_XMM
+
+
+
+;----------------------------------------------------------------------------------------------
+
+
+; xmm15 ; CONST ALL bytes 9
+; xmm14 ; CONST BITMASK_LOWER_HALF
+; xmm13 ; CONST ASCII_0_OFFSET
+; xmm12 ; CONST ASCII_LITTLE_A_ADD
+; xmm11 ; Prefetch Input line 3
+; xmm10 ; Prefetch Input line 2
+; xmm9 ; Input Line 1
+; xmm8 ; Input Line 0
+
+; xmm7 ; Unpack RL1 Rght Half low bits secnd line
+; xmm6 ; Unpack RH1 Rght Half high bits secnd line
+; xmm5 ; Unpack LL1 Left Half low bits secnd line
+; xmm4 ; Unpack LH1 Left Half high bits secnd line
+; xmm3 ; Unpack RL0 Rght Half low bits first line
+; xmm2 ; Unpack RH0 Rght Half high bits first line
+; xmm1 ; Unpack LL0 Left Half low bits first line
+; xmm0 ; Unpack LH0 Left Half high bits first line
+
+
+
+%define NINP_BYTES_PER_ROUND 2*16
+%define NINP_BITSHIFT 5
+; hex_encode_sse2: emit two lowercase hex characters per input byte; 32 input / 64 output bytes per round
+hex_encode_sse2:
+; in : rdi = output buffer, rsi = input buffer, rdx = number of input bytes (rcx, rdx, r8 under __WIN__); out: rax = input bytes done
+ sub rsp,STACK_ADJ
+; spill the general registers that are overwritten below; they are reloaded right before ret
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+; NOTE(review): MOVDQA faults on unaligned addresses - assumes STACK_ADJ (defined outside this view) keeps rsp 16-byte aligned
+ MOVDQA [rsp ],xmm6
+ MOVDQA [rsp+1*16],xmm7
+ MOVDQA [rsp+2*16],xmm8
+ MOVDQA [rsp+3*16],xmm9
+ MOVDQA [rsp+4*16],xmm10
+ MOVDQA [rsp+5*16],xmm11
+ MOVDQA [rsp+6*16],xmm12
+ MOVDQA [rsp+7*16],xmm13
+ MOVDQA [rsp+8*16],xmm14
+ MOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+
+; NOTE(review): the two loads below always read 32 bytes, before any length or page check is made
+;; initializer for QQ0 and QQ1 (round 0 is loaded ahead of the loop)
+
+ MOVDQU xmm8,[rsi+0*16] ; QQ0 p__23__ p__23__ l8 QL0
+ MOVDQU xmm9,[rsi+1*16] ; QQ1 p__23__ p__23__ l8 QL0
+
+;; initialize constants
+
+ MOVDQA xmm15,[ALL_BYTES_9] ; p_23__ l3 threshold: nibble values above 9 are letters
+
+ MOVDQA xmm14,[BITMASK_NIBBLE_3_IN_WORD] ; p_23__ l3
+
+; xmm13 (offset 48, the digit '0') is computed below as 39+9 instead of being loaded from memory
+
+ MOVDQA xmm12,[ALL_BYTES_39] ; p_23__ l3 extra offset for letters (10+48+39 = 'a')
+
+
+ MOVDQA xmm13,xmm12
+ PADDB xmm13,xmm15 ; 48 = 39+9
+
+
+;; do page overshoot checks
+
+ mov rax,NINP_BYTES_PER_ROUND
+
+
+ mov r9,rdx ; exact requested number of elements to process
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+
+ lea r8,[rdx+NINP_BYTES_PER_ROUND-1]
+ shr r8,NINP_BITSHIFT ; number of loops
+ shl r8,NINP_BITSHIFT
+ add r8,rsi ; r8 address of last byte+1 read in complete loops
+ add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot
+
+ mov r11,r8
+; NOTE(review): with the add below disabled r11 equals r8, so both page tests that follow decide identically
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+ shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte read after normal round
+
+ cmp rcx,r8 ; stay on same page
+ je .LSAME_PAGE_IN_ROUND
+ sub rdx,rax ; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NINP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+ add rsi,rax ; add the number of processed array elements
+
+ align 32
+
+; Loop invariant: xmm8/xmm9 hold the 32 input bytes of the current round, rsi points one round ahead.
+; Each pass interleaves both lines bytewise, splits every byte into two nibbles placed in one word,
+; maps the nibbles to ASCII and stores 4x16 output bytes, while the next round is loaded into xmm8/xmm9.
+
+.LHEXENCODE_LOOP:
+
+ MOVDQA xmm6,xmm8
+ PUNPCKLBW xmm6,xmm9 ; QL0 p____5 l1 bytes 0-7: each word = [line 1 byte : line 0 byte]
+
+
+ MOVDQA xmm7,xmm8
+ PUNPCKHBW xmm7,xmm9 ; QL0 p____5 l1 bytes 8-15: each word = [line 1 byte : line 0 byte]
+
+ ; FIX: xmm8/xmm9 are no longer needed in this pass, so the next round is loaded into them directly;
+ ; the former copies from xmm10/xmm11 read registers that were never loaded before the first pass
+ MOVDQU xmm8,[rsi+0*16] ; QL0 p_____5 p1____5 l3+ next round, line 0
+
+
+;;
+ MOVDQA xmm4,xmm6
+ PSRLW xmm4,12 ; RL2 shift RL2 Hx to lower byte in word
+ MOVDQA xmm5,xmm6
+ PAND xmm5,xmm14 ; RL2 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+ ; direct load of the next round, line 1 (see FIX above)
+
+ MOVDQU xmm9,[rsi+1*16] ; QL1 p_____5 p1____5 l3+ next round, line 1
+
+ add rsi,rax ; add the number of processed array elements
+
+
+ PSLLW xmm6,8 ; RL0 rotate (shift) RL0 1 byte to left
+
+ MOVDQA xmm0,xmm6
+ PSRLW xmm0,4+8 ; RL0 shift RL0 Hx to lower byte in word
+ POR xmm4,xmm5 ; RL2 low nibble, high nibble at correct position (0L0H)
+
+
+
+ MOVDQA xmm1,xmm6
+ PAND xmm1,xmm14 ; RL0 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+ MOVDQA xmm6,xmm7
+ PSLLW xmm6,8 ; RL1 rotate (shift) RL1 1 byte to left
+
+
+ MOVDQA xmm5,xmm4
+ PCMPGTB xmm5,xmm15 ; RL2 all letters set to 0xFF, all digits to 0
+
+
+ POR xmm0,xmm1 ; RL0 low nibble, high nibble at correct position (0L0H)
+
+ PADDB xmm4,xmm13 ; RL2 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+
+ MOVDQA xmm2,xmm6
+ PSRLW xmm2,4+8 ; RL1 shift RL1 Hx to lower byte in word
+ MOVDQA xmm3,xmm6
+ PAND xmm3,xmm14 ; RL1 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+ MOVDQA xmm1,xmm0
+ PCMPGTB xmm1,xmm15 ; RL0 all letters set to 0xFF, all digits to 0
+
+
+ PAND xmm5,xmm12 ; RL2 for all letters set to 39, else 0
+ ; RL2 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+
+ POR xmm2,xmm3 ; RL1 low nibble, high nibble at correct position (0L0H)
+
+ PAND xmm1,xmm12 ; RL0 for all letters set to 39, else 0
+ ; RL0 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+ PADDB xmm4,xmm5 ; RL2 final result line RL2
+
+ PADDB xmm0,xmm13 ; RL0 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+ MOVDQA xmm3,xmm2
+ PCMPGTB xmm3,xmm15 ; RL1 all letters set to 0xFF, all digits to 0
+
+ PADDB xmm2,xmm13 ; RL1 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+
+ PADDB xmm0,xmm1 ; RL0 final result line RL0
+
+ MOVDQA xmm1,xmm7
+ PSRLW xmm1,12 ; RL3 shift RL3 Hx to lower byte in word
+
+ PAND xmm3,xmm12 ; RL1 for all letters set to 39, else 0
+ ; RL1 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+ PADDB xmm2,xmm3 ; RL1 final result line RL1
+
+ MOVDQU [rdi+0*16],xmm0 ; RL0 p____4_ p____4_ l1 store hex of line 0, bytes 0-7
+
+ PAND xmm7,xmm14 ; RL3 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+ MOVDQA xmm6,xmm7
+ POR xmm6,xmm1 ; RL3 low nibble, high nibble at correct position (0L0H)
+
+
+ MOVDQU [rdi+1*16],xmm2 ; RL1 p____4_ p____4_ l1 store hex of line 0, bytes 8-15
+
+
+ MOVDQA xmm7,xmm6
+ PCMPGTB xmm7,xmm15 ; RL3 all letters set to 0xFF, all digits to 0
+
+ PADDB xmm6,xmm13 ; RL3 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+ PAND xmm7,xmm12 ; RL3 for all letters set to 39, else 0
+ ; RL3 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+ MOVDQU [rdi+2*16],xmm4 ; RL2 p____4_ p____4_ l1 store hex of line 1, bytes 0-7
+
+
+ PADDB xmm6,xmm7 ; RL3 final result line RL3
+
+ MOVDQU [rdi+3*16],xmm6 ; RL3 p____4_ p____4_ l1 store hex of line 1, bytes 8-15
+
+
+ add rdi,rcx ; add the number of processed output bytes
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXENCODE_LOOP ; NOTE(review): signed compare of two addresses
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0
+ je .LFINISH_EXTRA
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH_NORMAL
+
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adopt and condition for last round also
+ xor rax,rax ; rax = 0 marks the extra pass: rsi no longer advances, the loop is left after one more round
+ jmp .LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+ add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes
+ jmp .LFINISH
+
+.LFINISH_NORMAL:
+ sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+ ; r12 = address of requested input bytes+1
+ ; rsi = address of processed input bytes+1
+ ; the smaller of both, minus the buffer start in r10, becomes the return value in rax
+
+
+
+ mov rax,r12
+ cmp rsi,r12 ; get min from r12 (address of requested input) and rsi (address of done input)
+
+ jge .LCALC_PROCESSED_BYTES
+ mov rax,rsi ; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+ sub rax,r10 ; sub the input buffer start address
+ ; rax = number of valid processed input bytes = return value
+
+ cmp rsi,rdx ; compare rdx (loop end pointer) and rsi (address of done input)
+ je .LNO_ZERO_OUT
+ ; NOTE(review): the zero-out store is commented out, so r15 and xmm0 prepared below are not used
+ mov r15,rax ; number of elements to process
+
+ shl r15,1 ; number of output bytes
+
+ add r15,r11 ; pointer to next byte after full valid output buffer
+
+ PXOR xmm0,xmm0 ; all zero
+;ZERO MOVDQU [r15],xmm0 ; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+ MOVDQA xmm6 ,[rsp ]
+ MOVDQA xmm7 ,[rsp+1*16]
+ MOVDQA xmm8 ,[rsp+2*16]
+ MOVDQA xmm9 ,[rsp+3*16]
+ MOVDQA xmm10,[rsp+4*16]
+ MOVDQA xmm11,[rsp+5*16]
+ MOVDQA xmm12,[rsp+6*16]
+ MOVDQA xmm13,[rsp+7*16]
+ MOVDQA xmm14,[rsp+8*16]
+ MOVDQA xmm15,[rsp+9*16]
+
+
+%endif
+
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+
+;----------------------------------------------------------------------------------------------
+
+
+
+
+; xmm15 ; CONST ALL bytes 9
+; xmm14 ; running byte maximum of (input OR 0x20), initialised from ASCII_LETTER_LITTLE_F
+; xmm13 ; running byte minimum of the non-digit input bytes, initialised from ASCII_LETTER_LITTLE_A
+; xmm12 ; ORed flags of digit bytes whose low nibble is > 9 (initialised to zero)
+; xmm11 ; Input line 3
+; xmm10 ; Input line 2
+; xmm9 ; Input line 1
+; xmm8 ; Input Line 0
+; xmm7 ; CONST BITMASK_LOWER_HALF
+; xmm6 ; scratch: 0x10 bit mask, all ones, word low-byte mask
+; xmm5 ; line 3: digit flags, later merged words of line 3
+; xmm4 ; line 2: digit flags, later result bytes of lines 2+3
+; xmm3 ; scratch
+; xmm2 ; 0x20 bit mask at loop entry, scratch inside the loop
+; xmm1 ; line 1: digit flags, later merged words of line 1
+; xmm0 ; line 0: digit flags, later result bytes of lines 0+1
+; NOTE(review): xmm12-xmm14 are only accumulated; nothing in this routine evaluates them
+
+
+%define NINP_BYTES_PER_ROUND 4*16
+%define NINP_BITSHIFT 6
+; hex_decode_sse2: merge pairs of hex characters into bytes; 64 input characters / 32 output bytes per round
+hex_decode_sse2:
+; in : rdi = output buffer, rsi = input buffer, rdx = number of input characters (rcx, rdx, r8 under __WIN__); out: rax = 2 * bytes written
+ sub rsp,STACK_ADJ
+
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+ MOVDQA [rsp ],xmm6
+ MOVDQA [rsp+1*16],xmm7
+ MOVDQA [rsp+2*16],xmm8
+ MOVDQA [rsp+3*16],xmm9
+ MOVDQA [rsp+4*16],xmm10
+ MOVDQA [rsp+5*16],xmm11
+ MOVDQA [rsp+6*16],xmm12
+ MOVDQA [rsp+7*16],xmm13
+ MOVDQA [rsp+8*16],xmm14
+ MOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+; NOTE(review): round 0 (64 bytes) is loaded unconditionally, before the length in rdx is looked at
+;; initializer for QQ0 and QQ1
+
+ MOVDQU xmm8,[rsi] ;
+ MOVDQU xmm9,[rsi+1*16] ;
+
+ MOVDQU xmm10,[rsi+2*16] ;
+ MOVDQU xmm11,[rsi+3*16] ;
+
+;; initialize constants
+
+ mov r15,[BITMASK_BIG_TO_LITTLE_ASCII] ; kept in r15 to rebuild the 0x20 mask on every pass
+
+ MOVDQA xmm7,[BITMASK_LOWER_HALF]
+
+ MOVDQA xmm15,[ALL_BYTES_9] ; p_23__ l3
+
+ MOVDQA xmm14,[ASCII_LETTER_LITTLE_F] ; p_23__ l3
+
+ MOVDQA xmm13,[ASCII_LETTER_LITTLE_A] ; p_23__ l3
+
+ PXOR xmm12,xmm12 ; all zero
+
+ MOVQ xmm2,r15 ; 0b0010_0000
+
+
+;; do page overshoot checks
+;; due to end condition handling not done here, we only process full rounds
+
+ mov rax,NINP_BYTES_PER_ROUND
+
+ add rdx,NINP_BYTES_PER_ROUND-1
+ shr rdx,NINP_BITSHIFT ;
+ shl rdx,NINP_BITSHIFT ; rdx number of bytes read in normal loop equiv to xxx full loops
+
+ mov r9,rdx ; requested number of elements, rounded up to full rounds
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+; NOTE(review): r11 is rcx+1 here, so the test below only fires when the rounded-up end is page aligned
+ mov r11,r9
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NINP_BYTES_PER_ROUND>>1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+ PUNPCKLQDQ xmm2,xmm2 ; all bytes 0b0010_0000
+
+; xmm7 needs no widening: it was loaded as a full 16-byte constant above
+
+ MOVDQA xmm0,xmm2
+ MOVDQA xmm1,xmm2
+
+ MOVDQA xmm4,xmm2
+ MOVDQA xmm5,xmm2
+
+ add rsi,rax ; add the number of processed array elements
+
+ align 32
+
+; Loop invariant: xmm8-xmm11 hold the 64 input characters of the current round, rsi points one round
+; ahead, xmm0/xmm1/xmm2/xmm4/xmm5 hold the 0x20 bit mask. The body always runs at least once and
+; always loads the following round before the end test is made.
+
+.LHEXDECODE_LOOP:
+
+ MOVDQA xmm6,xmm2
+
+
+ PSRAD xmm6,1 ; all bytes 0b0001_0000
+
+ POR xmm0,xmm8 ; line 0 all letters set to little ASCII a-f
+ POR xmm1,xmm9
+ POR xmm4,xmm10
+ POR xmm5,xmm11
+
+ PMAXUB xmm14,xmm0
+ PMAXUB xmm14,xmm1
+ PMAXUB xmm14,xmm4
+ PMAXUB xmm14,xmm5
+
+;max check finished
+
+ POR xmm0,xmm6 ; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx)
+ POR xmm1,xmm6
+ POR xmm4,xmm6
+ POR xmm5,xmm6
+
+ PCMPEQD xmm6,xmm6 ; all ONE
+
+ PCMPEQB xmm0,xmm8 ; set to all ONE when ASCI Digit (forced bit 0bxx11_xxxx equal to orig value)
+ PCMPEQB xmm1,xmm9
+ PCMPEQB xmm4,xmm10
+ PCMPEQB xmm5,xmm11
+
+;start min check line0+1
+ MOVDQA xmm2,xmm0 ; copy all one when digit
+ MOVDQA xmm3,xmm1
+
+ PANDN xmm2,xmm6 ; set to all one for values NOT digits
+ PANDN xmm3,xmm6
+
+ PAND xmm2,xmm8 ; set to orig value when NOT ASCI Digit
+ PAND xmm3,xmm9
+
+ POR xmm2,xmm0 ; digit positions (zero so far) become all one, so they never lower the minimum
+ POR xmm3,xmm1
+
+ PMINUB xmm13,xmm2
+ PMINUB xmm13,xmm3
+
+
+;start min check line2+3
+ MOVDQA xmm2,xmm4 ; copy all one when digit
+ MOVDQA xmm3,xmm5
+
+
+ PANDN xmm2,xmm6 ; set to all one for values NOT digits
+ PANDN xmm3,xmm6
+
+ PAND xmm2,xmm10 ; set to orig value when NOT ASCI Digit
+ PAND xmm3,xmm11
+
+ POR xmm2,xmm4 ; digit positions (zero so far) become all one, so they never lower the minimum
+ POR xmm3,xmm5
+
+ PMINUB xmm13,xmm2
+ PMINUB xmm13,xmm3
+
+
+; start legal digit check
+
+ MOVDQA xmm2,xmm0 ; copy all one when digit
+ MOVDQA xmm3,xmm1
+
+ PAND xmm2,xmm8 ; set to orig value when ASCI Digit
+ PAND xmm3,xmm9
+
+ PAND xmm2,xmm7 ; set to lower nibble value when ASCI Digit
+ PAND xmm3,xmm7 ; FIX: was a second "PAND xmm2,xmm7", which left line 1 unmasked
+
+ PCMPGTB xmm2,xmm15 ; set to all ONE when ASCI Digit and value > 9
+ PCMPGTB xmm3,xmm15
+
+ POR xmm12,xmm2 ; accumulate illegal chars like ASCII digit and value > 9
+ POR xmm12,xmm3
+ MOVDQA xmm2,xmm4 ; FIX: lines 2+3 start from their own digit flags, not from the line 0/1 compare results
+ MOVDQA xmm3,xmm5
+ PAND xmm2,xmm10 ; set to orig value when ASCI Digit
+ PAND xmm3,xmm11
+ PAND xmm2,xmm7 ; set to lower nibble value when ASCI Digit
+ PAND xmm3,xmm7
+
+
+ PCMPGTB xmm2,xmm15 ; set to all ONE when ASCI Digit and value > 9
+ PCMPGTB xmm3,xmm15
+
+ POR xmm12,xmm2
+ POR xmm12,xmm3
+
+
+
+;-- ; checks accumulated; now convert: low nibble of every character, +9 for non-digits, then merge pairs
+ PCMPEQD xmm6,xmm6 ; all ONE
+ PSRLW xmm6,8 ; every word 0x00FF: mask for the low byte of a word
+
+ MOVDQA xmm2,xmm7 ; all bytes 0b0000_1111
+ MOVDQA xmm3,xmm7
+
+ PAND xmm2,xmm8 ; all byte values only lower half (nibble) Line 0+1
+ MOVDQU xmm8,[rsi+0*16] ; line 0 is consumed: load line 0 of the next round
+ PAND xmm3,xmm9
+ MOVDQU xmm9,[rsi+1*16] ; line 1 of the next round
+
+ PANDN xmm0,xmm15 ; put 9 to every element not DIGIT
+ PANDN xmm1,xmm15
+
+ PADDB xmm2,xmm0 ; add 9 to every nibble not DIGIT
+ PADDB xmm3,xmm1
+
+ MOVDQA xmm0,xmm2
+ PSRLW xmm0,8 ; line 0: value of the second character of a pair into the low byte of the word
+
+ PSLLW xmm2,4 ; line 0: value of the first character becomes the high nibble of the low byte
+
+ MOVDQA xmm1,xmm3
+ PSRLW xmm1,8 ; line 1: value of the second character of a pair into the low byte of the word
+
+ PSLLW xmm3,4 ; line 1: value of the first character becomes the high nibble of the low byte
+
+ POR xmm0,xmm2 ; line 0: low byte = (first << 4) | second
+ POR xmm1,xmm3 ; line 1: low byte = (first << 4) | second
+
+ PAND xmm0,xmm6 ; line 0
+ PAND xmm1,xmm6 ; line 1
+
+ PACKUSWB xmm0,xmm1 ; 8+8 words -> 16 result bytes of lines 0+1
+
+; line 0 and 1 processed
+
+
+ MOVDQA xmm2,xmm7 ; all bytes 0b0000_1111
+ MOVDQA xmm3,xmm7
+
+ PAND xmm2,xmm10 ; all byte values only lower half (nibble) Line 2+3
+ MOVDQU xmm10,[rsi+2*16] ; line 2 of the next round
+ PAND xmm3,xmm11
+ MOVDQU xmm11,[rsi+3*16] ; line 3 of the next round
+
+ PANDN xmm4,xmm15 ; put 9 to every element not DIGIT
+ PANDN xmm5,xmm15
+
+ PADDB xmm2,xmm4 ; add 9 to every nibble not DIGIT
+ PADDB xmm3,xmm5
+
+ add rsi,rax ; add the number of processed array elements
+
+ MOVDQU [rdi+0*16],xmm0 ; store the 16 result bytes of lines 0+1
+
+
+ MOVDQA xmm4,xmm2
+ PSRLW xmm4,8 ; line 2: value of the second character of a pair into the low byte of the word
+
+ PSLLW xmm2,4 ; line 2: value of the first character becomes the high nibble of the low byte
+
+ MOVDQA xmm5,xmm3
+ PSRLW xmm5,8 ; line 3: value of the second character of a pair into the low byte of the word
+
+ PSLLW xmm3,4 ; line 3: value of the first character becomes the high nibble of the low byte
+
+
+ POR xmm4,xmm2 ; line 2: low byte = (first << 4) | second
+ POR xmm5,xmm3 ; line 3: low byte = (first << 4) | second
+
+ MOVQ xmm2,r15 ; rebuild the 0x20 mask for the next pass
+ PUNPCKLQDQ xmm2,xmm2 ; all bytes 0b0010_0000
+
+ MOVDQA xmm0,xmm2
+ MOVDQA xmm1,xmm2
+
+
+
+ PAND xmm4,xmm6 ; line 2
+ PAND xmm5,xmm6 ; line 3
+
+ PACKUSWB xmm4,xmm5 ; 8+8 words -> 16 result bytes of lines 2+3
+
+
+
+
+ MOVDQU [rdi+1*16],xmm4 ; store the 16 result bytes of lines 2+3
+
+ MOVDQA xmm4,xmm2
+ MOVDQA xmm5,xmm2
+
+
+
+
+ add rdi,rcx ; add the number of processed output bytes
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXDECODE_LOOP ; NOTE(review): signed compare of two addresses
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0
+ je .LFINISH
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH
+
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adopt and condition for last round also
+ xor rax,rax ; rax = 0 marks the extra pass: rsi no longer advances
+ jmp .LHEXDECODE_LOOP
+
+.LFINISH:
+
+ mov rax,rdi
+ sub rax,r11 ; rax = number of output bytes
+ add rax,rax ; rax = number of valid processed input bytes = return value
+
+%ifdef __WIN__
+
+ MOVDQA xmm6 ,[rsp ]
+ MOVDQA xmm7 ,[rsp+1*16]
+ MOVDQA xmm8 ,[rsp+2*16]
+ MOVDQA xmm9 ,[rsp+3*16]
+ MOVDQA xmm10,[rsp+4*16]
+ MOVDQA xmm11,[rsp+5*16]
+ MOVDQA xmm12,[rsp+6*16]
+ MOVDQA xmm13,[rsp+7*16]
+ MOVDQA xmm14,[rsp+8*16]
+ MOVDQA xmm15,[rsp+9*16]
+
+%endif
+
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+
+
+;----------------------------------------------------------------------------------------------
+
+
+
+
+; ymm15 ; CONST ALL bytes 9
+; ymm14 ; running byte maximum of (input OR 0x20), initialised from ASCII_LETTER_LITTLE_F
+; ymm13 ; running byte minimum of the non-digit input bytes, initialised from ASCII_LETTER_LITTLE_A
+; ymm12 ; ORed flags of digit bytes whose low nibble is > 9 (initialised to zero)
+; ymm11 ; Input line 3
+; ymm10 ; Input line 2
+; ymm9 ; Input line 1
+; ymm8 ; Input Line 0
+; ymm7 ; CONST BITMASK_LOWER_HALF
+; ymm6 ; scratch: 0x10 bit mask, all ones, word low-byte mask
+; ymm5 ; line 3: digit flags, later merged words of line 3
+; ymm4 ; line 2: digit flags, later result bytes of lines 2+3
+; ymm3 ; scratch
+; ymm2 ; 0x20 bit mask at loop entry, scratch inside the loop
+; ymm1 ; line 1: digit flags, later merged words of line 1
+; ymm0 ; line 0: digit flags, later result bytes of lines 0+1
+; NOTE(review): ymm12-ymm14 are only accumulated; nothing in this routine evaluates them
+
+
+%define NINP_BYTES_PER_ROUND 4*32
+%define NINP_BITSHIFT 7
+; hex_decode_avx2: merge pairs of hex characters into bytes; 128 input characters / 64 output bytes per round
+hex_decode_avx2:
+; in : rdi = output buffer, rsi = input buffer, rdx = number of input characters (rcx, rdx, r8 under __WIN__); out: rax = 2 * bytes written
+ sub rsp,STACK_ADJ
+
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+ VMOVDQA [rsp ],xmm6
+ VMOVDQA [rsp+1*16],xmm7
+ VMOVDQA [rsp+2*16],xmm8
+ VMOVDQA [rsp+3*16],xmm9
+ VMOVDQA [rsp+4*16],xmm10
+ VMOVDQA [rsp+5*16],xmm11
+ VMOVDQA [rsp+6*16],xmm12
+ VMOVDQA [rsp+7*16],xmm13
+ VMOVDQA [rsp+8*16],xmm14
+ VMOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+; NOTE(review): round 0 (128 bytes) is loaded unconditionally, before the length in rdx is looked at
+;; initializer for QQ0 and QQ1
+
+ VMOVDQU ymm8,[rsi+0*32] ;
+ VMOVDQU ymm9,[rsi+1*32] ;
+
+ VMOVDQU ymm10,[rsi+2*32] ;
+ VMOVDQU ymm11,[rsi+3*32] ;
+
+;; initialize constants
+; NOTE(review): the 32-byte VMOVDQA loads assume every constant is 32 bytes long and 32-byte aligned - TODO confirm
+ VMOVDQA ymm15,[ALL_BYTES_9] ; p_23__ l3
+
+ VMOVDQA ymm14,[ASCII_LETTER_LITTLE_F] ; p_23__ l3
+
+ VMOVDQA ymm13,[ASCII_LETTER_LITTLE_A] ; p_23__ l3
+
+ VMOVDQA ymm7,[BITMASK_LOWER_HALF] ; 0b0000_1111
+
+ VPXOR ymm12,ymm12 ; all zero
+
+ VMOVDQA ymm2,[BITMASK_BIG_TO_LITTLE_ASCII] ; 0b0010_0000
+
+
+
+;; do page overshoot checks
+;; due to end condition handling not done here, we only process full rounds
+
+ mov rax,NINP_BYTES_PER_ROUND
+
+ add rdx,NINP_BYTES_PER_ROUND-1
+ shr rdx,NINP_BITSHIFT ;
+ shl rdx,NINP_BITSHIFT ; rdx number of bytes read in normal loop equiv to xxx full loops
+
+ mov r9,rdx ; requested number of elements, rounded up to full rounds
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+; NOTE(review): r11 is rcx+1 here, so the test below only fires when the rounded-up end is page aligned
+ mov r11,r9
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NINP_BYTES_PER_ROUND>>1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+ add rsi,rax ; add the number of processed array elements
+
+ align 32
+
+; Loop invariant: ymm8-ymm11 hold the 128 input characters of the current round, rsi points one round
+; ahead, ymm2 holds the 0x20 bit mask. The body always runs at least once and always loads the
+; following round before the end test is made.
+
+.LHEXDECODE_LOOP:
+
+ VMOVDQA ymm6,ymm2
+
+
+ VPSRAD ymm6,1 ; all bytes 0b0001_0000
+
+ VPOR ymm0,ymm2,ymm8 ; line 0 all letters set to little ASCII a-f
+ VPOR ymm1,ymm2,ymm9
+ VPOR ymm4,ymm2,ymm10
+ VPOR ymm5,ymm2,ymm11
+
+ VPMAXUB ymm14,ymm0
+ VPMAXUB ymm14,ymm1
+ VPMAXUB ymm14,ymm4
+ VPMAXUB ymm14,ymm5
+
+;max check finished
+
+ VPOR ymm0,ymm6 ; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx)
+ VPOR ymm1,ymm6
+ VPOR ymm4,ymm6
+ VPOR ymm5,ymm6
+
+ VPCMPEQD ymm6,ymm6 ; all ONE
+
+ VPCMPEQB ymm0,ymm8 ; set to all ONE when ASCI Digit (forced bit 0bxx11_xxxx equal to orig value)
+ VPCMPEQB ymm1,ymm9
+ VPCMPEQB ymm4,ymm10
+ VPCMPEQB ymm5,ymm11
+
+;start min check line0+1
+ VPANDN ymm2,ymm0,ymm6 ; set to all one for values NOT digits
+ VPANDN ymm3,ymm1,ymm6
+
+ VPAND ymm2,ymm8 ; set to orig value when NOT ASCI Digit
+ VPAND ymm3,ymm9
+
+ VPOR ymm2,ymm0 ; digit positions (zero so far) become all one, so they never lower the minimum
+ VPOR ymm3,ymm1
+
+ VPMINUB ymm13,ymm2
+ VPMINUB ymm13,ymm3
+
+
+;start min check line2+3
+
+
+ VPANDN ymm2,ymm4,ymm6 ; set to all one for values NOT digits
+ VPANDN ymm3,ymm5,ymm6
+
+ VPAND ymm2,ymm10 ; set to orig value when NOT ASCI Digit
+ VPAND ymm3,ymm11
+
+ VPOR ymm2,ymm4 ; digit positions (zero so far) become all one, so they never lower the minimum
+ VPOR ymm3,ymm5
+
+ VPMINUB ymm13,ymm2
+ VPMINUB ymm13,ymm3
+
+
+; start legal digit check
+
+ VPAND ymm2,ymm0,ymm8 ; set to orig value when ASCI Digit
+ VPAND ymm3,ymm1,ymm9
+
+ VPAND ymm2,ymm7 ; set to lower nibble value when ASCI Digit
+ VPAND ymm3,ymm7 ; FIX: was a second "VPAND ymm2,ymm7", which left line 1 unmasked
+
+ VPCMPGTB ymm2,ymm15 ; set to all ONE when ASCI Digit and value > 9
+ VPCMPGTB ymm3,ymm15
+
+ VPOR ymm12,ymm2 ; accumulate illegal chars like ASCII digit and value > 9
+ VPOR ymm12,ymm3
+
+ VPAND ymm2,ymm4,ymm10 ; FIX: lines 2+3 use their own digit flags (ymm4/ymm5), not the line 0/1 compare results
+ VPAND ymm3,ymm5,ymm11
+
+ VPAND ymm2,ymm7 ; set to lower nibble value when ASCI Digit
+ VPAND ymm3,ymm7
+
+
+ VPCMPGTB ymm2,ymm15 ; set to all ONE when ASCI Digit and value > 9
+ VPCMPGTB ymm3,ymm15
+
+ VPOR ymm12,ymm2
+ VPOR ymm12,ymm3
+
+; all (max, min and >9) checks finished
+
+
+;-- ; now convert: low nibble of every character, +9 for non-digits, then merge pairs
+ VPCMPEQD ymm6,ymm6 ; all ONE
+ VPSRLW ymm6,8 ; every word 0x00FF: mask for the low byte of a word
+
+ VPAND ymm2,ymm7,ymm8 ; all byte values only lower half (nibble) Line 0+1
+ VMOVDQU ymm8,[rsi+0*32] ; line 0 is consumed: load line 0 of the next round
+ VPAND ymm3,ymm7,ymm9
+ VMOVDQU ymm9,[rsi+1*32] ; line 1 of the next round
+
+ VPANDN ymm0,ymm15 ; put 9 to every element not DIGIT
+ VPANDN ymm1,ymm15
+
+ VPADDB ymm2,ymm0 ; add 9 to every nibble not DIGIT
+ VPADDB ymm3,ymm1
+
+ VPSRLW ymm0,ymm2,8 ; line 0: value of the second character of a pair into the low byte of the word
+
+ VPSLLW ymm2,4 ; line 0: value of the first character becomes the high nibble of the low byte
+
+ VPSRLW ymm1,ymm3,8 ; line 1: value of the second character of a pair into the low byte of the word
+
+ VPSLLW ymm3,4 ; line 1: value of the first character becomes the high nibble of the low byte
+
+ VPOR ymm0,ymm2 ; line 0: low byte = (first << 4) | second
+ VPOR ymm1,ymm3 ; line 1: low byte = (first << 4) | second
+
+ VPAND ymm0,ymm6 ; line 0
+ VPAND ymm1,ymm6 ; line 1
+
+ VPACKUSWB ymm0,ymm1 ; packs per 128-bit lane: qwords = [line0 lo, line1 lo, line0 hi, line1 hi]
+ VPERMQ ymm0,ymm0,0xD8 ; FIX: reorder qwords 0,2,1,3 so the result bytes are in input order
+; line 0 and 1 processed
+
+
+ VPAND ymm2,ymm7,ymm10 ; all byte values only lower half (nibble) Line 2+3
+ VMOVDQU ymm10,[rsi+2*32] ; line 2 of the next round
+ VPAND ymm3,ymm7,ymm11
+ VMOVDQU ymm11,[rsi+3*32] ; line 3 of the next round
+
+ VPANDN ymm4,ymm15 ; put 9 to every element not DIGIT
+ VPANDN ymm5,ymm15
+
+ VPADDB ymm2,ymm4 ; add 9 to every nibble not DIGIT
+ VPADDB ymm3,ymm5
+
+ add rsi,rax ; add the number of processed array elements
+
+ VMOVDQU [rdi+0*32],ymm0 ; store the 32 result bytes of lines 0+1
+
+
+ VPSRLW ymm4,ymm2,8 ; line 2: value of the second character of a pair into the low byte of the word
+
+ VPSLLW ymm2,4 ; line 2: value of the first character becomes the high nibble of the low byte
+
+ VPSRLW ymm5,ymm3,8 ; line 3: value of the second character of a pair into the low byte of the word
+
+ VPSLLW ymm3,4 ; line 3: value of the first character becomes the high nibble of the low byte
+
+
+ VPOR ymm4,ymm2 ; line 2: low byte = (first << 4) | second
+ VPOR ymm5,ymm3 ; line 3: low byte = (first << 4) | second
+
+ VMOVDQA ymm2,[BITMASK_BIG_TO_LITTLE_ASCII] ; reload the 0x20 mask for the next pass
+
+
+ VPAND ymm4,ymm6 ; line 2
+ VPAND ymm5,ymm6 ; line 3
+
+ VPACKUSWB ymm4,ymm5 ; packs per 128-bit lane: qwords = [line2 lo, line3 lo, line2 hi, line3 hi]
+ VPERMQ ymm4,ymm4,0xD8 ; FIX: reorder qwords 0,2,1,3 so the result bytes are in input order
+
+ VMOVDQU [rdi+1*32],ymm4 ; store the 32 result bytes of lines 2+3
+
+ add rdi,rcx ; add the number of processed output bytes
+
+
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXDECODE_LOOP ; NOTE(review): signed compare of two addresses
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0
+ je .LFINISH
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH
+
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adopt and condition for last round also
+ xor rax,rax ; rax = 0 marks the extra pass: rsi no longer advances
+ jmp .LHEXDECODE_LOOP
+
+.LFINISH:
+
+ mov rax,rdi
+ sub rax,r11 ; rax = number of output bytes
+ add rax,rax ; rax = number of valid processed input bytes = return value
+
+%ifdef __WIN__
+
+ VMOVDQA xmm6 ,[rsp ]
+ VMOVDQA xmm7 ,[rsp+1*16]
+ VMOVDQA xmm8 ,[rsp+2*16]
+ VMOVDQA xmm9 ,[rsp+3*16]
+ VMOVDQA xmm10,[rsp+4*16]
+ VMOVDQA xmm11,[rsp+5*16]
+ VMOVDQA xmm12,[rsp+6*16]
+ VMOVDQA xmm13,[rsp+7*16]
+ VMOVDQA xmm14,[rsp+8*16]
+ VMOVDQA xmm15,[rsp+9*16]
+
+%endif
+ VZEROUPPER ; clear the upper ymm halves (low 128 bits are kept) before returning to non-VEX code
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+
+;----------------------------------------------------------------------------------------------
+
+; k7 ; compare flags lower eq little f, initially all ONE
+; k6 ; compare flags greater eq little a, initially all ONE QL0,QL1
+; k5 ; CONST BITMASK_ZERO_ONE 0101010101010101 selecting lower half
+; k4 ; digit flags QL3
+; k3 ; digit flags QL2
+; k2 ; digit flags QL1
+; k1 ; digit flags QL0
+; k0 ;
+
+; zmm31 ; CONST ALL ZERO
+; zmm30 ; CONST BITMASK_LOWER_HALF
+; zmm29 ; CONST ASCII_0_OFFSET
+; zmm28 ; CONST ASCII_LITTLE_A_ADD
+; zmm27 ; CONST VPERM_DECODE_OFFSETS
+; zmm26 ; CONST ALL bytes 9
+; zmm25 ; Ouptut Line OH1 (Line 0 is stored in the source load regs zmm8-zmm23
+; zmm24 ; Output Line OL1
+; zmm23 ; Preload QL3
+; zmm22 ; Preload QL2
+; zmm21 ; Preload QL1
+; zmm20 ; Preload QL0
+; zmm19 ; Source Load QL3
+; zmm18 ; Source Load QL2
+; zmm17 ; Source Load QL1
+; zmm16 ; Source Load QL0
+
+; zmm15 ; QL3 little a
+; zmm14 ; QL2 little a
+; zmm13 ; QL1 little a
+; zmm12 ; QL0 little a
+; zmm11 ; QL3 masked for digit
+; zmm10 ; QL2 masked for digit
+; zmm9 ; QL1 masked for digit
+; zmm8 ; QL0 masked for digit
+; zmm7 ; lower nibble masked QL3
+; zmm6 ; lower nibble masked QL2
+; zmm5 ; lower nibble masked QL1
+; zmm4 ; lower nibble masked QL0
+; zmm3 ;
+; zmm2 ;
+; zmm1 ;
+; zmm0 ;
+
+
+%define NINP_BYTES_PER_ROUND 4*64
+%define NINP_BITSHIFT 8
+
+hex_decode_avx512bw:
+
+ sub rsp,STACK_ADJ
+
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+ VMOVDQA [rsp ],xmm6
+ VMOVDQA [rsp+1*16],xmm7
+ VMOVDQA [rsp+2*16],xmm8
+ VMOVDQA [rsp+3*16],xmm9
+ VMOVDQA [rsp+4*16],xmm10
+ VMOVDQA [rsp+5*16],xmm11
+ VMOVDQA [rsp+6*16],xmm12
+ VMOVDQA [rsp+7*16],xmm13
+ VMOVDQA [rsp+8*16],xmm14
+ VMOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+
+;; initializer for QQ0 and QQ1
+
+ VMOVDQU64 zmm20,[rsi+0*64] ; QQ0 p____5 l3+ QL4
+ VMOVDQU64 zmm21,[rsi+1*64] ; QQ0 p____5 l3+ QL5
+ VMOVDQU64 zmm22,[rsi+2*64] ; QQ0 p____5 l3+ QL6
+ VMOVDQU64 zmm23,[rsi+3*64] ; QQ0 p____5 l3+ QL7
+
+;; initialize constants
+
+ KXNORQ k7,k7,k7 ; all one
+
+ VPBROADCASTQ zmm31,[ALL_BYTES_9] ; p_23__ l3
+
+ VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF] ; p_23__ l3
+
+ KXNORQ k6,k6,k6 ; all one
+
+ VPBROADCASTQ zmm29,[ASCII_LETTER_LITTLE_F] ; p_23__ l3
+
+ VPBROADCASTQ zmm28,[ASCII_LETTER_LITTLE_A] ; p_23__ l3
+
+ KMOVQ k5,[BITMASK_ZERO_ONE]
+
+ VMOVDQA64 zmm27,[VPERM_DECODE_OFFSETS] ; p_23__ l3
+
+ VPBROADCASTQ zmm26,[BITMASK_BIG_TO_LITTLE_ASCII]; p_23__ l3
+
+ VPBROADCASTQ zmm25,[BITMASK_SELECT_DIGIT] ; p_23__ l3
+
+
+;; do page overshoot checks
+;; due to end condition handling not done here, we only process full rounds
+
+ mov rax,NINP_BYTES_PER_ROUND
+
+ add rdx,NINP_BYTES_PER_ROUND-1
+ shr rdx,NINP_BITSHIFT ;
+ shl rdx,NINP_BITSHIFT ; rdx number of bytes read in normal loop equiv to xxx full loops
+
+ mov r9,rdx ; exact requested number of elements to process
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+
+ mov r11,r9
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NINP_BYTES_PER_ROUND>>1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+; VPUNPCKHBW zmm1,zmm16,zmm31 ; QL0 p____5 l1 QQ0 [Lin0_LeftH] [00 HL_0 00 HL_1 ...]
+; VPUNPCKLBW zmm3,zmm16,zmm31 ; QL0 p____5 l1 QQ0 [Lin0_RghtH] [00 HL_0 00 HL_1 ...]
+
+ add rsi,rax ; add the number of processed array elements
+
+ align 32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+
+; Vector Port info AVX512
+; ----------------------------------------
+; VPShift p0 l1
+; VPMax/Min p0 l1
+; VPMUL p0 l5 ; with 2FMA-Units p05 (SKX,CLX etc.)
+; VPMOVB2M p0 l3
+; VPSUBUSB /SSB p0 l1
+
+; VPALIGNR p5 l1 ;Shift of n*8 bits!
+; VPERM p5 l3
+; VPERMI2x 1*p05+2*p5 l7 ; (l9 with flags)
+; VPCompare p5 l3-l4
+; VP Pack/Unpack p5 l1(SKX) l3(TGL)
+; VPSHUF p5 l1
+
+
+.LHEXDECODE_LOOP:
+
+ VMOVDQA64 zmm16,zmm20 ; QL0 copy preload to load
+ VMOVDQA64 zmm17,zmm21 ; QL1 copy preload to load
+ VPANDQ zmm8,zmm25,zmm20 ; QL0 set bitmask for digits only
+ VPMAXUB zmm0,zmm20,zmm21 ; QL0,QL1 max from both lines
+;;; VPCMPB k7{k7},zmm29,zmm20,2 ; QL0 compare lower_eq little f
+
+ VMOVDQA64 zmm18,zmm22 ; QL2
+ VMOVDQA64 zmm19,zmm23 ; QL3
+ VPCMPEQB k1,zmm8,zmm20 ; QL0 compare for is digit
+ VPANDQ zmm9,zmm25,zmm21 ; QL1 set bitmask for digits only
+
+ VMOVDQU64 zmm20,[rsi+0*64] ; QQ0 p____5 l3+ QL4
+ VMOVDQU64 zmm21,[rsi+1*64] ; QL1 p____5 l3+ QL5
+
+ VPANDQ zmm10,zmm25,zmm18 ; QL2 set bitmask for digits only
+ VPCMPEQB k2,zmm9,zmm17 ; QL1 compare for is digit
+
+ VMOVDQU64 zmm22,[rsi+2*64] ; QQ0 p____5 l3+ QL6
+ VMOVDQU64 zmm23,[rsi+3*64] ; QQ0 p____5 l3+ QL7
+
+ VPANDQ zmm11,zmm25,zmm19 ; QL3 set bitmask for digits only
+ VPCMPEQB k3,zmm10,zmm18 ; QL2 compare for is digit
+
+ VPMAXUB zmm1,zmm18,zmm19 ; QL2,QL3 max from both lines
+;;; VPCMPB k7{k7},zmm29,zmm17,2 ; QL1 compare lower_eq little f
+ VPCMPEQB k4,zmm11,zmm19 ; QL2 compare for is digit
+
+ add rsi,rax ; add the number of processed array elements
+
+ VPORQ zmm12,zmm26,zmm16 ; QL0 set bit for little a
+ VPANDQ zmm4,zmm30,zmm16 ; QL0 bitmask lower nibble
+
+ VPORQ zmm13,zmm26,zmm17 ; QL1 set bit for little a
+ VPANDQ zmm5,zmm30,zmm17 ; QL1 bitmask lower nibble
+
+ VPMAXUB zmm0,zmm0,zmm1 ; QL0,QL1,QL2,QL3 max from 4 lines
+ VPADDB zmm4,zmm4,zmm31 ; QL0 add 9
+
+ VPORQ zmm14,zmm26,zmm18 ; QL2 set bit for little a
+ VPANDQ zmm6,zmm30,zmm18 ; QL2 bitmask lower nibble
+
+ VPANDQ zmm7,zmm30,zmm19 ; QL3 bitmask lower nibble
+ VPCMPB k7{k7},zmm29,zmm0,2 ; QL0,QL1,QL2,QL3 compare lower_eq little f
+
+ VPADDB zmm5,zmm5,zmm31 ; QL1 add 9
+ VPORQ zmm15,zmm26,zmm19 ; QL3 set bit for little a
+
+ VPADDB zmm6,zmm6,zmm31 ; QL2 add 9
+ VPADDB zmm7,zmm7,zmm31 ; QL3 add 9
+
+ VPSUBB zmm4{k1},zmm4,zmm31 ; QL0 sub 9 for digits
+ VPSUBB zmm5{k2},zmm5,zmm31 ; QL1 sub 9 for digits
+ VPSUBB zmm6{k3},zmm6,zmm31 ; QL2 sub 9 for digits
+ VPSUBB zmm7{k4},zmm7,zmm31 ; QL3 sub 9 for digits
+
+;
+
+ VPSRLW zmm0,zmm4,8 ; QL0 lower nibble-value
+ VPSLLW zmm4,zmm4,2 ; QL0 upper nibble_value
+
+ VPADDB zmm0{k5}{z},zmm0,zmm4 ; QL0 values in lower byte of dword
+
+ VPSRLW zmm1,zmm5,8 ; QL1 lower nibble-value
+ VPSLLW zmm5,zmm5,2 ; QL1 upper nibble_value
+
+ VPADDB zmm1{k5}{z},zmm1,zmm5 ; QL1 values in lower byte of dword
+
+ VPACKUSWB zmm0,zmm0,zmm1 ; QL0 vlues in single bytes
+
+ VMOVDQA64 [rdi+0*64],zmm0 ;DEBUG ########### ; QL0 p____4_ p____49 l4 [Lin0 Left Half] Store Hexdump
+
+ VPERMQ zmm1,zmm27,zmm0 ; QL0,QL1 byte values in right order
+
+;
+
+ VPSRLW zmm2,zmm6,8 ; QL2 lower nibble-value
+ VPSLLW zmm6,zmm6,2 ; QL2 upper nibble_value
+
+ VPADDB zmm2{k5}{z},zmm2,zmm6 ; QL2 values in lower byte of dword
+
+ VPSRLW zmm3,zmm7,8 ; QL3 lower nibble-value
+ VPSLLW zmm7,zmm7,2 ; QL3 upper nibble_value
+
+ VPADDB zmm3{k5}{z},zmm3,zmm7 ; QL3 values in lower byte of dword
+
+ VPACKUSWB zmm2,zmm2,zmm3 ; QL2,QL3 vlues in single bytes
+
+ VMOVDQA64 [rdi+1*64],zmm2 ;DEBUG ############## ; QL0 p____4_ p____49 l4 [Lin0 Rght Half] Store Hexdump
+
+ VPERMQ zmm3,zmm27,zmm2 ; QL2,QL3 byte values in right order
+
+; -------- MISSING check for little a
+
+ VMOVDQA64 [rdi+0*64],zmm1 ; QL0 p____4_ p____49 l4 [Lin0 Left Half] Store Hexdump
+ VMOVDQA64 [rdi+1*64],zmm3 ; QL0 p____4_ p____49 l4 [Lin0 Rght Half] Store Hexdump
+
+ add rdi,rcx ; add the number of processed output bytes
+
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXDECODE_LOOP
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0
+ je .LFINISH
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH
+
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adopt and condition for last round also
+ xor rax,rax
+ jmp .LHEXDECODE_LOOP
+
+.LFINISH:
+
+ mov rax,rdi
+ sub rax,r11 ; rax = number of output bytes
+ add rax,rax ; rax = number of valid processed input bytes = return value
+
+%ifdef __WIN__
+
+ VMOVDQA xmm6 ,[rsp ]
+ VMOVDQA xmm7 ,[rsp+1*16]
+ VMOVDQA xmm8 ,[rsp+2*16]
+ VMOVDQA xmm9 ,[rsp+3*16]
+ VMOVDQA xmm10,[rsp+4*16]
+ VMOVDQA xmm11,[rsp+5*16]
+ VMOVDQA xmm12,[rsp+6*16]
+ VMOVDQA xmm13,[rsp+7*16]
+ VMOVDQA xmm14,[rsp+8*16]
+ VMOVDQA xmm15,[rsp+9*16]
+
+%endif
+
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+
+;----------------------------------------------------------------------------------------------
+
+
+; ymm15 ; Source Load QL7
+; ymm14 ; Source Load QL6
+; ymm13 ; Source Load QL5
+; ymm12 ; Source Load QL4
+; ymm11 ; Source Load QL3
+; ymm10 ; Source Load QL2
+; ymm9 ; Source Load QL1
+; ymm8 ; Source Load QL0
+
+; ymm7 ; CONST ENCODE_SHUFFLE_TO_HEX
+; ymm6 ; CONST BITMASK_LOWER_HALF
+; ymm5 ; Shift temp for High nibble 1
+; ymm4 ; Shift temp for High nibble 0
+; ymm3 ; Temp3
+; ymm2 ; Temp2
+; ymm1 ; Temp1
+; ymm0 ; Temp0
+
+
+%define NINP_BYTES_PER_ROUND 8*32
+%define NINP_BITSHIFT 8
+; hex_encode_avx2: encodes NINP_BYTES_PER_ROUND input bytes per loop round into twice as many output bytes.
+hex_encode_avx2:
+; Arguments are used as rdi=output, rsi=input, rdx=count (remapped below for __WIN__); rax returns the processed input bytes.
+ sub rsp,STACK_ADJ
+
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+; Windows build: spill xmm6-xmm15, then move the arguments from rcx/rdx/r8 into rdi/rsi/rdx
+ VMOVDQA [rsp ],xmm6
+ VMOVDQA [rsp+1*16],xmm7
+ VMOVDQA [rsp+2*16],xmm8
+ VMOVDQA [rsp+3*16],xmm9
+ VMOVDQA [rsp+4*16],xmm10
+ VMOVDQA [rsp+5*16],xmm11
+ VMOVDQA [rsp+6*16],xmm12
+ VMOVDQA [rsp+7*16],xmm13
+ VMOVDQA [rsp+8*16],xmm14
+ VMOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+
+;; Loading QL0-QL3, prefetching QL4-QL7
+; NOTE(review): these eight loads read [rsi .. rsi+8*32-1] before the page checks below run - confirm the caller guarantees this
+ VPERMQ ymm8, [rsi+0*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL0
+ VPERMQ ymm9, [rsi+1*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL1
+ VPERMQ ymm10,[rsi+2*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL2
+ VPERMQ ymm11,[rsi+3*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL3
+
+ VPERMQ ymm12,[rsi+4*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL4
+ VPERMQ ymm13,[rsi+5*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL5
+ VPERMQ ymm14,[rsi+6*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL6
+ VPERMQ ymm15,[rsi+7*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL7
+
+;; initialize constants
+; ymm7 is the VPSHUFB lookup table and ymm6 the VPAND mask used on every unpacked register (both defined elsewhere)
+ VMOVDQA ymm7,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3
+
+ VMOVDQA ymm6,[BITMASK_LOWER_HALF] ; p_23__ l3
+
+;; do page overshoot checks
+; rdx is shortened by one round for each check that finds the rounded-up read end on another page than the last requested byte
+ mov rax,NINP_BYTES_PER_ROUND ; rax = input bytes consumed per round
+
+
+ mov r9,rdx ; exact requested number of elements to process
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+
+ lea r8,[rdx+NINP_BYTES_PER_ROUND-1] ; shr/shl pair below rounds the count up to whole rounds
+ shr r8,NINP_BITSHIFT ; number of loops
+ shl r8,NINP_BITSHIFT
+ add r8,rsi ; r8 address of last byte+1 read in complete loops
+ add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot
+
+ mov r11,r8
+
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+ shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte read after normal round
+
+ cmp rcx,r8 ; stay on same page
+ je .LSAME_PAGE_IN_ROUND
+ sub rdx,rax ; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number; equals r8 while the add to r11 above stays disabled
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NINP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+; the loop expects ymm4/ymm5 (shifted QL0/QL1) and ymm0/ymm1 (unpacked QL0) to be prepared on entry
+ VPSRLQ ymm4,ymm8,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+ VPSRLQ ymm5,ymm9,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+
+ VPUNPCKLBW ymm0,ymm4,ymm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPUNPCKHBW ymm1,ymm4,ymm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ add rsi,rax ; add the number of processed array elements
+
+
+ align 32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+
+.LHEXENCODE_LOOP:
+; Each round stores 16 x 32 output bytes at rdi and reloads 8 x 32 input bytes from rsi for the following round.
+;; process unpacked AAA in YMM0-YMM5 and YMM8-YMM11, UNPCK BBB to YMM0-YMM1, PreLoad AAA to YMM8-YMM11
+;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
+ VPUNPCKLBW ymm2,ymm5,ymm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+ VPSRLQ ymm4,ymm10,4 ; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte
+ VPUNPCKHBW ymm3,ymm5,ymm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+ VPAND ymm0,ymm0,ymm6 ; AAA RL00 mask lower nibble
+
+ VPSRLQ ymm5,ymm11,4 ; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte
+ VPUNPCKLBW ymm8,ymm4,ymm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPAND ymm1,ymm1,ymm6 ; AAA RL01 mask lower nibble
+ VPUNPCKHBW ymm9,ymm4,ymm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPAND ymm2,ymm2,ymm6 ; AAA RL02 mask lower nibble
+ VPUNPCKLBW ymm10,ymm5,ymm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPAND ymm3,ymm3,ymm6 ; AAA RL03 mask lower nibble
+ VPUNPCKHBW ymm11,ymm5,ymm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSHUFB ymm0,ymm7,ymm0 ; AAA RL00 shuffle_to_hex_digits
+ VPAND ymm8,ymm8,ymm6 ; AAA RL04 mask lower nibble
+ VPSHUFB ymm1,ymm7,ymm1 ; AAA RL01 shuffle_to_hex_digits
+ VPAND ymm9,ymm9,ymm6 ; AAA RL05 mask lower nibble
+
+ VPSHUFB ymm2,ymm7,ymm2 ; AAA RL02 shuffle_to_hex_digits
+ VMOVDQU [rdi+0*32],ymm0 ; AAA RL00 Store Hexdump
+ VPAND ymm10,ymm10,ymm6 ; AAA RL06 mask lower nibble
+
+ VPSHUFB ymm3,ymm7,ymm3 ; AAA RL03 shuffle_to_hex_digits
+ VMOVDQU [rdi+1*32],ymm1 ; AAA RL01 Store Hexdump
+ VPAND ymm11,ymm11,ymm6 ; AAA RL07 mask lower nibble
+
+ VPSHUFB ymm8,ymm7,ymm8 ; AAA RL04 shuffle_to_hex_digits
+ VPSRLQ ymm4,ymm12,4 ; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte
+ VMOVDQU [rdi+2*32],ymm2 ; AAA RL02 Store Hexdump
+ VPSHUFB ymm9,ymm7,ymm9 ; AAA RL05 shuffle_to_hex_digits
+ VPSRLQ ymm5,ymm13,4 ; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte
+ VMOVDQU [rdi+3*32],ymm3 ; AAA RL03 Store Hexdump
+
+ VPSHUFB ymm10,ymm7,ymm10 ; AAA RL06 shuffle_to_hex_digits
+ VMOVDQU [rdi+4*32],ymm8 ; AAA RL04 Store Hexdump
+ VPERMQ ymm8, [rsi+0*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL0
+ VMOVDQU [rdi+5*32],ymm9 ; AAA RL05 Store Hexdump
+ VPERMQ ymm9, [rsi+1*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL1
+
+ VPSHUFB ymm11,ymm7,ymm11 ; AAA RL07 shuffle_to_hex_digits
+ VMOVDQU [rdi+6*32],ymm10 ; AAA RL06 Store Hexdump
+ VPUNPCKLBW ymm0,ymm4,ymm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPERMQ ymm10,[rsi+2*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL2
+
+ VMOVDQU [rdi+7*32],ymm11 ; AAA RL07 Store Hexdump
+ VPERMQ ymm11,[rsi+3*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL3
+ VPUNPCKHBW ymm1,ymm4,ymm12 ; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; process unpacked BBB in YMM0-YMM5 and YMM12-YMM15, UNPCK AAA to YMM0-YMM1, PreLoad BBB to YMM12-YMM15
+;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
+ VPUNPCKLBW ymm2,ymm5,ymm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+ VPSRLQ ymm4,ymm14,4 ; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte
+ VPUNPCKHBW ymm3,ymm5,ymm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+ VPAND ymm0,ymm0,ymm6 ; BBB RL08 mask lower nibble
+
+ VPSRLQ ymm5,ymm15,4 ; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte
+ VPUNPCKLBW ymm12,ymm4,ymm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPAND ymm1,ymm1,ymm6 ; BBB RL09 mask lower nibble
+ VPUNPCKHBW ymm13,ymm4,ymm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPAND ymm2,ymm2,ymm6 ; BBB RL10 mask lower nibble
+ VPUNPCKLBW ymm14,ymm5,ymm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPAND ymm3,ymm3,ymm6 ; BBB RL11 mask lower nibble
+ VPUNPCKHBW ymm15,ymm5,ymm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSHUFB ymm0,ymm7,ymm0 ; BBB RL08 shuffle_to_hex_digits
+ VPAND ymm12,ymm12,ymm6 ; BBB RL12 mask lower nibble
+ VPSHUFB ymm1,ymm7,ymm1 ; BBB RL09 shuffle_to_hex_digits
+ VPAND ymm13,ymm13,ymm6 ; BBB RL13 mask lower nibble
+
+ VPSHUFB ymm2,ymm7,ymm2 ; BBB RL10 shuffle_to_hex_digits
+ VMOVDQU [rdi+8*32],ymm0 ; BBB RL08 Store Hexdump
+ VPAND ymm14,ymm14,ymm6 ; BBB RL14 mask lower nibble
+
+ VPSHUFB ymm3,ymm7,ymm3 ; BBB RL11 shuffle_to_hex_digits
+ VMOVDQU [rdi+9*32],ymm1 ; BBB RL09 Store Hexdump
+ VPAND ymm15,ymm15,ymm6 ; BBB RL15 mask lower nibble
+
+ VPSHUFB ymm12,ymm7,ymm12 ; BBB RL12 shuffle_to_hex_digits
+ VPSRLQ ymm4,ymm8,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+ VMOVDQU [rdi+10*32],ymm2 ; BBB RL10 Store Hexdump
+ VPSHUFB ymm13,ymm7,ymm13 ; BBB RL13 shuffle_to_hex_digits
+ VPSRLQ ymm5,ymm9,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+ VMOVDQU [rdi+11*32],ymm3 ; BBB RL11 Store Hexdump
+
+ VPSHUFB ymm14,ymm7,ymm14 ; BBB RL14 shuffle_to_hex_digits
+ VMOVDQU [rdi+12*32],ymm12 ; BBB RL12 Store Hexdump
+ VPERMQ ymm12, [rsi+4*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL4
+ VMOVDQU [rdi+13*32],ymm13 ; BBB RL13 Store Hexdump
+ VPERMQ ymm13, [rsi+5*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL5
+
+ VPSHUFB ymm15,ymm7,ymm15 ; BBB RL15 shuffle_to_hex_digits
+ VMOVDQU [rdi+14*32],ymm14 ; BBB RL14 Store Hexdump
+ VPUNPCKLBW ymm0,ymm4,ymm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPERMQ ymm14,[rsi+6*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL6
+
+ VMOVDQU [rdi+15*32],ymm15 ; BBB RL15 Store Hexdump
+ VPERMQ ymm15,[rsi+7*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL7
+
+ add rsi,rax ; add the number of processed array elements
+
+ VPUNPCKHBW ymm1,ymm4,ymm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ add rdi,rcx ; add the number of processed output bytes
+
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXENCODE_LOOP
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0 ; rax is cleared below before the extra round, so 0 means it already ran
+ je .LFINISH_EXTRA
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH_NORMAL
+
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adapt end condition for last round also
+ xor rax,rax ; with rax = 0 the extra round no longer advances rsi
+ jmp .LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+ add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes
+ jmp .LFINISH
+
+.LFINISH_NORMAL:
+ sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+ ; r12 = address of requested input bytes+1 (r9 holds the same value plus one round)
+ ; rsi = address of processed input bytes+1
+ ; now get the minimum of r12,rsi to rax
+;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
+;; sub r9,rax
+
+ mov rax,r12
+ cmp rsi,r12 ; get min from r12 (address of requested input) and rsi (address of done input)
+
+ jge .LCALC_PROCESSED_BYTES
+ mov rax,rsi ; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+ sub rax,r10 ; sub the input buffer start address
+ ; rax = number of valid processed input bytes = return value
+
+ cmp rsi,rdx ; compare rdx (loop end pointer) and rsi (address of done input)
+ je .LNO_ZERO_OUT
+; NOTE(review): the store that consumed r15/ymm0 is commented out (;ZERO below), so the values computed here are unused
+ mov r15,rax ; number of elements to process
+
+ shl r15,1 ; number of output bytes
+
+ add r15,r11 ; pointer to next byte after full valid output buffer
+
+
+ VPXOR ymm0,ymm0,ymm0 ; all zero
+;ZERO VMOVDQU [r15],ymm0 ; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+ VMOVDQA xmm6 ,[rsp ]
+ VMOVDQA xmm7 ,[rsp+1*16]
+ VMOVDQA xmm8 ,[rsp+2*16]
+ VMOVDQA xmm9 ,[rsp+3*16]
+ VMOVDQA xmm10,[rsp+4*16]
+ VMOVDQA xmm11,[rsp+5*16]
+ VMOVDQA xmm12,[rsp+6*16]
+ VMOVDQA xmm13,[rsp+7*16]
+ VMOVDQA xmm14,[rsp+8*16]
+ VMOVDQA xmm15,[rsp+9*16]
+
+%endif
+; restore the registers spilled in the prologue; r14 is saved and restored but not otherwise referenced in this function
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+
+ ret
+
+;----------------------------------------------------------------------------------------------
+
+
+; xmm15 ; Source Load QL7
+; xmm14 ; Source Load QL6
+; xmm13 ; Source Load QL5
+; xmm12 ; Source Load QL4
+; xmm11 ; Source Load QL3
+; xmm10 ; Source Load QL2
+; xmm9 ; Source Load QL1
+; xmm8 ; Source Load QL0
+
+; xmm7 ; CONST ENCODE_SHUFFLE_TO_HEX
+; xmm6 ; CONST BITMASK_LOWER_HALF
+; xmm5 ; Shift temp for High nibble 1
+; xmm4 ; Shift temp for High nibble 0
+; xmm3 ; Temp3
+; xmm2 ; Temp2
+; xmm1 ; Temp1
+; xmm0 ; Temp0
+
+
+
+%define NINP_BYTES_PER_ROUND 8*16
+%define NINP_BITSHIFT 7
+; hex_encode_ssse3: encodes NINP_BYTES_PER_ROUND input bytes per loop round into twice as many output bytes.
+; Arguments are used as rdi=output, rsi=input, rdx=count (remapped below for __WIN__); rax returns the processed input bytes.
+hex_encode_ssse3:
+
+ sub rsp,STACK_ADJ
+
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+; Windows build: spill xmm6-xmm15, then move the arguments from rcx/rdx/r8 into rdi/rsi/rdx
+ MOVDQA [rsp ],xmm6
+ MOVDQA [rsp+1*16],xmm7
+ MOVDQA [rsp+2*16],xmm8
+ MOVDQA [rsp+3*16],xmm9
+ MOVDQA [rsp+4*16],xmm10
+ MOVDQA [rsp+5*16],xmm11
+ MOVDQA [rsp+6*16],xmm12
+ MOVDQA [rsp+7*16],xmm13
+ MOVDQA [rsp+8*16],xmm14
+ MOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+
+;; Loading QL0-QL3, prefetching QL4-QL7
+; NOTE(review): these eight loads read [rsi .. rsi+8*16-1] before the page checks below run - confirm the caller guarantees this
+ MOVDQU xmm8, [rsi+0*16] ; AAA p_____5 p1____5 l3+ QL0
+ MOVDQU xmm9, [rsi+1*16] ; AAA p_____5 p1____5 l3+ QL1
+ MOVDQU xmm10,[rsi+2*16] ; AAA p_____5 p1____5 l3+ QL2
+ MOVDQU xmm11,[rsi+3*16] ; AAA p_____5 p1____5 l3+ QL3
+
+ MOVDQU xmm12,[rsi+4*16] ; BBB p_____5 p1____5 l3+ QL4
+ MOVDQU xmm13,[rsi+5*16] ; BBB p_____5 p1____5 l3+ QL5
+ MOVDQU xmm14,[rsi+6*16] ; BBB p_____5 p1____5 l3+ QL6
+ MOVDQU xmm15,[rsi+7*16] ; BBB p_____5 p1____5 l3+ QL7
+
+;; initialize constants
+; xmm7 is the PSHUFB lookup table and xmm6 the PAND mask used on every unpacked register (both defined elsewhere)
+ MOVDQA xmm7,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3
+
+ MOVDQA xmm6,[BITMASK_LOWER_HALF] ; p_23__ l3
+
+;; do page overshoot checks
+; rdx is shortened by one round for each check that finds the rounded-up read end on another page than the last requested byte
+ mov rax,NINP_BYTES_PER_ROUND ; rax = input bytes consumed per round
+
+
+ mov r9,rdx ; exact requested number of elements to process
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+
+ lea r8,[rdx+NINP_BYTES_PER_ROUND-1] ; shr/shl pair below rounds the count up to whole rounds
+ shr r8,NINP_BITSHIFT ; number of loops
+ shl r8,NINP_BITSHIFT
+ add r8,rsi ; r8 address of last byte+1 read in complete loops
+ add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot
+
+ mov r11,r8
+
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+ shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte read after normal round
+
+ cmp rcx,r8 ; stay on same page
+ je .LSAME_PAGE_IN_ROUND
+ sub rdx,rax ; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number; equals r8 while the add to r11 above stays disabled
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NINP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+; the loop expects xmm4/xmm5 (shifted QL0/QL1) and xmm0/xmm1 (unpacked QL0) to be prepared on entry
+ MOVDQA xmm4,xmm8
+ PSRLQ xmm4,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+ MOVDQA xmm5,xmm9
+ PSRLQ xmm5,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+
+ MOVDQA xmm0,xmm4
+ PUNPCKLBW xmm0,xmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ MOVDQA xmm1,xmm4
+ PUNPCKHBW xmm1,xmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ add rsi,rax ; add the number of processed array elements
+
+
+ align 32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+.LHEXENCODE_LOOP:
+; Each round stores 16 x 16 output bytes at rdi and reloads 8 x 16 input bytes from rsi for the following round.
+;; process unpacked AAA in XMM0-XMM5 and XMM8-XMM11, UNPCK BBB to XMM0-XMM1, PreLoad AAA to XMM8-XMM11
+;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
+ MOVDQA xmm2,xmm5
+ PUNPCKLBW xmm2,xmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+ MOVDQA xmm4,xmm10
+ PSRLQ xmm4,4 ; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte
+ MOVDQA xmm3,xmm5
+ PUNPCKHBW xmm3,xmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+ PAND xmm0,xmm6 ; AAA RL00 mask lower nibble
+
+ MOVDQA xmm5,xmm11
+ PSRLQ xmm5,4 ; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte
+ MOVDQA xmm8,xmm4
+ PUNPCKLBW xmm8,xmm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ PAND xmm1,xmm6 ; AAA RL01 mask lower nibble
+ MOVDQA xmm9,xmm4
+ PUNPCKHBW xmm9,xmm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ PAND xmm2,xmm6 ; AAA RL02 mask lower nibble
+ MOVDQA xmm10,xmm5
+ PUNPCKLBW xmm10,xmm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ PAND xmm3,xmm6 ; AAA RL03 mask lower nibble
+ MOVDQA xmm4,xmm5
+ PUNPCKHBW xmm4,xmm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ MOVDQA xmm11,xmm4
+; PSHUFB overwrites its first operand, so the table in xmm7 is copied into a scratch register before every lookup
+ MOVDQA xmm4,xmm7
+ PSHUFB xmm4,xmm0 ; AAA RL00 shuffle_to_hex_digits
+ PAND xmm8,xmm6 ; AAA RL04 mask lower nibble
+ MOVDQA xmm5,xmm7
+ PSHUFB xmm5,xmm1 ; AAA RL01 shuffle_to_hex_digits
+ PAND xmm9,xmm6 ; AAA RL05 mask lower nibble
+
+ MOVDQA xmm0,xmm7
+ PSHUFB xmm0,xmm2 ; AAA RL02 shuffle_to_hex_digits
+ MOVDQU [rdi+0*16],xmm4 ; AAA RL00 Store Hexdump
+ PAND xmm10,xmm6 ; AAA RL06 mask lower nibble
+
+ MOVDQA xmm1,xmm7
+ PSHUFB xmm1,xmm3 ; AAA RL03 shuffle_to_hex_digits
+ MOVDQU [rdi+1*16],xmm5 ; AAA RL01 Store Hexdump
+ PAND xmm11,xmm6 ; AAA RL07 mask lower nibble
+
+ MOVDQA xmm2,xmm7
+ PSHUFB xmm2,xmm8 ; AAA RL04 shuffle_to_hex_digits
+ MOVDQA xmm4,xmm12
+ PSRLQ xmm4,4 ; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte
+ MOVDQU [rdi+2*16],xmm0 ; AAA RL02 Store Hexdump
+ MOVDQA xmm3,xmm7
+ PSHUFB xmm3,xmm9 ; AAA RL05 shuffle_to_hex_digits
+ MOVDQA xmm5,xmm13
+ PSRLQ xmm5,4 ; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte
+ MOVDQU [rdi+3*16],xmm1 ; AAA RL03 Store Hexdump
+
+ MOVDQA xmm0,xmm7
+ PSHUFB xmm0,xmm10 ; AAA RL06 shuffle_to_hex_digits
+ MOVDQU [rdi+4*16],xmm2 ; AAA RL04 Store Hexdump
+ MOVDQU xmm8, [rsi+0*16] ; AAA p_____5 p1____5 l3+ QL0
+ MOVDQU [rdi+5*16],xmm3 ; AAA RL05 Store Hexdump
+ MOVDQU xmm9, [rsi+1*16] ; AAA p_____5 p1____5 l3+ QL1
+
+ MOVDQA xmm1,xmm7
+ PSHUFB xmm1,xmm11 ; AAA RL07 shuffle_to_hex_digits
+ MOVDQU [rdi+6*16],xmm0 ; AAA RL06 Store Hexdump
+ MOVDQA xmm0,xmm4
+ PUNPCKLBW xmm0,xmm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ MOVDQU xmm10,[rsi+2*16] ; AAA p_____5 p1____5 l3+ QL2
+
+ MOVDQU [rdi+7*16],xmm1 ; AAA RL07 Store Hexdump
+ MOVDQU xmm11,[rsi+3*16] ; AAA p_____5 p1____5 l3+ QL3
+ MOVDQA xmm1,xmm4
+ PUNPCKHBW xmm1,xmm12 ; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; process unpacked BBB in XMM0-XMM5 and XMM12-XMM15, UNPCK AAA to XMM0-XMM1, PreLoad BBB to XMM12-XMM15
+;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
+ MOVDQA xmm2,xmm5
+ PUNPCKLBW xmm2,xmm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+ MOVDQA xmm4,xmm14
+ PSRLQ xmm4,4 ; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte
+ MOVDQA xmm3,xmm5
+ PUNPCKHBW xmm3,xmm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+ PAND xmm0,xmm6 ; BBB RL08 mask lower nibble
+
+ MOVDQA xmm5,xmm15
+ PSRLQ xmm5,4 ; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte
+ MOVDQA xmm12,xmm4
+ PUNPCKLBW xmm12,xmm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ PAND xmm1,xmm6 ; BBB RL09 mask lower nibble
+ MOVDQA xmm13,xmm4
+ PUNPCKHBW xmm13,xmm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ PAND xmm2,xmm6 ; BBB RL10 mask lower nibble
+ MOVDQA xmm14,xmm5
+ PUNPCKLBW xmm14,xmm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ PAND xmm3,xmm6 ; BBB RL11 mask lower nibble
+ MOVDQA xmm4,xmm5
+ PUNPCKHBW xmm4,xmm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ MOVDQA xmm15,xmm4
+
+ MOVDQA xmm4,xmm7
+ PSHUFB xmm4,xmm0 ; BBB RL08 shuffle_to_hex_digits
+ PAND xmm12,xmm6 ; BBB RL12 mask lower nibble
+ MOVDQA xmm5,xmm7
+ PSHUFB xmm5,xmm1 ; BBB RL09 shuffle_to_hex_digits
+ PAND xmm13,xmm6 ; BBB RL13 mask lower nibble
+
+ MOVDQA xmm0,xmm7
+ PSHUFB xmm0,xmm2 ; BBB RL10 shuffle_to_hex_digits
+ MOVDQU [rdi+8*16],xmm4 ; BBB RL08 Store Hexdump
+ PAND xmm14,xmm6 ; BBB RL14 mask lower nibble
+
+ MOVDQA xmm1,xmm7
+ PSHUFB xmm1,xmm3 ; BBB RL11 shuffle_to_hex_digits
+ MOVDQU [rdi+9*16],xmm5 ; BBB RL09 Store Hexdump
+ PAND xmm15,xmm6 ; BBB RL15 mask lower nibble
+
+ MOVDQA xmm2,xmm7
+ PSHUFB xmm2,xmm12 ; BBB RL12 shuffle_to_hex_digits
+ MOVDQA xmm4,xmm8
+ PSRLQ xmm4,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+ MOVDQU [rdi+10*16],xmm0 ; BBB RL10 Store Hexdump
+ MOVDQA xmm3,xmm7
+ PSHUFB xmm3,xmm13 ; BBB RL13 shuffle_to_hex_digits
+ MOVDQA xmm5,xmm9
+ PSRLQ xmm5,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+ MOVDQU [rdi+11*16],xmm1 ; BBB RL11 Store Hexdump
+
+ MOVDQA xmm0,xmm7
+ PSHUFB xmm0,xmm14 ; BBB RL14 shuffle_to_hex_digits
+ MOVDQU [rdi+12*16],xmm2 ; BBB RL12 Store Hexdump
+ MOVDQU xmm12, [rsi+4*16] ; BBB p_____5 p1____5 l3+ QL4
+ MOVDQU [rdi+13*16],xmm3 ; BBB RL13 Store Hexdump
+ MOVDQU xmm13, [rsi+5*16] ; BBB p_____5 p1____5 l3+ QL5
+
+ MOVDQA xmm1,xmm7
+ PSHUFB xmm1,xmm15 ; BBB RL15 shuffle_to_hex_digits
+ MOVDQU [rdi+14*16],xmm0 ; BBB RL14 Store Hexdump
+ MOVDQA xmm0,xmm4
+ PUNPCKLBW xmm0,xmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ MOVDQU xmm14,[rsi+6*16] ; BBB p_____5 p1____5 l3+ QL6
+
+ MOVDQU [rdi+15*16],xmm1 ; BBB RL15 Store Hexdump
+ MOVDQU xmm15,[rsi+7*16] ; BBB p_____5 p1____5 l3+ QL7
+
+ add rsi,rax ; add the number of processed array elements
+
+ MOVDQA xmm1,xmm4
+ PUNPCKHBW xmm1,xmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ add rdi,rcx ; add the number of processed output bytes
+
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXENCODE_LOOP
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0 ; rax is cleared below before the extra round, so 0 means it already ran
+ je .LFINISH_EXTRA
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH_NORMAL
+
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adapt end condition for last round also
+ xor rax,rax ; with rax = 0 the extra round no longer advances rsi
+ jmp .LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+ add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes
+ jmp .LFINISH
+
+.LFINISH_NORMAL:
+ sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+ ; r12 = address of requested input bytes+1 (r9 holds the same value plus one round)
+ ; rsi = address of processed input bytes+1
+ ; now get the minimum of r12,rsi to rax
+;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
+;; sub r9,rax
+
+ mov rax,r12
+ cmp rsi,r12 ; get min from r12 (address of requested input) and rsi (address of done input)
+
+ jge .LCALC_PROCESSED_BYTES
+ mov rax,rsi ; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+ sub rax,r10 ; sub the input buffer start address
+ ; rax = number of valid processed input bytes = return value
+
+ cmp rsi,rdx ; compare rdx (loop end pointer) and rsi (address of done input)
+ je .LNO_ZERO_OUT
+; NOTE(review): the store that consumed r15/xmm0 is commented out (;ZERO below), so the values computed here are unused
+ mov r15,rax ; number of elements to process
+
+ shl r15,1 ; number of output bytes
+
+ add r15,r11 ; pointer to next byte after full valid output buffer
+
+
+ PXOR xmm0,xmm0 ; all zero
+;ZERO MOVDQU [r15],xmm0 ; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+ MOVDQA xmm6 ,[rsp ]
+ MOVDQA xmm7 ,[rsp+1*16]
+ MOVDQA xmm8 ,[rsp+2*16]
+ MOVDQA xmm9 ,[rsp+3*16]
+ MOVDQA xmm10,[rsp+4*16]
+ MOVDQA xmm11,[rsp+5*16]
+ MOVDQA xmm12,[rsp+6*16]
+ MOVDQA xmm13,[rsp+7*16]
+ MOVDQA xmm14,[rsp+8*16]
+ MOVDQA xmm15,[rsp+9*16]
+
+%endif
+; restore the registers spilled in the prologue; r14 is saved and restored but not otherwise referenced in this function
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; k7 ; CONST BITMASK_ONE_ZERO 1010101010101010 selecting upper half
+; k6 ;
+; k5 ; CONST BITMASK_ZERO_ONE 0101010101010101 selecting lower half
+; k4 ; digit flags QL3
+; k3 ; digit flags QL2
+; k2 ; digit flags QL1
+; k1 ; digit flags QL0
+; k0 ;
+
+; zmm31 ; CONST ENCODE_SHUFFLE_TO_HEX
+; zmm30 ; CONST BITMASK_LOWER_HALF (broadcast to all qwords)
+; zmm29 ; CONST VPERM_ENCODE_OFFSETS
+; zmm28 ; CONST ALL_BYTES_39 ; CONST 48 = 39+9, calculated in the middle
+; zmm27 ; Unpack Upper RL5 RL7
+; zmm26 ; Unpack Lower RL4 RL6
+; zmm25 ; Unpack Upper RL1 RL3
+; zmm24 ; Unpack Lower RL0 RL2
+; zmm23 ; Source Load QLF
+; zmm22 ; Source Load QLE
+; zmm21 ; Source Load QLD
+; zmm20 ; Source Load QLC
+; zmm19 ; Source Load QLB
+; zmm18 ; Source Load QLA
+; zmm17 ; Source Load QL9
+; zmm16 ; Source Load QL8
+
+; zmm15 ; Source Load QL7
+; zmm14 ; Source Load QL6
+; zmm13 ; Source Load QL5
+; zmm12 ; Source Load QL4
+; zmm11 ; Source Load QL3
+; zmm10 ; Source Load QL2
+; zmm9 ; Source Load QL1
+; zmm8 ; Source Load QL0
+; zmm7 ; RL3
+; zmm6 ; RL3
+; zmm5 ; RL2
+; zmm4 ; RL2
+; zmm3 ;
+; zmm2 ; RL1
+; zmm1 ; CONST ALL bytes 48
+; zmm0 ; RL0
+
+%define NHALF_INP_BYTES_PER_ROUND 8*64
+%define NINP_BYTES_PER_ROUND 2*NHALF_INP_BYTES_PER_ROUND
+%define NINP_BITSHIFT 10
+
+hex_encode_avx512bw:
+
+ sub rsp,STACK_ADJ
+
+ mov [rsp+STACK_FOR_XMM+0*8],rdi
+ mov [rsp+STACK_FOR_XMM+1*8],rsi
+ mov [rsp+STACK_FOR_XMM+2*8],r12
+ mov [rsp+STACK_FOR_XMM+3*8],r14
+ mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+ VMOVDQA [rsp ],xmm6
+ VMOVDQA [rsp+1*16],xmm7
+ VMOVDQA [rsp+2*16],xmm8
+ VMOVDQA [rsp+3*16],xmm9
+ VMOVDQA [rsp+4*16],xmm10
+ VMOVDQA [rsp+5*16],xmm11
+ VMOVDQA [rsp+6*16],xmm12
+ VMOVDQA [rsp+7*16],xmm13
+ VMOVDQA [rsp+8*16],xmm14
+ VMOVDQA [rsp+9*16],xmm15
+
+ mov rdi,rcx ; parameter 1 output buffer
+
+ mov rsi,rdx ; parameter 2 input buffer
+
+ mov rdx,r8 ; parameter 3 number of elements
+
+%endif
+
+ VMOVDQA64 zmm29,[VPERM_ENCODE_OFFSETS] ; p_23__ l3
+
+;; initializer for QQ0 and QQ1
+
+ VPERMQ zmm8 ,zmm29,[rsi+0*64] ; AAA p____5 l3+ QL0 RL00,RL01
+ VPERMQ zmm9 ,zmm29,[rsi+1*64] ; AAA p____5 l3+ QL1 RL02,RL03
+ VPERMQ zmm10,zmm29,[rsi+2*64] ; AAA p____5 l3+ QL2 RL04,RL05
+ VPERMQ zmm11,zmm29,[rsi+3*64] ; AAA p____5 l3+ QL3 RL06,RL07
+
+ VPERMQ zmm12,zmm29,[rsi+4*64] ; BBB p____5 l3+ QL4 RL08,RL09
+ VPERMQ zmm13,zmm29,[rsi+5*64] ; BBB p____5 l3+ QL5 RL10,RL11
+ VPERMQ zmm14,zmm29,[rsi+6*64] ; BBB p____5 l3+ QL6 RL12,RL13
+ VPERMQ zmm15,zmm29,[rsi+7*64] ; BBB p____5 l3+ QL7 RL14,RL15
+
+ add rsi,rax ; add half the number of processed array elements
+
+ VPERMQ zmm16,zmm29,[rsi+0*64] ; CCC p____5 l3+ QL8 RL16,RL17
+ VPERMQ zmm17,zmm29,[rsi+1*64] ; CCC p____5 l3+ QL9 RL18,RL19
+ VPERMQ zmm18,zmm29,[rsi+2*64] ; CCC p____5 l3+ QLA RL20,RL21
+ VPERMQ zmm19,zmm29,[rsi+3*64] ; CCC p____5 l3+ QLB RL22,RL23
+
+ VPERMQ zmm20,zmm29,[rsi+4*64] ; DDD p____5 l3+ QLC RL24,RL25
+ VPERMQ zmm21,zmm29,[rsi+5*64] ; DDD p____5 l3+ QLD RL26,RL27
+ VPERMQ zmm22,zmm29,[rsi+6*64] ; DDD p____5 l3+ QLE RL28,RL29
+ VPERMQ zmm23,zmm29,[rsi+7*64] ; DDD p____5 l3+ QLF RL30,RL31
+
+;; initialize constants
+
+ KMOVQ k7,[BITMASK_ONE_ZERO]
+
+ VMOVDQA64 zmm31,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3
+ VMOVDQA64 zmm1,zmm31
+
+ VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF] ; p_23__ l3
+
+
+ VMOVDQA64 zmm28,[ENCODE_SHUFFLE_TO_HIGH_LOW] ; p_23__ l3
+
+;; do page overshoot checks
+
+ mov rax,NHALF_INP_BYTES_PER_ROUND
+
+
+ mov r9,rdx ; exact requested number of elements to process
+ add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+ mov r10,rsi ; r10 saved start of input buffer
+ mov r12,r9 ; r12 save of end of input buffer+1
+
+ lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+
+ lea r8,[rdx+NINP_BYTES_PER_ROUND-1]
+ shr r8,NINP_BITSHIFT ; number of loops
+ shl r8,NINP_BITSHIFT
+ add r8,rsi ; r8 address of last byte+1 read in complete loops
+ add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot
+
+ mov r11,r8
+
+; DISABLED for NO OVERSHOOT
+; add r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+ sub r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+ sub r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+ sub r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+
+ shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte after normal round
+ shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte after prefetch
+
+ cmp rcx,r8 ; stay on same page
+ je .LSAME_PAGE_IN_ROUND
+ sub rdx,rax ; don't overshoot in reading: do one round less
+ sub rdx,rax ; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+ shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
+ cmp rcx,r11
+ je .LSAME_PAGE_IN_PREFETCH
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+ sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+ add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+ ; due to prefetch add one round to end checks
+ add rdx,rax
+ add r9,rax
+
+ mov r11,rdi ; r11 saved start of output buffer
+
+ mov rcx,NHALF_INP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round
+
+
+;; start preprocessing before loop
+
+ VPSRLQ zmm2,zmm8,4 ; AAA RL00+RL01 QL0 shift Hx to lower nibble in byte
+ VPSRLQ zmm3,zmm9,4 ; AAA RL02+RL03 QL1 shift Hx to lower nibble in byte
+
+ VPUNPCKLBW zmm0,zmm2,zmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPUNPCKHBW zmm1,zmm2,zmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSRLQ zmm6,zmm10,4 ; AAA RL04+RL05 QL2 shift Hx to lower nibble in byte
+ VPSRLQ zmm7,zmm11,4 ; AAA RL06+RL07 QL3 shift Hx to lower nibble in byte
+
+ VPUNPCKLBW zmm2,zmm3,zmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPUNPCKHBW zmm3,zmm3,zmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ add rsi,rax ; add half the number of processed array elements
+
+ align 32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+; Vector Port info AVX512
+; ----------------------------------------
+; VPShift p0 l1
+; VPMax/Min p0 l1
+; VPMUL p0 l5 ; with 2FMA-Units p05 (SKX,CLX etc.)
+; VPMOVB2M p0 l3
+; VPSUBUSB /SSB p0 l1
+
+; VPALIGNR p5 l1 ;Shift of n*8 bits!
+; VPERM p5 l3
+; VPERMI2x 1*p05+2*p5 l7 ; (l9 with flags)
+; VPCompare p5 l3-l4
+; VP Pack/Unpack p5 l1(SKX) l3(TGL)
+; VPSHUF p5 l1
+
+
+.LHEXENCODE_LOOP:
+
+;; AAA+BBB
+; process unpacked AAA (QL0-QL3=RL00-RL07) in zmm0-zmm7 and process BBB (QL4-QL7=RL08-RL15) in zmm8-zmm15 and zmm2+zmm3
+
+ VPUNPCKLBW zmm4,zmm6,zmm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm0,zmm0,zmm30 ; AAA RL00 mask lower nibble
+
+ VPUNPCKHBW zmm5,zmm6,zmm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPSRLQ zmm10,zmm12,4 ; BBB RL08+RL09 QL4 shift Hx to lower nibble in byte
+ VPUNPCKLBW zmm8,zmm10,zmm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm1,zmm1,zmm30 ; AAA RL01 mask lower nibble
+
+ VPUNPCKLBW zmm6,zmm7,zmm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm2,zmm2,zmm30 ; AAA RL02 mask lower nibble
+ VPUNPCKHBW zmm7,zmm7,zmm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPSRLQ zmm11,zmm13,4 ; BBB RL10+RL11 QL5 shift Hx to lower nibble in byte
+ VPANDQ zmm8,zmm8,zmm30 ; BBB RL08 mask lower nibble
+
+ VPUNPCKHBW zmm9,zmm10,zmm12 ; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPANDQ zmm3,zmm3,zmm30 ; AAA RL03 mask lower nibble
+ VPUNPCKLBW zmm10,zmm11,zmm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm9,zmm9,zmm30 ; BBB RL09 mask lower nibble
+
+ VPSHUFB zmm0,zmm31,zmm0 ; AAA RL00 shuffle_to_hex_digits
+ VPANDQ zmm4,zmm4,zmm30 ; AAA RL04 mask lower nibble
+ VPUNPCKHBW zmm11,zmm11,zmm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPANDQ zmm10,zmm10,zmm30 ; BBB RL10 mask lower nibble
+
+ VPSHUFB zmm1,zmm31,zmm1 ; AAA RL01 shuffle_to_hex_digits
+ VPANDQ zmm5,zmm5,zmm30 ; AAA RL05 mask lower nibble
+ VPSHUFB zmm8,zmm31,zmm8 ; BBB RL08 shuffle_to_hex_digits
+ VPANDQ zmm11,zmm11,zmm30 ; BBB RL11 mask lower nibble
+
+ VPSHUFB zmm2,zmm31,zmm2 ; AAA RL02 shuffle_to_hex_digits
+ VMOVDQA64 [rdi+0*64],zmm0 ; AAA RL00 Store Hexdump
+ VMOVDQA64 [rdi+1*64],zmm1 ; AAA RL01 Store Hexdump
+ VPANDQ zmm6,zmm6,zmm30 ; AAA RL06 mask lower nibble
+ VPSHUFB zmm9,zmm31,zmm9 ; BBB RL09 shuffle_to_hex_digits
+
+ VPSHUFB zmm3,zmm31,zmm3 ; AAA RL03 shuffle_to_hex_digits
+ VPANDQ zmm7,zmm7,zmm30 ; AAA RL07 mask lower nibble
+ VMOVDQA64 [rdi+2*64],zmm2 ; AAA RL02 Store Hexdump
+ VPSRLQ zmm2,zmm14,4 ; BBB RL12+RL13 QL6 shift Hx to lower nibble in byte
+ VPSHUFB zmm10,zmm31,zmm10 ; BBB RL10 shuffle_to_hex_digits
+ VMOVDQA64 [rdi+3*64],zmm3 ; AAA RL03 Store Hexdump
+
+ VPSRLQ zmm3,zmm15,4 ; BBB RL14+RL15 QL7 shift Hx to lower nibble in byte
+ VPUNPCKLBW zmm12,zmm2,zmm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPSHUFB zmm4,zmm31,zmm4 ; AAA RL04 shuffle_to_hex_digits
+ VMOVDQA64 [rdi+4*64],zmm4 ; AAA RL04 Store Hexdump
+ VPSHUFB zmm11,zmm31,zmm11 ; BBB RL11 shuffle_to_hex_digits
+ VPUNPCKHBW zmm13,zmm2,zmm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSHUFB zmm5,zmm31,zmm5 ; AAA RL05 shuffle_to_hex_digits
+ VPANDQ zmm12,zmm12,zmm30 ; BBB RL12 mask lower nibble
+ VPUNPCKLBW zmm14,zmm3,zmm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VMOVDQA64 [rdi+5*64],zmm5 ; AAA RL05 Store Hexdump
+
+ VPSHUFB zmm6,zmm31,zmm6 ; AAA RL06 shuffle_to_hex_digits
+ VPANDQ zmm13,zmm13,zmm30 ; BBB RL13 mask lower nibble
+ VPUNPCKHBW zmm15,zmm3,zmm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSHUFB zmm7,zmm31,zmm7 ; AAA RL07 shuffle_to_hex_digits
+ VPANDQ zmm14,zmm14,zmm30 ; BBB RL14 mask lower nibble
+ VMOVDQA64 [rdi+6*64],zmm6 ; AAA RL06 Store Hexdump
+ VMOVDQA64 [rdi+7*64],zmm7 ; AAA RL07 Store Hexdump
+ VPSHUFB zmm12,zmm31,zmm12 ; BBB RL12 shuffle_to_hex_digits
+ VPANDQ zmm15,zmm15,zmm30 ; BBB RL15 mask lower nibble
+;
+ VMOVDQA64 [rdi+8*64],zmm8 ; BBB RL08 Store Hexdump
+ VPERMQ zmm8 ,zmm29,[rsi+0*64] ; AAA p____5 l3+ QL0 RL00,RL01
+ VMOVDQA64 [rdi+9*64],zmm9 ; BBB RL09 Store Hexdump
+ VPERMQ zmm9 ,zmm29,[rsi+1*64] ; AAA p____5 l3+ QL1 RL02,RL03
+ VPSHUFB zmm13,zmm31,zmm13 ; BBB RL13 shuffle_to_hex_digits
+
+ VMOVDQA64 [rdi+10*64],zmm10 ; BBB RL10 Store Hexdump
+ VPERMQ zmm10,zmm29,[rsi+2*64] ; AAA p____5 l3+ QL2 RL04,RL05
+ VPSRLQ zmm2,zmm16,4 ; CCC RL16+RL17 QL8 shift Hx to lower nibble in byte
+ VMOVDQA64 [rdi+11*64],zmm11 ; BBB RL11 Store Hexdump
+ VPERMQ zmm11,zmm29,[rsi+3*64] ; AAA p____5 l3+ QL3 RL06,RL07
+
+ VPSHUFB zmm14,zmm31,zmm14 ; BBB RL14 shuffle_to_hex_digits
+ VPSRLQ zmm3,zmm17,4 ; CCC RL18+RL19 QL9 shift Hx to lower nibble in byte
+ VPSHUFB zmm15,zmm31,zmm15 ; BBB RL15 shuffle_to_hex_digits
+
+ VPUNPCKLBW zmm0,zmm2,zmm16 ; CCC RL16 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPSRLQ zmm6,zmm18,4 ; CCC RL20+RL21 QLA shift Hx to lower nibble in byte
+ VMOVDQA64 [rdi+12*64],zmm12 ; BBB RL12 Store Hexdump
+ VPERMQ zmm12,zmm29,[rsi+4*64] ; BBB p____5 l3+ QL4 RL08,RL09
+
+ VPUNPCKHBW zmm1,zmm2,zmm16 ; CCC RL17 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPSRLQ zmm7,zmm19,4 ; CCC RL22+RL23 QLB shift Hx to lower nibble in byte
+ VMOVDQA64 [rdi+13*64],zmm13 ; BBB RL13 Store Hexdump
+ VPERMQ zmm13,zmm29,[rsi+5*64] ; BBB p____5 l3+ QL5 RL10,RL11
+
+ VMOVDQA64 [rdi+14*64],zmm14 ; BBB RL14 Store Hexdump
+ VPERMQ zmm14,zmm29,[rsi+6*64] ; BBB p____5 l3+ QL6 RL12,RL13
+ VPUNPCKLBW zmm2,zmm3,zmm17 ; CCC RL18 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VMOVDQA64 [rdi+15*64],zmm15 ; BBB RL15 Store Hexdump
+
+ add rdi,rcx ; add half the number of processed output bytes
+
+ VPERMQ zmm15,zmm29,[rsi+7*64] ; BBB p____5 l3+ QL7 RL14,RL15
+ VPUNPCKHBW zmm3,zmm3,zmm17 ; CCC RL19 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; CCC+DDD
+; process unpacked CCC (QL8-QLB=RL16-RL23) in zmm0-zmm7 and process DDD (QLC-QLF=RL24-RL31) in zmm16-zmm23 and zmm2+zmm3
+ add rsi,rax ; add half the number of processed array elements
+
+ VPUNPCKLBW zmm4,zmm6,zmm18 ; CCC RL20 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm0,zmm0,zmm30 ; CCC RL16 mask lower nibble
+
+ VPUNPCKHBW zmm5,zmm6,zmm18 ; CCC RL21 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPSRLQ zmm18,zmm20,4 ; DDD RL24+RL25 QLC shift Hx to lower nibble in byte
+ VPUNPCKLBW zmm16,zmm18,zmm20 ; DDD RL24 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm1,zmm1,zmm30 ; CCC RL17 mask lower nibble
+
+ VPUNPCKLBW zmm6,zmm7,zmm19 ; CCC RL22 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm2,zmm2,zmm30 ; CCC RL18 mask lower nibble
+ VPUNPCKHBW zmm7,zmm7,zmm19 ; CCC RL23 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPSRLQ zmm19,zmm21,4 ; DDD RL26+RL27 QLD shift Hx to lower nibble in byte
+ VPANDQ zmm16,zmm16,zmm30 ; DDD RL24 mask lower nibble
+
+ VPUNPCKHBW zmm17,zmm18,zmm20 ; DDD RL25 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPANDQ zmm3,zmm3,zmm30 ; CCC RL19 mask lower nibble
+ VPUNPCKLBW zmm18,zmm19,zmm21 ; DDD RL26 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPANDQ zmm17,zmm17,zmm30 ; DDD RL25 mask lower nibble
+
+ VPSHUFB zmm0,zmm31,zmm0 ; CCC RL16 shuffle_to_hex_digits
+ VPANDQ zmm4,zmm4,zmm30 ; CCC RL20 mask lower nibble
+ VPUNPCKHBW zmm19,zmm19,zmm21 ; DDD RL27 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPANDQ zmm18,zmm18,zmm30 ; DDD RL26 mask lower nibble
+
+ VPSHUFB zmm1,zmm31,zmm1 ; CCC RL17 shuffle_to_hex_digits
+ VPANDQ zmm5,zmm5,zmm30 ; CCC RL21 mask lower nibble
+ VPSHUFB zmm16,zmm31,zmm16 ; DDD RL24 shuffle_to_hex_digits
+ VPANDQ zmm19,zmm19,zmm30 ; DDD RL27 mask lower nibble
+
+ VPSHUFB zmm2,zmm31,zmm2 ; CCC RL18 shuffle_to_hex_digits
+ VMOVDQA64 [rdi+0*64],zmm0 ; CCC RL16 Store Hexdump
+ VMOVDQA64 [rdi+1*64],zmm1 ; CCC RL17 Store Hexdump
+ VPANDQ zmm6,zmm6,zmm30 ; CCC RL22 mask lower nibble
+ VPSHUFB zmm17,zmm31,zmm17 ; DDD RL25 shuffle_to_hex_digits
+
+ VPSHUFB zmm3,zmm31,zmm3 ; CCC RL19 shuffle_to_hex_digits
+ VPANDQ zmm7,zmm7,zmm30 ; CCC RL23 mask lower nibble
+ VMOVDQA64 [rdi+2*64],zmm2 ; CCC RL18 Store Hexdump
+ VPSRLQ zmm2,zmm22,4 ; DDD RL28+RL29 QLE shift Hx to lower nibble in byte
+ VPSHUFB zmm18,zmm31,zmm18 ; DDD RL26 shuffle_to_hex_digits
+ VMOVDQA64 [rdi+3*64],zmm3 ; CCC RL19 Store Hexdump
+
+ VPSRLQ zmm3,zmm23,4 ; DDD RL30+RL31 QLF shift Hx to lower nibble in byte
+ VPUNPCKLBW zmm20,zmm2,zmm22 ; DDD RL28 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPSHUFB zmm4,zmm31,zmm4 ; CCC RL20 shuffle_to_hex_digits
+ VMOVDQA64 [rdi+4*64],zmm4 ; CCC RL20 Store Hexdump
+ VPSHUFB zmm19,zmm31,zmm19 ; DDD RL27 shuffle_to_hex_digits
+ VPUNPCKHBW zmm21,zmm2,zmm22 ; DDD RL29 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSHUFB zmm5,zmm31,zmm5 ; CCC RL21 shuffle_to_hex_digits
+ VPANDQ zmm20,zmm20,zmm30 ; DDD RL28 mask lower nibble
+ VPUNPCKLBW zmm22,zmm3,zmm23 ; DDD RL30 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VMOVDQA64 [rdi+5*64],zmm5 ; CCC RL21 Store Hexdump
+
+ VPSHUFB zmm6,zmm31,zmm6 ; CCC RL22 shuffle_to_hex_digits
+ VPANDQ zmm21,zmm21,zmm30 ; DDD RL29 mask lower nibble
+ VPUNPCKHBW zmm23,zmm3,zmm23 ; DDD RL31 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ VPSHUFB zmm7,zmm31,zmm7 ; CCC RL23 shuffle_to_hex_digits
+ VPANDQ zmm22,zmm22,zmm30 ; DDD RL30 mask lower nibble
+ VMOVDQA64 [rdi+6*64],zmm6 ; CCC RL22 Store Hexdump
+ VMOVDQA64 [rdi+7*64],zmm7 ; CCC RL23 Store Hexdump
+ VPSHUFB zmm20,zmm31,zmm20 ; DDD RL28 shuffle_to_hex_digits
+ VPANDQ zmm23,zmm23,zmm30 ; DDD RL31 mask lower nibble
+;
+ VMOVDQA64 [rdi+8*64],zmm16 ; DDD RL24 Store Hexdump
+ VPERMQ zmm16,zmm29,[rsi+0*64] ; CCC p____5 l3+ QL8 RL16,RL17
+ VMOVDQA64 [rdi+9*64],zmm17 ; DDD RL25 Store Hexdump
+ VPERMQ zmm17,zmm29,[rsi+1*64] ; CCC p____5 l3+ QL9 RL18,RL19
+ VPSHUFB zmm21,zmm31,zmm21 ; DDD RL29 shuffle_to_hex_digits
+
+ VMOVDQA64 [rdi+10*64],zmm18 ; DDD RL26 Store Hexdump
+ VPERMQ zmm18,zmm29,[rsi+2*64] ; CCC p____5 l3+ QLA RL20,RL21
+ VPSRLQ zmm2,zmm8,4 ; AAA RL00+RL01 QL0 shift Hx to lower nibble in byte
+ VMOVDQA64 [rdi+11*64],zmm19 ; DDD RL27 Store Hexdump
+ VPERMQ zmm19,zmm29,[rsi+3*64] ; CCC p____5 l3+ QLB RL22,RL23
+
+ VPSHUFB zmm22,zmm31,zmm22 ; DDD RL30 shuffle_to_hex_digits
+ VPSRLQ zmm3,zmm9,4 ; AAA RL02+RL03 QL1 shift Hx to lower nibble in byte
+ VPSHUFB zmm23,zmm31,zmm23 ; DDD RL31 shuffle_to_hex_digits
+
+ VPUNPCKLBW zmm0,zmm2,zmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VPSRLQ zmm6,zmm10,4 ; AAA RL04+RL05 QL2 shift Hx to lower nibble in byte
+ VMOVDQA64 [rdi+12*64],zmm20 ; DDD RL28 Store Hexdump
+ VPERMQ zmm20,zmm29,[rsi+4*64] ; DDD p____5 l3+ QLC RL24,RL25
+
+ VPUNPCKHBW zmm1,zmm2,zmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+ VPSRLQ zmm7,zmm11,4 ; AAA RL06+RL07 QL3 shift Hx to lower nibble in byte
+ VMOVDQA64 [rdi+13*64],zmm21 ; DDD RL29 Store Hexdump
+ VPERMQ zmm21,zmm29,[rsi+5*64] ; DDD p____5 l3+ QLD RL26,RL27
+
+ VMOVDQA64 [rdi+14*64],zmm22 ; DDD RL30 Store Hexdump
+ VPERMQ zmm22,zmm29,[rsi+6*64] ; DDD p____5 l3+ QLE RL28,RL29
+ VPUNPCKLBW zmm2,zmm3,zmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+ VMOVDQA64 [rdi+15*64],zmm23 ; DDD RL31 Store Hexdump
+ VPERMQ zmm23,zmm29,[rsi+7*64] ; DDD p____5 l3+ QLF RL30,RL31
+
+ add rsi,rax ; add half the number of processed array elements
+
+ VPUNPCKHBW zmm3,zmm3,zmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+ add rdi,rcx ; add half the number of processed output bytes
+
+ cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+ jl .LHEXENCODE_LOOP
+
+ ; end of normal loop reached
+ ; we can do one more round when original count has been reduced by one round
+ cmp rax,0
+ je .LFINISH_EXTRA
+
+ cmp rdx,r9 ; input buffer length was not reduced when equal
+ je .LFINISH_NORMAL
+
+ add rax,rax ; rax is only half the bytes of input round, so double it
+ sub rsi,rax ; for prefetching the last round, load the last round again
+ sub rdx,rax ; adopt and condition for last round also
+ mov rax,0
+ jmp .LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+ add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes
+ jmp .LFINISH
+
+.LFINISH_NORMAL:
+ sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+ ; r12 = address of requested input bytes+1 (saved copy; r9 itself was advanced for the prefetch end check)
+ ; rsi = address of processed input bytes+1
+ ; now get the minimum of r12,rsi to rax
+;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
+;; sub r9,rax
+
+ mov rax,r12
+ cmp rsi,r12 ; get min from rdx (address of requested input) and rsi (address of done input)
+
+ jge .LCALC_PROCESSED_BYTES
+ mov rax,rsi ; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+ sub rax,r10 ; sub the input buffer start address
+ ; rax = number of valid processed input bytes = return value
+
+ cmp rsi,rdx ; compare rdx (address of requested input) and rsi (address of done input)
+ je .LNO_ZERO_OUT
+
+ mov r15,rax ; number of elements to process
+
+ shl r15,1 ; number of output bytes
+
+ add r15,r11 ; pointer to next byte after full valid output buffer
+
+
+ VPXORQ zmm0,zmm0,zmm0 ; all zero
+;ZERO VMOVDQU64 [r15],zmm0 ; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+ VMOVDQA xmm6 ,[rsp ]
+ VMOVDQA xmm7 ,[rsp+1*16]
+ VMOVDQA xmm8 ,[rsp+2*16]
+ VMOVDQA xmm9 ,[rsp+3*16]
+ VMOVDQA xmm10,[rsp+4*16]
+ VMOVDQA xmm11,[rsp+5*16]
+ VMOVDQA xmm12,[rsp+6*16]
+ VMOVDQA xmm13,[rsp+7*16]
+ VMOVDQA xmm14,[rsp+8*16]
+ VMOVDQA xmm15,[rsp+9*16]
+
+%endif
+
+ mov rdi,[rsp+STACK_FOR_XMM+0*8]
+ mov rsi,[rsp+STACK_FOR_XMM+1*8]
+ mov r12,[rsp+STACK_FOR_XMM+2*8]
+ mov r14,[rsp+STACK_FOR_XMM+3*8]
+ mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+ add rsp,STACK_ADJ
+
+ ret
+
+;----------------------------------------------------------------------------------------------
+
+%endif
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/varlena.c b/postgresql-15devel/src/backend/utils/adt/varlena.c
index bd3091b..183f67f 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/varlena.c
+++ b/postgresql-15devel/src/backend/utils/adt/varlena.c
@@ -397,7 +397,7 @@ byteaout(PG_FUNCTION_ARGS)
if (bytea_output == BYTEA_OUTPUT_HEX)
{
/* Print hex format */
- rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
+ rp = result = palloc(hex_enc_len(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena)) + 2 + 1);
*rp++ = '\\';
*rp++ = 'x';
rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
diff --git a/postgresql-15devel_orig/src/include/utils/builtins.h b/postgresql-15devel/src/include/utils/builtins.h
index b07eefa..e6efb73 100644
--- a/postgresql-15devel_orig/src/include/utils/builtins.h
+++ b/postgresql-15devel/src/include/utils/builtins.h
@@ -35,6 +35,9 @@ extern int errdomainconstraint(Oid datatypeOid, const char *conname);
extern uint64 hex_encode(const char *src, size_t len, char *dst);
extern uint64 hex_decode(const char *src, size_t len, char *dst);
+extern uint64 hex_enc_len(const char *src, size_t srclen);
+extern uint64 hex_dec_len(const char *src, size_t srclen);
+
/* int.c */
extern int2vector *buildint2vector(const int16 *int2s, int n);