0001_hex_encode.patch

application/octet-stream

Filename: 0001_hex_encode.patch
Type: application/octet-stream
Part: 0
Message: AW: Introducing PgVA aka PostgresVectorAcceleration using SIMD vector instructions starting with hex_encode

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: unified
File+
src/backend/utils/adt/cpu_capabilities_x86_64.asm 630 0
src/backend/utils/adt/encode.c 88 8
src/backend/utils/adt/hex_x86_64.asm 2915 0
src/backend/utils/adt/Makefile 2 0
src/backend/utils/adt/varlena.c 1 1
src/include/utils/builtins.h 3 0
src/Makefile.global.in 8 0
diff --git a/postgresql-15devel_orig/src/Makefile.global.in b/postgresql-15devel/src/Makefile.global.in
index 05c54b2..ea5c785 100644
--- a/postgresql-15devel_orig/src/Makefile.global.in
+++ b/postgresql-15devel/src/Makefile.global.in
@@ -270,6 +270,10 @@ LLVM_CPPFLAGS = @LLVM_CPPFLAGS@
 LLVM_CFLAGS = @LLVM_CFLAGS@
 LLVM_CXXFLAGS = @LLVM_CXXFLAGS@
 
+# TODO should be adapted to configure
+NASM = nasm
+NASMFLAGS = elf64
+
 # Kind-of compilers
 
 BISON = @BISON@
@@ -782,6 +786,10 @@ endif
 %.bz2: %
 	$(BZIP2) -c $< >$@
 
+# Assemble a NASM source file into an object file; NASMFLAGS holds the output format passed to -f.
+%.o: %.asm
+	$(NASM) -f $(NASMFLAGS) -g -o $@ $< 
+
 # Direct builds of foo.c -> foo are disabled to avoid generating
 # *.dSYM junk on Macs.  All builds should normally go through the
 # foo.c -> foo.o -> foo steps.  This also ensures that dependency
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/Makefile b/postgresql-15devel/src/backend/utils/adt/Makefile
index 41b486b..fa74e69 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/Makefile
+++ b/postgresql-15devel/src/backend/utils/adt/Makefile
@@ -25,6 +25,7 @@ OBJS = \
 	bool.o \
 	cash.o \
 	char.o \
+	cpu_capabilities_x86_64.o \
 	cryptohashfuncs.o \
 	date.o \
 	datetime.o \
@@ -42,6 +43,7 @@ OBJS = \
 	geo_ops.o \
 	geo_selfuncs.o \
 	geo_spgist.o \
+	hex_x86_64.o \
 	inet_cidr_ntop.o \
 	inet_net_pton.o \
 	int.o \
diff --git a/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm b/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm
new file mode 100644
index 0000000..bcb7db3
--- /dev/null
+++ b/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm
@@ -0,0 +1,630 @@
+%ifdef __NASM_MAJOR__
+%ifdef COMPILE_C_STYLE_COMMENTS
+/*-------------------------------------------------------------------------
+ *
+ * cpu_capabilities_x86_64.asm
+ *	  Assembler routines for fetching the cpu_capabilities in a convenient int64
+ *	  and selecting the maximum possible implementation for all valid algorithms
+ *
+ * Copyright (c) 2021-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/cpu_capabilities_x86_64.asm
+ *
+ *-------------------------------------------------------------------------
+ */
+%endif
+
+
+
+
+
+
+; cpu_capabilities_x86_64.asm
+; Assembler routines for fetching the cpu capabilities via CPUID (get_instr_info)
+; and selecting the highest usable implementation of every algorithm (apply_cpu_capabilities) on Intel X64
+
+
+; nasm -f WIN64 -g cpu_capabilities_x86_64.asm    -l cpu_capabilities_x86_64.lis
+
+; golink /console hexdump.obj cpu_capabilities_x86_64.obj hex_x86_64.obj base64_x86_64.obj /files
+
+; Linux register order: 	%rdi, %rsi, %rdx, %rcx, %r8 and %r9
+; Windows register order:	 rcx,  rdx,  r8,   r9
+
+; Windows non volatile registers:	rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
+; Linux non volatile registers:     rbx,rbp,        rsp, r12,r13,r14,r15
+
+; nasm -f elf64 -g cpu_capabilities_x86_64.asm    -l cpu_capabilities_x86_64_elf64.lis
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define __WIN__ 1
+%elifidn __OUTPUT_FORMAT__, elf64
+%define __ELF__ 1
+%endif
+
+
+global apply_cpu_capabilities
+
+global get_instr_info
+
+;;global get_highest_impl_id
+
+
+
+default rel
+
+section .rdata align=64
+
+; These are the ID defines for the different algorithms that are implemented or planned.
+; Every implementation knows its own IMPL_ID, which should never change.
+;
+
+%define	ALGORITHM_ID_HEX_ENCODE			0
+%define	ALGORITHM_ID_HEX_DECODE			1
+%define	ALGORITHM_ID_BASE64_ENCODE		2
+%define	ALGORITHM_ID_BASE64_DECODE		3
+%define	ALGORITHM_ID_CECKSUM			4
+%define	ALGORITHM_ID_CECKSUM_COPY		5
+
+
+
+
+%define	CPU_IS_ARCH_X86_64		1
+%define	CPU_HAS_SSE2			8
+%define	CPU_HAS_SSE3			9
+%define	CPU_HAS_SSSE3			10
+%define	CPU_HAS_SSE4_1			11
+%define	CPU_HAS_SSE4_2			12
+%define	CPU_HAS_AVX				13
+%define	CPU_HAS_F16C			14
+%define	CPU_HAS_AVX2			15
+%define	CPU_HAS_AVX512_F		16
+%define	CPU_HAS_AVX512_VL		17
+%define	CPU_HAS_AVX512_DQ		18
+%define	CPU_HAS_AVX512_BW		19
+%define	CPU_HAS_AVX512_IFMA		20
+%define	CPU_HAS_AVX512_VBMI		21
+%define	CPU_HAS_AVX512_VBMI2	22
+%define	CPU_HAS_AVX512_VNNI		23
+%define	CPU_HAS_AVX512_BITALG	24
+%define	CPU_HAS_AVX512_VPOPCNTDQ	25
+%define	CPU_HAS_AVX512_VP2INTERSECT	26
+%define	CPU_HAS_AVX512_FP16		27
+%define	CPU_HAS_AMX_TILE		28
+%define	CPU_HAS_AMX_BF16		29
+%define	CPU_HAS_AMX_INT8		31
+
+
+REQUIREMENTS_ARR:
+HEX_ENC_CPU_REQUIREMENTS_ARR:
+		dq 0
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_SSE2)
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_SSSE3)
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX2)
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX512_BW)
+
+
+
+HEX_DEC_CPU_REQUIREMENTS_ARR:
+		dq 0
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_SSE2)
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX2)
+		dq CPU_IS_ARCH_X86_64 + (1<<CPU_HAS_AVX512_BW)
+
+
+CPU_REQUIREMENTS_OFFS_ARR_X86_64:
+		dq HEX_ENC_CPU_REQUIREMENTS_ARR - REQUIREMENTS_ARR		;	ALGORITHM_ID_HEX_ENCODE			0
+		dq HEX_DEC_CPU_REQUIREMENTS_ARR - REQUIREMENTS_ARR		;	ALGORITHM_ID_HEX_DECODE			1
+		dq 0        						;	ALGORITHM_ID_BASE64_ENCODE		2
+		dq 0        						;	ALGORITHM_ID_BASE64_DECODE		3
+		dq 0        						;	ALGORITHM_ID_CECKSUM			4
+		dq 0        						;	ALGORITHM_ID_CECKSUM_COPY		5
+
+		times 60 dq 0
+		dq -1
+
+
+
+
+VALID_IMPL_ID_ARR_X86_64:
+; TODO due to error reduced to 3 from 4 !!! 
+		dq 3		;	ALGORITHM_ID_HEX_ENCODE			0
+		dq 3        ;	ALGORITHM_ID_HEX_DECODE			1
+		dq 0        ;	ALGORITHM_ID_BASE64_ENCODE		2
+		dq 0        ;	ALGORITHM_ID_BASE64_DECODE		3
+		dq 0        ;	ALGORITHM_ID_CECKSUM			4
+		dq 0        ;	ALGORITHM_ID_CECKSUM_COPY		5
+
+		times 60 dq -1
+
+
+
+
+
+
+section .text align=32
+
+
+%use smartalign
+
+	ALIGNMODE 	p6
+
+
+%ifdef	USE_LOCAL_IMPL_ID_ARR_LOOKUP
+;----------------------------------------------------------------------------------------------
+
+; get_highest_impl_id accepts the ALGORITHM_ID of the requested algorithm
+; and returns the highest impl_id valid on the current architecture.
+; A return value of zero indicates no valid implementation
+
+get_highest_impl_id:
+								; in:  parameter 1 (Linux rdi / Windows rcx) = ALGORITHM_ID
+								; out: rax = VALID_IMPL_ID_ARR_X86_64[ALGORITHM_ID]
+	sub			rsp,0x28
+
+%ifndef __WIN__
+	mov			rcx,rdi							; LINUX parameter 1 ALGORITHM_ID (WIN: already in rcx)
+%endif
+
+	; RIP-relative addressing cannot be combined with an index register,
+	; so load the table address first and index from it (keeps the code position independent).
+	lea			rax,[VALID_IMPL_ID_ARR_X86_64]
+	mov			rax,[rax+8*rcx]
+	add			rsp,0x28
+	ret
+
+%endif
+
+
+;----------------------------------------------------------------------------------------------
+
+; apply_cpu_capabilities fetches the cpu capabilities and compares them to the cpu_requirements
+; of all algorithms.
+; It uses a local array of algorithm_impl_id, which contains the maximum impl_id for
+; each specific algorithm. A zero indicates the algorithm has not been implemented yet,
+; a -1 terminates the list.
+; According to the cpu capabilities a public impl_id_array (passed by address parameter)
+; is filled with the maximum supported impl_ids of the algorithms.
+; This public array is initialized to all zero, so the support of a specific algorithm must
+; be enabled by this routine.
+; On other CPU architectures (only 1 architecture is active in any program) similar routines
+; can fill the public impl_id_arr according to their implementations.
+; For reference the cpu_capabilities are stored in a public int8 variable passed by address.
+; There is a unique value for every interesting CPU architecture.
+;
+; To mask one or more algorithms (bug hunting, debugging), a mask can be passed as third parameter.
+; Every Algorithm_ID has the corresponding bit in the mask (bit 0..x corresponds to algorithm 0..x).
+; When the bit for a certain algorithm is set, the public impl_id_arr is not set and left at its
+; default of zero.
+
+%define	STACK_ADJ	0x28+4*8
+
+; Register use inside apply_cpu_capabilities:
+;	rdi	address of cpu_capabilities, later address of VALID_IMPL_ID_ARR_X86_64
+;	rsi	address of the public valid_impl_arr
+;	rdx	bitmask of disabled algorithms (get_instr_info must leave it unchanged)
+;	r9	current cpu capabilities
+;	r8	single-bit mask of the current algorithm
+;	r15	index of the current algorithm
+;	rcx	candidate impl_id of the current algorithm (counts down)
+;	r10	address of the requirement table of the current algorithm
+; Stack frame (STACK_ADJ bytes):
+;	[rsp+0x00..0x1f]	kept free: shadow space for the call to get_instr_info (Windows ABI)
+;	[rsp+0x20..0x3f]	saved rdi, rsi, r9, r15
+
+apply_cpu_capabilities:
+						; parameter 1 address cpu_capabilities 	(int8)
+						; parameter 2 address valid_impl_arr	(array of int8)
+						; parameter 3 bitmask algorithm disable	(int8 by value)
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+0x20+0*8],rdi
+	mov			[rsp+0x20+1*8],rsi
+	mov			[rsp+0x20+2*8],r9
+	mov			[rsp+0x20+3*8],r15
+
+%ifdef __WIN__
+	mov			rdi,rcx							; parameter 1 address cpu_capabilities
+	mov			rsi,rdx							; parameter 2 address valid_impl_arr
+	mov			rdx,r8							; parameter 3 bitmask algorithm disable
+%endif
+
+	mov			r9,[rdi]						; load current cpu_capabilities
+	test		r9,r9
+	jns			.capabilities_set				; not negative: already determined earlier
+
+	call		get_instr_info
+	mov			[rdi],rax						; publish the capabilities to the caller
+	mov			r9,rax							; r9 = current capabilities
+
+.capabilities_set:
+	mov			r8d,1							; r8 bitmask for current algorithm
+	lea			rdi,[VALID_IMPL_ID_ARR_X86_64]	; rdi address of local impl_id_arr
+	xor			r15d,r15d						; r15 current index into algorithm arrays
+
+.loop_algorithm:
+	mov			rcx,[rdi+8*r15]					; rcx = max index for current algorithm
+	test		rcx,rcx
+	js			.end_loop_algorithm				; current index = -1 -> goto end
+	jz			.skip_algorithm					; 0 -> algorithm not implemented
+
+	lea			rax,[CPU_REQUIREMENTS_OFFS_ARR_X86_64]
+	mov			rax,[rax+8*r15]					; byte offset of requirement_arr of current algorithm
+	lea			r10,[REQUIREMENTS_ARR]
+	add			r10,rax							; r10 pointer to requirement_arr for current algorithm
+
+.check_requirements:
+	mov			rax,r9							; rax temp for current capabilities
+	and			rax,[8*rcx+r10]
+	cmp			rax,[8*rcx+r10]					; all required bits present?
+	je			.max_index_found
+	sub			rcx,1							; try the next lower impl_id
+	jnz			.check_requirements				; falls through with rcx = 0 when nothing fits
+
+.max_index_found:
+	test		r8,rdx							; algorithm disabled by the mask?
+	jnz			.skip_algorithm
+	mov			[rsi+8*r15],rcx
+
+.skip_algorithm:
+	add 		r8,r8							; shift bitmask of current algorithm 1 to the left
+	add			r15,1
+	jmp			.loop_algorithm
+
+.end_loop_algorithm:
+	mov			rdi,[rsp+0x20+0*8]
+	mov			rsi,[rsp+0x20+1*8]
+	mov			r9 ,[rsp+0x20+2*8]
+	mov			r15,[rsp+0x20+3*8]
+	add			rsp,STACK_ADJ
+	ret
+
+;----------------------------------------------------------------------------------------------
+
+	; 		CPUID Input EAX=01h
+	;		Feature Information Returned in the ECX Register (according to Intel Instruction Manual)
+	;ECX bit
+	;->	 0 	SSE3 Streaming SIMD Extensions 3
+	;	 1 	PCLMULQDQ
+	;	 2 	DTES64 64-bit DS Area.
+	;	 3 	MONITOR MONITOR/MWAIT.
+	;	 4 	DS-CPL CPL Qualified Debug Store.
+	;	 5 	VMX Virtual Machine Extensions.
+	;	 6 	SMX Safer Mode Extensions.
+	;	 7 	EIST Enhanced Intel SpeedStep® technology.
+	;	 8 	TM2 Thermal Monitor 2.
+	;->	 9 	SSSE3
+	;	10 CNXT-ID L1 Context ID.
+	;	11 SDBG
+	;	12 FMA
+	;	13 CMPXCHG16B
+	;	14 xTPR Update Control
+	;	15 PDCM Perfmon and Debug Capability.
+	;	16 Reserved
+	;	17 PCID Process-context identifiers.
+	;	18 DCA
+	;->	19 SSE4_1
+	;->	20 SSE4_2
+	;	21 x2APIC
+	;	22 MOVBE
+	;	23 POPCNT
+	;	24 TSC-Deadline
+	;	25 AESNI
+	;	26 XSAVE
+	;	27 OSXSAVE
+	;->	28 AVX
+	;->	29 F16C
+	;	30 RDRAND
+	;	31 Not Used
+
+	; 		CPUID Input EAX=01h
+	;		Feature Information Returned in the EDX Register (according to Intel Instruction Manual)
+	; EDX bit
+	;	 0 FPU Floating Point Unit On-Chip.
+	;	 1 VME Virtual 8086 Mode Enhancements.
+	;	 2 DE Debugging Extensions.
+	;	 3 PSE Page Size Extension.
+	;	 4 TSC Time Stamp Counter.
+	;	 5 MSR Model Specific Registers RDMSR and WRMSR Instructions.
+	;	 6 PAE Physical Address Extension.
+	;	 7 MCE Machine Check Exception.
+	;	 8 CX8 CMPXCHG8B Instruction. Th
+	;	 9 APIC APIC On-Chip.
+	;	10 Reserved
+	;	11 SEP SYSENTER and SYSEXIT Instructions.
+	;	12 MTRR Memory Type Range Registers
+	;	13 PGE Page Global Bit.
+	;	14 MCA Machine Check Architecture.
+	;	15 CMOV Conditional Move Instructions.
+	;	16 PAT Page Attribute Table.
+	;	17 PSE-36 36-Bit Page Size Extension.
+	;	18 PSN Processor Serial Number.
+	;	19 CLFSH CLFLUSH Instruction.
+	;	20 Reserved
+	;	21 DS Debug Store.
+	;	22 ACPI Thermal Monitor and Software Controlled Clock Facilities.
+	;	23 MMX Intel MMX Technology.
+	;	24 FXSR FXSAVE and FXRSTOR Instructions.
+	;	25 SSE SSE.
+	;->	26 SSE2 SSE2.
+	;	27 SS Self Snoop.
+	;	28 HTT Max APIC IDs reserved field is Valid.
+	;	29 TM Thermal Monitor.
+	;	30 Reserved
+	;	31 PBE Pending Break Enable.
+	;
+
+	; 		CPUID Input EAX=07H
+	;		Feature Information returned in the EAX-EDX Registers (according to Intel Instruction Set extension Manual)
+
+; EBX bits
+	;	EBX Bit00: FSGSBASE. Supports RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE if 1.
+	;	EBX Bit01: IA32_TSC_ADJUST MSR is supported if 1.
+	;	EBX Bit02: SGX
+	;	EBX Bit03: BMI1
+	;	EBX Bit04: HLE
+	;->	EBX Bit05: Intel® AVX2
+	;	EBX Bit06: FDP_EXCPTN_ONLY. x87 FPU Data Pointer updated only on x87 exceptions if 1.
+	;	EBX Bit07: SMEP. Supports Supervisor Mode Execution Protection if 1.
+	;	EBX Bit08: BMI2
+	;	EBX Bit09: Supports Enhanced REP MOVSB/STOSB if 1.
+	;	EBX Bit10: INVPCID
+	;	EBX Bit11: RTM
+	;	EBX Bit12: RDT-M. Supports Intel® Resource Director Technology (Intel® RDT) Monitoring capability if 1.
+	;	EBX Bit13: Deprecates FPU CS and FPU DS values if 1.
+	;	EBX Bit14: Intel® Memory Protection Extensions
+	;	EBX Bit15: RDT-A. Supports Intel® Resource Director Technology (Intel® RDT) Allocation capability if 1.
+	;->	EBX Bit16: AVX512F
+	;->	EBX Bit17: AVX512DQ
+	;	EBX Bit18: RDSEED
+	;	EBX Bit19: ADX
+	;	EBX Bit20: SMAP
+	;->	EBX Bit21: AVX512_IFMA
+	;	EBX Bit22: Reserved
+	;	EBX Bit23: CLFLUSHOPT
+	;	EBX Bit24: CLWB
+	;	EBX Bit25: Intel Processor Trace
+	;	EBX Bit26: AVX512PF (Intel® Xeon Phi™ only.)
+	;	EBX Bit27: AVX512ER (Intel® Xeon Phi™ only.)
+	;	EBX Bit28: AVX512CD
+	;	EBX Bit29: SHA
+	;->	EBX Bit30: AVX512BW
+	;->	EBX Bit31: AVX512VL
+
+
+; ECX bits
+	;	ECX Bit00: PREFETCHWT1 (Intel® Xeon Phi™ only.)
+	;->	ECX Bit01: AVX512_VBMI
+	;	ECX Bit02: UMIP. Supports user-mode instruction prevention if 1.
+	;	ECX Bit03: PKU. Supports protection keys for user-mode pages if 1.
+	;	ECX Bit04: OSPKE. If 1, OS has set CR4.PKE to enable protection keys (and the RDPKRU/WRPKRU instructions).
+	;	ECX Bit05: WAITPKG
+	;->	ECX Bit06: AVX512_VBMI2
+	;	ECX Bit07: CET_SS. Supports CET shadow stack features if 1.
+	;	ECX Bit08: GFNI
+	;	ECX Bit09: VAES
+	;	ECX Bit10: VPCLMULQDQ
+	;->	ECX Bit11: AVX512_VNNI
+	;->	ECX Bit12: AVX512_BITALG
+	;	ECX Bit13: TME_EN.
+	;->	ECX Bit14: AVX512_VPOPCNTDQ
+	;	ECX Bit15: Reserved
+	;	ECX Bit16: LA57. Supports 57-bit linear addresses and five-level paging if 1.
+	;	ECX Bits 21-17: The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode.
+	;	ECX Bit22: RDPID and IA32_TSC_AUX are available if 1.
+	;	ECX Bit23: KL. Supports Key Locker if 1.
+	;	ECX Bit24: Reserved
+	;	ECX Bit25: CLDEMOTE. Supports cache line demote if 1.
+	;	ECX Bit26: Reserved
+	;	ECX Bit27: MOVDIRI. Supports MOVDIRI if 1.
+	;	ECX Bit28: MOVDIR64B. Supports MOVDIR64B if 1.
+	;	ECX Bit29: ENQCMD: Supports Enqueue Stores if 1.
+	;	ECX Bit30: SGX_LC. Supports SGX Launch Configuration if 1.
+	;	ECX Bit31: PKS. Supports protection keys for supervisor-mode pages if 1.
+
+; EDX bits
+	;	EDX Bits 01-00: Reserved
+	;	EDX Bit02: AVX512_4VNNIW (Intel® Xeon Phi™ only.)
+	;	EDX Bit03: AVX512_4FMAPS (Intel® Xeon Phi™ only.)
+	;	EDX Bit04: Fast Short REP MOV
+	;	EDX Bit05: UINTR. If 1, the processor supports user interrupts.
+	;	EDX Bits 07-06: Reserved
+	;->	EDX Bit08: AVX512_VP2INTERSECT
+	;	EDX Bit09: Reserved
+	;	EDX Bit10: MD_CLEAR supported.
+	;	EDX Bits 13-11: Reserved
+	;	EDX Bit14: SERIALIZE
+	;	EDX Bit15: Hybrid. If 1, the processor is identified as a hybrid part.
+	;	EDX Bit16: TSXLDTRK. If 1, the processor supports Intel TSX suspend load address tracking.
+	;	EDX Bit17: Reserved
+	;	EDX Bit18: PCONFIG
+	;	EDX Bit19: Reserved
+	;	EDX Bit20: CET_IBT. Supports CET indirect branch tracking features if 1.
+	;	EDX Bit21: Reserved
+	;->	EDX Bit22: AMX-BF16. If 1, the processor supports tile computational operations on bfloat16 numbers.
+	;->	EDX Bit23: AVX512_FP16
+	;->	EDX Bit24: AMX-TILE. If 1, the processor supports tile architecture
+	;->	EDX Bit25: AMX-INT8. If 1, the processor supports tile computational operations on 8-bit integers.
+	;	EDX Bit26: Enumerates support for indirect branch restricted speculation (IBRS) and the indirect branch predictor barrier (IBPB).
+	;	EDX Bit27: Enumerates support for single thread indirect branch predictors (STIBP).
+	;	EDX Bit29: Enumerates support for the IA32_ARCH_CAPABILITIES MSR.
+	;	EDX Bit30: Enumerates support for the IA32_CORE_CAPABILITIES MSR.
+	;	EDX Bit31: Enumerates support for Speculative Store Bypass Disable (SSBD).
+
+
+
+%define	STACK_ADJ	0x28+6*8
+
+%define	CPUID_01H_ECX_OSXSAVE	27			; CPUID.01H:ECX bit 27: the OS has enabled XGETBV
+%define	XCR0_YMM_STATE			0x00006		; XCR0 bits 1-2: SSE and AVX register state
+%define	XCR0_ZMM_STATE			0x000E6		; XCR0 bits 1-2 and 5-7: plus opmask and ZMM state
+%define	XCR0_AMX_STATE			0x60000		; XCR0 bits 17-18: AMX TILECFG and TILEDATA state
+
+; MAP_CPUID_BIT source_reg32, source_bit, capability_bit
+; ORs bit <source_bit> of <source_reg32> into bit <capability_bit> of r9.
+; Clobbers r8 and the flags.
+%macro MAP_CPUID_BIT 3
+	mov			r8d,%1
+	shr			r8d,%2
+	and			r8d,1
+	shl			r8,%3							; 64 bit shift: capability bits above 31 work too
+	or			r9,r8
+%endmacro
+
+;----------------------------------------------------------------------------------------------
+; get_instr_info collects the CPU features marked with -> in the tables above via CPUID
+; and returns them in rax as a bitmask: CPU_IS_ARCH_X86_64 plus one bit per
+; CPU_HAS_xxx define (bit number = value of the define).
+;
+; It takes no parameters. All registers except rax are preserved, so
+; apply_cpu_capabilities can keep its parameters in registers across the call.
+;
+; A feature is only reported when it can really be used:
+;	- leaf 07H is only queried when CPUID leaf 0 reports it as supported
+;	- AVX, F16C and AVX2 require the OS to have enabled the YMM state in XCR0
+;	- the AVX512 features additionally require the opmask and ZMM state in XCR0
+;	- the AMX features require the tile state in XCR0
+;
+; Register use:
+;	r9	capability bits collected so far
+;	r8	scratch of MAP_CPUID_BIT
+;	r10d	ECX of CPUID leaf 01H (kept because XGETBV and the next CPUID overwrite ecx)
+;	r11d	lower half of XCR0, zero when XGETBV is not available
+;	r15d	highest basic CPUID leaf supported by this CPU
+;----------------------------------------------------------------------------------------------
+
+get_instr_info:
+
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+0*8],rbx
+	mov			[rsp+1*8],rcx
+	mov			[rsp+2*8],rdx
+	mov			[rsp+3*8],r8
+	mov			[rsp+4*8],r9
+	mov			[rsp+5*8],r15
+	mov			[rsp+6*8],r10
+	mov			[rsp+7*8],r11
+
+									; NOTE: the upper bits 32-63 of the corresponding 64bit register are zeroed on 32bit movs!
+									; so r9 starts with only the architecture bit set
+	mov			r9d,CPU_IS_ARCH_X86_64
+
+;LEAF_00H
+	xor			eax,eax
+	cpuid
+	mov			r15d,eax						; r15d = highest supported basic leaf
+
+;LEAF_01H
+	mov			eax,0x01
+	cpuid
+	mov			r10d,ecx						; keep ECX for the OSXSAVE/AVX/F16C checks below
+
+;ECX
+	MAP_CPUID_BIT	ecx,0,CPU_HAS_SSE3
+	MAP_CPUID_BIT	ecx,9,CPU_HAS_SSSE3
+	MAP_CPUID_BIT	ecx,19,CPU_HAS_SSE4_1
+	MAP_CPUID_BIT	ecx,20,CPU_HAS_SSE4_2
+
+;EDX
+	MAP_CPUID_BIT	edx,26,CPU_HAS_SSE2
+
+;XCR0: XGETBV may only be executed when the OS has set OSXSAVE,
+;      otherwise r11d stays zero and no AVX/AVX512/AMX feature is reported
+	xor			r11d,r11d
+	bt			r10d,CPUID_01H_ECX_OSXSAVE
+	jnc			.xcr0_known
+	xor			ecx,ecx							; select XCR0
+	xgetbv										; edx:eax = XCR0
+	mov			r11d,eax
+.xcr0_known:
+
+; AVX and F16C use the YMM registers
+	mov			eax,r11d
+	and			eax,XCR0_YMM_STATE
+	cmp			eax,XCR0_YMM_STATE
+	jne			.leaf_07h
+
+	MAP_CPUID_BIT	r10d,28,CPU_HAS_AVX
+	MAP_CPUID_BIT	r10d,29,CPU_HAS_F16C
+
+.leaf_07h:
+	cmp			r15d,0x07
+	jb			.done							; leaf 07H not available on this CPU
+
+;LEAF_07H
+	mov			eax,0x07
+	xor			ecx,ecx							; subleaf 0
+	cpuid
+
+; AVX2 uses the YMM registers (the ZMM state checked below includes the YMM state)
+	mov			eax,r11d
+	and			eax,XCR0_YMM_STATE
+	cmp			eax,XCR0_YMM_STATE
+	jne			.amx
+
+;EBX
+	MAP_CPUID_BIT	ebx,5,CPU_HAS_AVX2
+
+; all AVX512 features use the opmask and ZMM registers
+	mov			eax,r11d
+	and			eax,XCR0_ZMM_STATE
+	cmp			eax,XCR0_ZMM_STATE
+	jne			.amx
+
+;EBX
+	MAP_CPUID_BIT	ebx,16,CPU_HAS_AVX512_F
+	MAP_CPUID_BIT	ebx,17,CPU_HAS_AVX512_DQ
+	MAP_CPUID_BIT	ebx,21,CPU_HAS_AVX512_IFMA
+	MAP_CPUID_BIT	ebx,30,CPU_HAS_AVX512_BW
+	MAP_CPUID_BIT	ebx,31,CPU_HAS_AVX512_VL
+
+;ECX
+	MAP_CPUID_BIT	ecx,1,CPU_HAS_AVX512_VBMI
+	MAP_CPUID_BIT	ecx,6,CPU_HAS_AVX512_VBMI2
+	MAP_CPUID_BIT	ecx,11,CPU_HAS_AVX512_VNNI
+	MAP_CPUID_BIT	ecx,12,CPU_HAS_AVX512_BITALG
+	MAP_CPUID_BIT	ecx,14,CPU_HAS_AVX512_VPOPCNTDQ
+
+;EDX
+	MAP_CPUID_BIT	edx,8,CPU_HAS_AVX512_VP2INTERSECT
+	MAP_CPUID_BIT	edx,23,CPU_HAS_AVX512_FP16
+
+.amx:
+; the AMX features use the tile registers
+	mov			eax,r11d
+	and			eax,XCR0_AMX_STATE
+	cmp			eax,XCR0_AMX_STATE
+	jne			.done
+
+;EDX
+	MAP_CPUID_BIT	edx,22,CPU_HAS_AMX_BF16
+	MAP_CPUID_BIT	edx,24,CPU_HAS_AMX_TILE
+	MAP_CPUID_BIT	edx,25,CPU_HAS_AMX_INT8
+
+; example for CPU_HAS_property_GT_31
+; 	MAP_CPUID_BIT	edx,26,CPU_HAS_PROPERTY_GT_31
+
+
+
+.done:
+	mov			rax,r9
+
+	mov			rbx,[rsp+0*8]
+	mov			rcx,[rsp+1*8]
+	mov			rdx,[rsp+2*8]
+	mov			r8 ,[rsp+3*8]
+	mov			r9 ,[rsp+4*8]
+	mov			r15,[rsp+5*8]
+	mov			r10,[rsp+6*8]
+	mov			r11,[rsp+7*8]
+
+	add			rsp,STACK_ADJ
+
+	ret
+
+;----------------------------------------------------------------------------------------------
+%endif
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/encode.c b/postgresql-15devel/src/backend/utils/adt/encode.c
index 6dd93f9..7c37989 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/encode.c
+++ b/postgresql-15devel/src/backend/utils/adt/encode.c
@@ -19,6 +19,7 @@
 #include "utils/builtins.h"
 #include "utils/memutils.h"
 
+#define	ALGORITHM_ID_HEX_ENCODE			0
 
 /*
  * Encoding conversion API.
@@ -39,6 +40,28 @@ struct pg_encoding
 
 static const struct pg_encoding *pg_find_encoding(const char *name);
 
+
+	/* TODO BEGIN of block which should be moved to global initialization */
+
+static int64 cpu_capabilities_unmasked = -1;
+static int64 cpu_capabilities = -1;
+static int64 cpu_capabilities_mask = -1;
+static int64 algorithm_disable_mask = 0;
+
+static int64 valid_impl_id_arr[64];
+
+extern size_t apply_cpu_capabilities (int64 *capabilities, int64 *impl_id_arr, int64 mask);
+
+
+extern size_t get_hex_encode_alloc_addon (size_t srclen, int64 impl_id);
+
+extern size_t get_hex_decode_alloc_addon (size_t srclen, int64 impl_id);
+
+extern size_t hex_encode_fast (char *dst, const char *src, size_t srclen, int64 impl_id);
+
+	/* END init */
+
+
 /*
  * SQL functions.
  */
@@ -161,14 +184,46 @@ uint64
 hex_encode(const char *src, size_t len, char *dst)
 {
 	const char *end = src + len;
+	size_t n_done = 0;		/* source bytes already encoded by the SIMD path */
 
-	while (src < end)
+	/* TODO BEGIN of block which should be moved to global initialization */
+
+	/*
+	 * Check the CPU capabilities only once.  hex_enc_len() performs the
+	 * detection as a side effect; its return value is not needed here.
+	 */
+	if (cpu_capabilities < 0)
 	{
-		*dst++ = hextbl[(*src >> 4) & 0xF];
-		*dst++ = hextbl[*src & 0xF];
-		src++;
+		(void) hex_enc_len(src, len);
+	}
+
+	/* END init */
+
+#if defined(__x86_64__) || defined(_M_AMD64)
+	if (len >= 512)
+	{
+		/* Leave the last len_reduce source bytes to the scalar loop below. */
+		const size_t len_reduce = 256;
+
+		n_done = hex_encode_fast(dst, src, len - len_reduce,
+								 valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
 	}
-	return (uint64) len * 2;
+#endif
+
+	/* Encode the remainder (or everything) one byte at a time. */
+	if (n_done < len)
+	{
+		src += n_done;
+		dst += n_done << 1;
+		while (src < end)
+		{
+			*dst++ = hextbl[(*src >> 4) & 0xF];
+			*dst++ = hextbl[*src & 0xF];
+			src++;
+		}
+	}
+
+	return (uint64) len << 1;
 }
 
 static inline char
@@ -223,13 +278,38 @@ hex_decode(const char *src, size_t len, char *dst)
 	return p - dst;
 }
 
-static uint64
+uint64
 hex_enc_len(const char *src, size_t srclen)
 {
-	return (uint64) srclen << 1;
+	/* TODO BEGIN of block which should be moved to global initialization */
+	/*
+	 * Check the CPU capabilities only once.
+	 * When cpu_capabilities is not set (is < 0) we call the architecture-
+	 * dependent instruction information.
+	 * An architecture supported for ASM/SIMD acceleration returns a positive
+	 * value, for all other (not yet) supported architectures we set it to 0
+	 * and report the plain encoded length without any allocation addon.
+	 */
+#if defined(__x86_64__) || defined(_M_AMD64)
+	if (cpu_capabilities < 0)
+	{
+		apply_cpu_capabilities(&cpu_capabilities_unmasked, valid_impl_id_arr, algorithm_disable_mask);
+		cpu_capabilities = cpu_capabilities_unmasked & cpu_capabilities_mask;
+		elog(NOTICE, "post_apply cpu_capabilities = " INT64_FORMAT, cpu_capabilities);
+		elog(NOTICE, "post_apply valid_impl_id_0  = " INT64_FORMAT, valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
+	}
+
+	/* END init */
+
+	return (uint64) (srclen << 1) +
+		get_hex_encode_alloc_addon(srclen,valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
+#else
+	/* no assembler routines on this architecture: nothing to detect or add */
+	cpu_capabilities = 0;
+
+	return (uint64) srclen << 1;
+#endif
 }
 
-static uint64
+uint64
 hex_dec_len(const char *src, size_t srclen)
 {
 	return (uint64) srclen >> 1;
diff --git a/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm b/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm
new file mode 100644
index 0000000..c2fd0c6
--- /dev/null
+++ b/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm
@@ -0,0 +1,2915 @@
+%ifdef __NASM_MAJOR__
+%ifdef COMPILE_C_STYLE_COMMENTS
+/*-------------------------------------------------------------------------
+ *
+ * hex_x86_64.asm
+ *	  Assembler routines for converting a buffer to hex (hex_encode_xxx)
+ *	  and restore the binary from hex code (hex_decode_xxx) on Intel X64
+ *
+ * Copyright (c) 2021-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/hex_x86_64.asm
+ *
+ *-------------------------------------------------------------------------
+ */
+%endif
+
+
+
+
+
+
+; hex_x86_64.asm
+; Assembler routines for converting a buffer to hex (hex_encode_xxx)
+; and restore the binary from hex_code (hex_decode_xxx) on Intel X64
+
+
+; nasm -f WIN64 -g hex_x86_64.asm    -l hex_x86_64.lis
+
+; golink /console hexdump.obj hex_x86_64.obj base64_x86_64.obj /files
+
+; Linux register order: 	%rdi, %rsi, %rdx, %rcx, %r8 and %r9
+; Windows register order:	 rcx,  rdx,  r8,   r9
+
+; Windows non volatile registers:	rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
+; Linux non volatile registers:     rbx,rbp,        rsp, r12,r13,r14,r15
+
+; nasm -f elf64 -g hex_x86_64.asm    -l hex_x86_64_elf64.lis
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define __WIN__ 1
+%elifidn __OUTPUT_FORMAT__, elf64
+%define __ELF__ 1
+%endif
+
+%define	NSHIFT_ADDRESS_TO_PAGE	12
+
+%define N_BYTES_PER_SSE2		16
+%define N_BYTES_PER_AVX2		32
+%define N_BYTES_PER_AVX512		64
+
+global get_hex_encode_alloc_addon
+global get_hex_decode_alloc_addon
+
+global hex_encode_fast
+
+global hex_encode_sse2
+global hex_encode_ssse3
+global hex_encode_avx2
+global hex_encode_avx512bw
+
+global hex_decode_sse2
+global hex_decode_avx2
+global hex_decode_avx512bw
+
+
+default rel
+
+section .rdata align=64
+
+; values loaded with VMOVDQA64 in AVX512, so 64 bytes needed
+
+%define VPERM_AVX2_OFFS	0b11_01_10_00
+
+VPERM_ENCODE_OFFSETS		dq 	0,4,1,5,2,6,3,7
+VPERM_DECODE_OFFSETS		dq 	0,2,4,6,1,3,5,7
+
+ENCODE_SHUFFLE_TO_HEX		times 4 db '0123456789abcdef'
+
+ENCODE_SHUFFLE_TO_HIGH_LOW	times 4 db 8,0,9,1, 10,2,11,3, 12,4,13,5, 14,6,15,7
+
+
+; from here on values used with VPBROADCASTQ in AVX512 / VMOVDQA in AVX2, so only 16/32 bytes needed
+
+;BITMASK_UPPER_HALF			times 32 db 0b1111_0000
+BITMASK_LOWER_HALF			times 32 db 0b0000_1111
+
+BITMASK_NIBBLE_3_IN_WORD 	times 16 dw 0x0F00
+
+BITMASK_LITTLE_TO_BIG_ASCII	times 32 db 0b1101_1111
+BITMASK_BIG_TO_LITTLE_ASCII	times 32 db 0b0010_0000
+
+BITMASK_ZERO_ONE			times 32 db 0b0101_0101
+
+BITMASK_ONE_ZERO			times 32 db 0b1010_1010
+
+BITMASK_SELECT_DIGIT		times 32 db 0b0011_1111
+
+ALL_BYTES_9					times 32 db 9
+
+ASCII_LITTLE_A_ADD:
+ALL_BYTES_39				times 32 db 39
+
+ASCII_0_OFFSET:
+ALL_BYTES_48				times 32 db 48
+
+;ASCII_DIGIT_9				times 32 db 48+9
+
+ASCII_LETTER_LITTLE_A		times 32 db 'a'
+ASCII_LETTER_LITTLE_F		times 32 db 'f'
+
+HEX_ENCODE_ARRAYS:
+HEX_ENC_MIN_SRC_LEN_ARR:
+		dq 0
+		dq 128
+		dq 512
+		dq 512
+		dq 1024
+
+HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR:
+		dq 0
+		dq 64
+		dq 128
+		dq 128
+		dq 256
+
+HEX_ENC_IMPL_ROUTINE_ARR:
+		dq 0
+		dq hex_encode_sse2
+		dq hex_encode_ssse3
+		dq hex_encode_avx2
+		dq hex_encode_avx512bw
+
+
+HEX_DECODE_ARRAYS:
+HEX_DEC_MIN_SRC_LEN_ARR:
+		dq 0
+		dq 128
+		dq 512
+		dq 1024
+
+HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR:
+		dq 0
+		dq 64
+		dq 128
+		dq 256
+
+HEX_DEC_IMPL_ROUTINE_ARR:
+		dq 0
+		dq hex_decode_sse2
+		dq hex_decode_avx2
+		dq hex_decode_avx512bw
+
+
+
+section .text align=32
+
+
+%use smartalign
+
+	ALIGNMODE 	p6
+
+%ifdef __WIN__
+%define	STACK_FOR_XMM	10*16
+%else
+%define	STACK_FOR_XMM	0
+%endif
+
+;----------------------------------------------------------------------------------------------
+
+; get_hex_encode_alloc_addon returns the tail-handling-required allocation addon
+; according to the request length and the maximum valid impl_id
+; it looks for the correct values in the hex_enc_tables indexed by impl_id
+
+get_hex_encode_alloc_addon:
+								; in:  parameter 1 = requested source len, parameter 2 = maximum valid impl_id
+								; out: rax = HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR entry of the selected impl_id
+	sub			rsp,0x28
+
+%ifdef __WIN__
+;	mov			rcx,rcx							; WIN parameter 1  requested source len
+;	mov			rdx,rdx							; WIN parameter 2  maximum valid impl_id
+%else
+	mov			rcx,rdi							; LINUX parameter 1  requested source len
+	mov			rdx,rsi							; LINUX parameter 2  maximum valid impl_id
+%endif
+
+	lea			r8,[HEX_ENC_MIN_SRC_LEN_ARR]
+.loop_search:
+	cmp			rcx,[r8+8*rdx]					; compare requested length with minimum length of current impl_id
+	jae			.offset_found					; unsigned compare: the length is a size_t
+	sub			rdx,1							; lower impl_id
+	jnz			.loop_search					; impl_id 0 ends the search (its addon is 0)
+.offset_found:
+	lea			r8,[HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR]
+	mov			rax,[r8+8*rdx]					; return the alloc_overflow
+
+	add			rsp,0x28
+	ret
+
+
+;----------------------------------------------------------------------------------------------
+
+; get_hex_decode_alloc_addon returns the tail-handling-required allocation addon
+; according to the request length and the maximum valid impl_id
+; It looks for the correct values in the hex_dec tables (HEX_DECODE_ARRAYS) indexed by impl_id
+
+get_hex_decode_alloc_addon:
+								; in:  parameter 1 = requested source len, parameter 2 = maximum valid impl_id
+								; out: rax = HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR entry of the selected impl_id
+	sub			rsp,0x28
+
+%ifdef __WIN__
+;	mov			rcx,rcx							; WIN parameter 1  requested source len
+;	mov			rdx,rdx							; WIN parameter 2  maximum valid impl_id
+%else
+	mov			rcx,rdi							; LINUX parameter 1  requested source len
+	mov			rdx,rsi							; LINUX parameter 2  maximum valid impl_id
+%endif
+
+	lea			r8,[HEX_DEC_MIN_SRC_LEN_ARR]
+.loop_search:
+	cmp			rcx,[r8+8*rdx]					; compare requested length with minimum length of current impl_id
+	jae			.offset_found					; unsigned compare: the length is a size_t
+	sub			rdx,1							; lower impl_id
+	jnz			.loop_search					; impl_id 0 ends the search (its addon is 0)
+.offset_found:
+	lea			r8,[HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR]
+	mov			rax,[r8+8*rdx]					; return the alloc_overflow
+
+	add			rsp,0x28
+	ret
+
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; hex_encode_fast is the dispatcher routine according to the cpu capabilities and
+; the length of the encode request.
+;
+; Parameter 4 (moved to r15) is the maximum valid impl_id fullfilling the cpu requirements
+; (determined at program initialization time outside this routine)
+; The index into the HEX_ENCODE_ARRAYS is set to the maximum supported requirements.
+; When  r15 == 0 no fast encode is supported and a zero length is returned.
+
+%define	STACK_ADJ	0x28+2*8
+
+hex_encode_fast:
+						; in:  parameters 1-3 are passed on unchanged to the selected routine,
+						;      parameter 3 = number of elements, parameter 4 = maximum valid impl_id
+						; out: rax = 0 when no routine fits, else the rax value of the selected routine
+						; frame: [rsp+0x00..0x1f] shadow space for the callee, [rsp+0x20..0x2f] saved r9, r15
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+0x20+0*8],r9
+	mov			[rsp+0x20+1*8],r15
+
+						; r15 = checked highest valid index
+%ifdef __WIN__
+	mov			rax,r8							; WIN parameter 3 number of elements
+	mov			r15,r9							; WIN parameter 4 maximum valid impl_id
+%else
+	mov			rax,rdx							; LINUX parameter 3 number of elements
+	mov			r15,rcx							; LINUX parameter 4 maximum valid impl_id
+%endif
+
+	lea			r10,[HEX_ENC_MIN_SRC_LEN_ARR]
+
+.check_length:
+	cmp			rax,[r10+8*r15]					; long enough for the current impl_id?
+	jae			.max_length_found				; unsigned compare: the length is a size_t
+	sub			r15,1							; try the next lower impl_id
+	jnz			.check_length
+
+.max_length_found:
+	xor			eax,eax							; return value when no routine is called
+	test		r15,r15
+	jz			.return
+
+	lea			r10,[HEX_ENC_IMPL_ROUTINE_ARR]
+	call		[r10+8*r15]
+
+.return:
+	mov			r9,[rsp+0x20+0*8]
+	mov			r15,[rsp+0x20+1*8]
+	add			rsp,STACK_ADJ
+	ret
+
+
+
+%define	STACK_ADJ	0x28+6*8+STACK_FOR_XMM
+
+
+
+;----------------------------------------------------------------------------------------------
+; hex_encode_sse2: SSE2 hex encoder; each round turns 2*16 input bytes into 4*16 hex characters.
+; p1 = output buffer, p2 = input buffer, p3 = number of elements; rax = number of input bytes done.
+; xmm15			; CONST ALL bytes 9
+; xmm14			; CONST BITMASK_NIBBLE_3_IN_WORD
+; xmm13			; CONST ASCII_0_OFFSET (built as xmm12+xmm15)
+; xmm12			; CONST ASCII_LITTLE_A_ADD (ALL_BYTES_39)
+; xmm11			; not used (only saved/restored for __WIN__)
+; xmm10			; not used (only saved/restored for __WIN__)
+; xmm9			; Input Line 1 (loaded one round ahead)
+; xmm8			; Input Line 0 (loaded one round ahead)
+
+; xmm7			; Line 0/1 upper halves interleaved, later RL3 letter flags
+; xmm6			; Line 0/1 lower halves interleaved, later RL1 source and RL3 result
+; xmm5			; RL2 low nibble, later RL2 letter flags
+; xmm4			; RL2 result (line 1 lower half)
+; xmm3			; RL1 low nibble, later RL1 letter flags
+; xmm2			; RL1 result (line 0 upper half)
+; xmm1			; RL0 low nibble, RL0 letter flags, later RL3 high nibble
+; xmm0			; RL0 result (line 0 lower half)
+
+
+
+%define	NINP_BYTES_PER_ROUND	2*16
+%define	NINP_BITSHIFT			5
+
+hex_encode_sse2:
+
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	MOVDQA		[rsp     ],xmm6
+	MOVDQA		[rsp+1*16],xmm7
+	MOVDQA		[rsp+2*16],xmm8
+	MOVDQA		[rsp+3*16],xmm9
+	MOVDQA		[rsp+4*16],xmm10
+	MOVDQA		[rsp+5*16],xmm11
+	MOVDQA		[rsp+6*16],xmm12
+	MOVDQA		[rsp+7*16],xmm13
+	MOVDQA		[rsp+8*16],xmm14
+	MOVDQA		[rsp+9*16],xmm15
+
+	mov			rdi,rcx							; parameter 1 output buffer
+
+	mov			rsi,rdx							; parameter 2 input buffer
+
+	mov			rdx,r8							; parameter 3 number of elements
+
+%endif
+
+
+;; initializer for QQ0 and QQ1 (first round; reads 2*16 bytes before any length check)
+
+	MOVDQU		xmm8,[rsi+0*16]				; QQ0 p__23__ p__23__ l8 QL0
+	MOVDQU		xmm9,[rsi+1*16]				; QQ1 p__23__ p__23__ l8 QL0
+
+;; initialize constants
+
+	MOVDQA xmm15,[ALL_BYTES_9]				; p_23__ l3
+
+	MOVDQA xmm14,[BITMASK_NIBBLE_3_IN_WORD]	; p_23__ l3
+
+;	MOVDQA xmm13,[ALL_BYTES_48]				; p_23__ l3
+
+	MOVDQA xmm12,[ALL_BYTES_39]				; p_23__ l3
+
+
+	MOVDQA		xmm13,xmm12
+	PADDB		xmm13,xmm15					; 48 = 39+9
+
+
+;; do page overshoot checks
+
+	mov			rax,NINP_BYTES_PER_ROUND
+
+
+	mov			r9,rdx							; exact requested number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of requested input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	lea			r8,[rdx+NINP_BYTES_PER_ROUND-1]
+	shr			r8,NINP_BITSHIFT				; number of loops
+	shl			r8,NINP_BITSHIFT
+	add			r8,rsi							; r8 address of last byte+1 read in complete loops
+	add			r8,NINP_BYTES_PER_ROUND-1		; r8 address of last byte read by the look-ahead load of the last round
+
+	mov			r11,r8
+
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data
+												; with the add disabled r11 == r8: both checks below agree (0 or 2 rounds less)
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr			r8,NSHIFT_ADDRESS_TO_PAGE		; r8  page number of last byte read after normal round
+
+	cmp			rcx,r8							; stay on same page
+	je			.LSAME_PAGE_IN_ROUND
+	sub			rdx,rax							; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NINP_BYTES_PER_ROUND<<1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+	add 		rsi,rax						; 						rsi now points to the look-ahead round
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+.LHEXENCODE_LOOP:
+
+	MOVDQA		xmm6,xmm8
+	PUNPCKLBW	xmm6,xmm9					; QL0 p____5 l1 QQ0 lower halves, words = [line 1 byte : line 0 byte]
+
+
+	MOVDQA		xmm7,xmm8
+	PUNPCKHBW	xmm7,xmm9					; QL0 p____5 l1 QQ0 upper halves, words = [line 1 byte : line 0 byte]
+
+											; FIX: reload xmm8/xmm9 directly. They used to be refilled from xmm10/xmm11,
+											; which nothing initializes before the loop, so round 2 encoded undefined data.
+	MOVDQU		xmm8,[rsi+0*16]				; QL0 p_____5 p1____5 l3+ QL0 line 0 of the next round
+
+
+;;
+	MOVDQA		xmm4,xmm6
+	PSRLW		xmm4,12						; RL2 shift RL2 Hx to lower byte in word
+	MOVDQA		xmm5,xmm6
+	PAND		xmm5,xmm14					; RL2 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+											; xmm9 was last read by PUNPCKHBW above, so it is free to reload
+
+	MOVDQU		xmm9,[rsi+1*16]				; QL1 p_____5 p1____5 l3+ QL0 line 1 of the next round
+
+	add 		rsi,rax						; 						add the  number of processed array elements
+
+
+	PSLLW		xmm6,8						; RL0 rotate (shift) RL0 1 byte to left
+
+	MOVDQA		xmm0,xmm6
+	PSRLW		xmm0,4+8					; RL0 shift RL0 Hx to lower byte in word
+	POR			xmm4,xmm5					; RL2 low nibble, high nibble at correct position (0L0H)
+
+
+
+	MOVDQA		xmm1,xmm6
+	PAND		xmm1,xmm14					; RL0 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm6,xmm7
+	PSLLW		xmm6,8						; RL1 rotate (shift) RL1 1 byte to left
+
+
+	MOVDQA		xmm5,xmm4
+	PCMPGTB		xmm5,xmm15					; RL2 all letters set to 0xFF, all digits to 0
+
+
+	POR			xmm0,xmm1					; RL0 low nibble, high nibble at correct position (0L0H)
+
+	PADDB		xmm4,xmm13					; RL2 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+
+	MOVDQA		xmm2,xmm6
+	PSRLW		xmm2,4+8					; RL1 shift RL1 Hx to lower byte in word
+	MOVDQA		xmm3,xmm6
+	PAND		xmm3,xmm14					; RL1 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm1,xmm0
+	PCMPGTB		xmm1,xmm15					; RL0 all letters set to 0xFF, all digits to 0
+
+
+	PAND		xmm5,xmm12					; RL2 for all letters set to 39, else 0 (
+											; RL2 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+
+	POR			xmm2,xmm3					; RL1 low nibble, high nibble at correct position (0L0H)
+
+	PAND		xmm1,xmm12					; RL0 for all letters set to 39, else 0 (
+											; RL0 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+	PADDB		xmm4,xmm5					; RL2 final  result line RL2
+
+	PADDB		xmm0,xmm13					; RL0 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+	MOVDQA		xmm3,xmm2
+	PCMPGTB		xmm3,xmm15					; RL1 all letters set to 0xFF, all digits to 0
+
+	PADDB		xmm2,xmm13					; RL1 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+
+	PADDB		xmm0,xmm1					; RL0 final  result line RL0
+
+	MOVDQA		xmm1,xmm7
+	PSRLW		xmm1,12						; RL3 shift RL3 Hx to lower byte in word
+
+	PAND		xmm3,xmm12					; RL1 for all letters set to 39, else 0 (
+											; RL1 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+	PADDB		xmm2,xmm3					; RL1 final  result line RL1
+
+	MOVDQU		[rdi+0*16],xmm0				; RL0 RL0 p____4_ p____4_ l1 [Lin0 lower half] Store Hexdump
+
+	PAND		xmm7,xmm14					; RL3 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm6,xmm7
+	POR			xmm6,xmm1					; RL3 low nibble, high nibble at correct position (0L0H)
+
+
+	MOVDQU		[rdi+1*16],xmm2				; RL1 RL1 p____4_ p____4_ l1 [Lin0 upper half] Store Hexdump
+
+
+	MOVDQA		xmm7,xmm6
+	PCMPGTB		xmm7,xmm15					; RL3 all letters set to 0xFF, all digits to 0
+
+	PADDB		xmm6,xmm13					; RL3 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+	PAND		xmm7,xmm12					; RL3 for all letters set to 39, else 0 (
+											; RL3 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+	MOVDQU		[rdi+2*16],xmm4				; RL2 RL2 p____4_ p____4_ l1 [Lin1 lower half] Store Hexdump
+
+
+	PADDB		xmm6,xmm7					; RL3 final  result line RL3
+
+	MOVDQU		[rdi+3*16],xmm6				; RL3 RL3 p____4_ p____4_ l1 [Lin1 upper half] Store Hexdump
+
+
+	add 		rdi,rcx						; 						add the number of processed output bytes
+
+	cmp			rsi,rdx						; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jl			.LHEXENCODE_LOOP
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	cmp			rax,0							; rax == 0 marks that the extra round has just been done
+	je			.LFINISH_EXTRA
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH_NORMAL
+
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			rax,rax							; rsi no longer advances: the loop exits after this round
+	jmp			.LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+	add			rsi,NINP_BYTES_PER_ROUND		; add the extra round to get processed bytes
+	jmp .LFINISH
+
+.LFINISH_NORMAL:
+	sub			rsi,NINP_BYTES_PER_ROUND		; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+												; r12 = address of requested input bytes+1
+												; rsi = address of processed input bytes+1
+												; now get the minimum of r12,rsi to rax
+;;	sub			rsi,rax							; for last round do nothing (rax=0), else sub increment for one round
+;;	sub			r9,rax
+
+	mov			rax,r12
+	cmp			rsi,r12							; get min from r12 (address of requested input) and rsi (address of done input)
+
+	jge			.LCALC_PROCESSED_BYTES
+	mov 		rax,rsi							; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+	sub			rax,r10							; sub the input buffer start address
+												; rax = number of valid processed input bytes = return value
+
+	cmp			rsi,rdx							; compare rdx (end of normal loop) and rsi (address of done input)
+	je			.LNO_ZERO_OUT
+												; the zero-out store below is commented out, so this path has no effect
+	mov			r15,rax							; number of elements to process
+
+	shl			r15,1							; number of output bytes
+
+	add			r15,r11							; pointer to next byte after full valid output buffer
+
+	PXOR		xmm0,xmm0						; all zero
+;ZERO	MOVDQU		[r15],xmm0						; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+	MOVDQA		xmm6 ,[rsp     ]
+	MOVDQA		xmm7 ,[rsp+1*16]
+	MOVDQA		xmm8 ,[rsp+2*16]
+	MOVDQA		xmm9 ,[rsp+3*16]
+	MOVDQA		xmm10,[rsp+4*16]
+	MOVDQA		xmm11,[rsp+5*16]
+	MOVDQA		xmm12,[rsp+6*16]
+	MOVDQA		xmm13,[rsp+7*16]
+	MOVDQA		xmm14,[rsp+8*16]
+	MOVDQA		xmm15,[rsp+9*16]
+
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+
+	ret
+
+
+;----------------------------------------------------------------------------------------------
+; hex_decode_sse2: SSE2 hex decoder; each round turns 4*16 hex characters into 2*16 bytes.
+; p1 = output buffer, p2 = input buffer, p3 = number of elements (input characters).
+; rax returns 2 * (number of output bytes written).
+; NOTE(review): the check accumulators xmm12..xmm14 are built but not evaluated in this routine.
+; xmm15			; CONST ALL bytes 9
+; xmm14			; MAX byte value of all lines (after OR 0b0010_0000), init from ASCII_LETTER_LITTLE_F
+; xmm13			; MIN byte value of non-ascii-digit values (not 3x), init from ASCII_LETTER_LITTLE_A
+; xmm12			; ORed compare of all digit-values cmp > 9	 (Init all zero)
+; xmm11			; Input line 3
+; xmm10			; Input line 2
+; xmm9			; Input line 1
+; xmm8			; Input Line 0
+
+; xmm7			; CONST BITMASK_LOWER_HALF
+; xmm6			; temp: all bytes 0b0001_0000, all ONE, word mask 0x00FF
+; xmm5			; line 3 digit flags, later line 3 result words
+; xmm4			; line 2 digit flags, later packed result of line 2+3
+; xmm3			; temp
+; xmm2			; temp (all bytes 0b0010_0000 at loop entry)
+; xmm1			; line 1 digit flags, later line 1 result words
+; xmm0			; line 0 digit flags, later packed result of line 0+1
+
+
+%define	NINP_BYTES_PER_ROUND	4*16
+%define	NINP_BITSHIFT			6
+
+hex_decode_sse2:
+
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	MOVDQA		[rsp     ],xmm6
+	MOVDQA		[rsp+1*16],xmm7
+	MOVDQA		[rsp+2*16],xmm8
+	MOVDQA		[rsp+3*16],xmm9
+	MOVDQA		[rsp+4*16],xmm10
+	MOVDQA		[rsp+5*16],xmm11
+	MOVDQA		[rsp+6*16],xmm12
+	MOVDQA		[rsp+7*16],xmm13
+	MOVDQA		[rsp+8*16],xmm14
+	MOVDQA		[rsp+9*16],xmm15
+
+	mov			rdi,rcx							; parameter 1 output buffer
+
+	mov			rsi,rdx							; parameter 2 input buffer
+
+	mov			rdx,r8							; parameter 3 number of elements
+
+%endif
+
+;; initializer for input lines 0-3 (first round)
+
+	MOVDQU		xmm8,[rsi]				;
+	MOVDQU		xmm9,[rsi+1*16]			;
+
+	MOVDQU		xmm10,[rsi+2*16]		;
+	MOVDQU		xmm11,[rsi+3*16]		;
+
+;; initialize constants
+
+	mov			r15,[BITMASK_BIG_TO_LITTLE_ASCII]
+
+	MOVDQA		xmm7,[BITMASK_LOWER_HALF]
+
+	MOVDQA 		xmm15,[ALL_BYTES_9]			; p_23__ l3
+
+	MOVDQA 		xmm14,[ASCII_LETTER_LITTLE_F]	; p_23__ l3
+
+	MOVDQA 		xmm13,[ASCII_LETTER_LITTLE_A]	; p_23__ l3
+
+	PXOR		xmm12,xmm12						; 							all zero
+
+	MOVQ		xmm2,r15						; 0b0010_0000
+
+
+;; do page overshoot checks
+;; due to end condition handling not done here, we only process full rounds
+;; NOTE(review): the add/shr/shl below rounds the count UP to full rounds - confirm the caller's buffer sizes
+	mov			rax,NINP_BYTES_PER_ROUND
+
+	add			rdx,NINP_BYTES_PER_ROUND-1
+	shr			rdx,NINP_BITSHIFT				;
+	shl			rdx,NINP_BITSHIFT				; rdx number of bytes read in normal loop equiv to xxx full loops
+
+	mov			r9,rdx							; rounded number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of (rounded) input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	mov			r11,r9
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data
+												; NOTE(review): only the first byte behind the buffer is page-checked here
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NINP_BYTES_PER_ROUND>>1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+	PUNPCKLQDQ	xmm2,xmm2				; all bytes 0b0010_0000
+
+;	PUNPCKLQDQ	xmm7,xmm7				; all bytes 0b0000_1111
+
+	MOVDQA		xmm0,xmm2
+	MOVDQA		xmm1,xmm2
+
+	MOVDQA		xmm4,xmm2
+	MOVDQA		xmm5,xmm2
+
+	add 		rsi,rax					; 							rsi now points to the look-ahead round
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+.LHEXDECODE_LOOP:
+
+	MOVDQA		xmm6,xmm2
+
+
+	PSRAD		xmm6,1					; all bytes 0b0001_0000
+
+	POR			xmm0,xmm8				; line 0 all letters set to little ASCII a-f
+	POR			xmm1,xmm9
+	POR			xmm4,xmm10
+	POR			xmm5,xmm11
+
+	PMAXUB		xmm14,xmm0
+	PMAXUB		xmm14,xmm1
+	PMAXUB		xmm14,xmm4
+	PMAXUB		xmm14,xmm5
+
+;max check finished
+
+	POR			xmm0,xmm6				; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx)
+	POR			xmm1,xmm6
+	POR			xmm4,xmm6
+	POR			xmm5,xmm6
+
+	PCMPEQD		xmm6,xmm6				; all ONE
+
+	PCMPEQB		xmm0,xmm8				; set to all ONE when ASCI Digit (forced bit 0bxx11_xxxx equal to orig value)
+	PCMPEQB		xmm1,xmm9
+	PCMPEQB		xmm4,xmm10
+	PCMPEQB		xmm5,xmm11
+
+;start min check line0+1
+	MOVDQA		xmm2,xmm0				; copy all one when digit
+	MOVDQA		xmm3,xmm1
+
+	PANDN		xmm2,xmm6				; set to all one for values NOT digits
+	PANDN		xmm3,xmm6
+
+	PAND		xmm2,xmm8				; set to orig value when NOT ASCI Digit
+	PAND		xmm3,xmm9
+
+	POR			xmm2,xmm0				; set all zero bytes to all one
+	POR			xmm3,xmm1
+
+	PMINUB		xmm13,xmm2
+	PMINUB		xmm13,xmm3
+
+
+;start min check line2+3
+	MOVDQA		xmm2,xmm4				; copy all one when digit
+	MOVDQA		xmm3,xmm5
+
+
+	PANDN		xmm2,xmm6				; set to all one for values NOT digits
+	PANDN		xmm3,xmm6
+
+	PAND		xmm2,xmm10				; set to orig value when NOT ASCI Digit
+	PAND		xmm3,xmm11
+
+	POR			xmm2,xmm4				; set all zero bytes to all one
+	POR			xmm3,xmm5
+
+	PMINUB		xmm13,xmm2
+	PMINUB		xmm13,xmm3
+
+
+; start legal digit check
+
+	MOVDQA		xmm2,xmm0				; copy all one when digit
+	MOVDQA		xmm3,xmm1
+
+	PAND		xmm2,xmm8				; set to orig value when ASCI Digit
+	PAND		xmm3,xmm9
+
+	PAND		xmm2,xmm7				; set to lower nibble value when ASCI Digit
+	PAND		xmm3,xmm7				; FIX: was a second "PAND xmm2,xmm7", leaving line 1 unmasked
+
+	PCMPGTB		xmm2,xmm15				; set to all ONE when ASCI Digit and value > 9
+	PCMPGTB		xmm3,xmm15
+
+	POR			xmm12,xmm2				; accumulate illegal chars like ASCII digit and value > 9
+	POR			xmm12,xmm3
+	MOVDQA		xmm2,xmm4				; FIX: start from the line 2/3 digit flags (xmm2/xmm3 still held
+	MOVDQA		xmm3,xmm5				;      the compare results of line 0/1 here)
+	PAND		xmm2,xmm10				; set to orig value when ASCI Digit
+	PAND		xmm3,xmm11
+	PAND		xmm2,xmm7				; set to lower nibble value when ASCI Digit
+	PAND		xmm3,xmm7
+
+
+	PCMPGTB		xmm2,xmm15				; set to all ONE when ASCI Digit and value > 9
+	PCMPGTB		xmm3,xmm15
+
+	POR			xmm12,xmm2
+	POR			xmm12,xmm3
+
+
+
+;--										; all checks accumulated, now convert: xmm6 becomes the word mask 0x00FF
+	PCMPEQD		xmm6,xmm6				; all ONE
+	PSRLW		xmm6,8					; QQ0 p01____ p01____ l1
+
+	MOVDQA		xmm2,xmm7				; all bytes 0b0000_1111
+	MOVDQA		xmm3,xmm7
+
+	PAND		xmm2,xmm8				; all byte values only lower half (nibble) Line 0+1
+	MOVDQU		xmm8,[rsi+0*16]				; line 0 of the next round
+	PAND		xmm3,xmm9
+	MOVDQU		xmm9,[rsi+1*16]			; line 1 of the next round
+
+	PANDN		xmm0,xmm15				; put 9 to every element not DIGIT
+	PANDN		xmm1,xmm15
+
+	PADDB		xmm2,xmm0				; add 9 to every nibble not DIGIT
+	PADDB		xmm3,xmm1
+
+	MOVDQA		xmm0,xmm2
+	PSRLW		xmm0,8					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	PSLLW		xmm2,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	MOVDQA		xmm1,xmm3
+	PSRLW		xmm1,8					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	PSLLW		xmm3,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	POR			xmm0,xmm2				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+	POR			xmm1,xmm3				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+
+	PAND		xmm0,xmm6				; line 0
+	PAND		xmm1,xmm6				; line 1
+
+	PACKUSWB	xmm0,xmm1				; QQ0 p_____5 p_1___5 l1 QQ0 [Lin0_LeftH] [HL_0 00 HL_1 00 ...]
+
+; line 0 and 1 processed
+
+
+	MOVDQA		xmm2,xmm7				; all bytes 0b0000_1111
+	MOVDQA		xmm3,xmm7
+
+	PAND		xmm2,xmm10				; all byte values only lower half (nibble) Line 2+3
+	MOVDQU		xmm10,[rsi+2*16]		; line 2 of the next round
+	PAND		xmm3,xmm11
+	MOVDQU		xmm11,[rsi+3*16]		; line 3 of the next round
+
+	PANDN		xmm4,xmm15				; put 9 to every element not DIGIT
+	PANDN		xmm5,xmm15
+
+	PADDB		xmm2,xmm4				; add 9 to every nibble not DIGIT
+	PADDB		xmm3,xmm5
+
+	add 		rsi,rax					; 							add the number of processed array elements
+
+	MOVDQU		[rdi+0*16],xmm0			; S0_ p____4_ p____4_ l1 store decoded bytes of line 0+1
+
+
+	MOVDQA		xmm4,xmm2
+	PSRLW		xmm4,8					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	PSLLW		xmm2,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	MOVDQA		xmm5,xmm3
+	PSRLW		xmm5,8					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	PSLLW		xmm3,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+
+	POR			xmm4,xmm2				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+	POR			xmm5,xmm3				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+
+	MOVQ		xmm2,r15				;
+	PUNPCKLQDQ	xmm2,xmm2				; all bytes 0b0010_0000
+
+	MOVDQA		xmm0,xmm2
+	MOVDQA		xmm1,xmm2
+
+;	MOVQ		xmm7,rcx				;
+
+	PAND		xmm4,xmm6				; line 2
+	PAND		xmm5,xmm6				; line 3
+
+	PACKUSWB	xmm4,xmm5				; QQ0 p_____5 p_1___5 l1 QQ0 [Lin0_LeftH] [HL_0 00 HL_1 00 ...]
+
+;	MOVDQA		xmm1,xmm11
+
+
+	MOVDQU		[rdi+1*16],xmm4			; S0_ p____4_ p____4_ l1 store decoded bytes of line 2+3
+
+	MOVDQA		xmm4,xmm2
+	MOVDQA		xmm5,xmm2
+
+;	PUNPCKLQDQ	xmm7,xmm7				; all bytes 0b0000_1111
+
+
+	add 		rdi,rcx					; 						add the number of processed output bytes
+
+	cmp			rsi,rdx					; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jl			.LHEXDECODE_LOOP
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	cmp			rax,0							; rax == 0 marks that the extra round has just been done
+	je			.LFINISH
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH
+
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			rax,rax							; rsi no longer advances: the loop exits after this round
+	jmp			.LHEXDECODE_LOOP
+
+.LFINISH:
+
+	mov			rax,rdi
+	sub			rax,r11							; rax = number of output bytes
+	add			rax,rax							; rax = number of valid processed input bytes = return value
+
+%ifdef __WIN__
+
+	MOVDQA		xmm6 ,[rsp     ]
+	MOVDQA		xmm7 ,[rsp+1*16]
+	MOVDQA		xmm8 ,[rsp+2*16]
+	MOVDQA		xmm9 ,[rsp+3*16]
+	MOVDQA		xmm10,[rsp+4*16]
+	MOVDQA		xmm11,[rsp+5*16]
+	MOVDQA		xmm12,[rsp+6*16]
+	MOVDQA		xmm13,[rsp+7*16]
+	MOVDQA		xmm14,[rsp+8*16]
+	MOVDQA		xmm15,[rsp+9*16]
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+
+	ret
+
+
+
+;----------------------------------------------------------------------------------------------
+; hex_decode_avx2: AVX2 hex decoder; each round turns 4*32 hex characters into 2*32 bytes.
+; p1 = output buffer, p2 = input buffer, p3 = number of elements (input characters).
+; rax returns 2 * (number of output bytes written).
+; NOTE(review): the check accumulators ymm12..ymm14 are built but not evaluated in this routine.
+; ymm15			; CONST ALL bytes 9
+; ymm14			; MAX byte value of all lines (after OR 0b0010_0000), init from ASCII_LETTER_LITTLE_F
+; ymm13			; MIN byte value of non-ascii-digit values (not 3x), init from ASCII_LETTER_LITTLE_A
+; ymm12			; ORed compare of all digit-values cmp > 9	 (Init all zero)
+; ymm11			; Input line 3
+; ymm10			; Input line 2
+; ymm9			; Input line 1
+; ymm8			; Input Line 0
+
+; ymm7			; CONST BITMASK_LOWER_HALF
+; ymm6			; temp: all bytes 0b0001_0000, all ONE, word mask 0x00FF
+; ymm5			; line 3 digit flags, later line 3 result words
+; ymm4			; line 2 digit flags, later packed result of line 2+3
+; ymm3			; temp
+; ymm2			; temp (BITMASK_BIG_TO_LITTLE_ASCII at loop entry)
+; ymm1			; line 1 digit flags, later line 1 result words
+; ymm0			; line 0 digit flags, later packed result of line 0+1
+
+
+%define	NINP_BYTES_PER_ROUND	4*32
+%define	NINP_BITSHIFT			7
+
+hex_decode_avx2:
+
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	VMOVDQA		[rsp     ],xmm6
+	VMOVDQA		[rsp+1*16],xmm7
+	VMOVDQA		[rsp+2*16],xmm8
+	VMOVDQA		[rsp+3*16],xmm9
+	VMOVDQA		[rsp+4*16],xmm10
+	VMOVDQA		[rsp+5*16],xmm11
+	VMOVDQA		[rsp+6*16],xmm12
+	VMOVDQA		[rsp+7*16],xmm13
+	VMOVDQA		[rsp+8*16],xmm14
+	VMOVDQA		[rsp+9*16],xmm15
+
+	mov			rdi,rcx							; parameter 1 output buffer
+
+	mov			rsi,rdx							; parameter 2 input buffer
+
+	mov			rdx,r8							; parameter 3 number of elements
+
+%endif
+
+;; initializer for input lines 0-3 (first round)
+
+	VMOVDQU		ymm8,[rsi+0*32]			;
+	VMOVDQU		ymm9,[rsi+1*32]			;
+
+	VMOVDQU		ymm10,[rsi+2*32]		;
+	VMOVDQU		ymm11,[rsi+3*32]		;
+
+;; initialize constants
+
+	VMOVDQA 	ymm15,[ALL_BYTES_9]					; p_23__ l3
+
+	VMOVDQA 	ymm14,[ASCII_LETTER_LITTLE_F]		; p_23__ l3
+
+	VMOVDQA 	ymm13,[ASCII_LETTER_LITTLE_A]		; p_23__ l3
+
+	VMOVDQA 	ymm7,[BITMASK_LOWER_HALF]			; 0b0000_1111
+
+	VPXOR		ymm12,ymm12							;					all zero
+
+	VMOVDQA 	ymm2,[BITMASK_BIG_TO_LITTLE_ASCII]	; 0b0010_0000
+
+
+
+;; do page overshoot checks
+;; due to end condition handling not done here, we only process full rounds
+;; NOTE(review): the add/shr/shl below rounds the count UP to full rounds - confirm the caller's buffer sizes
+	mov			rax,NINP_BYTES_PER_ROUND
+
+	add			rdx,NINP_BYTES_PER_ROUND-1
+	shr			rdx,NINP_BITSHIFT				;
+	shl			rdx,NINP_BITSHIFT				; rdx number of bytes read in normal loop equiv to xxx full loops
+
+	mov			r9,rdx							; rounded number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of (rounded) input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	mov			r11,r9
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data
+												; NOTE(review): only the first byte behind the buffer is page-checked here
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NINP_BYTES_PER_ROUND>>1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+	add 		rsi,rax					; 							rsi now points to the look-ahead round
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+.LHEXDECODE_LOOP:
+
+	VMOVDQA		ymm6,ymm2
+
+
+	VPSRAD		ymm6,1					; all bytes 0b0001_0000
+
+	VPOR		ymm0,ymm2,ymm8			; line 0 all letters set to little ASCII a-f
+	VPOR		ymm1,ymm2,ymm9
+	VPOR		ymm4,ymm2,ymm10
+	VPOR		ymm5,ymm2,ymm11
+
+	VPMAXUB		ymm14,ymm0
+	VPMAXUB		ymm14,ymm1
+	VPMAXUB		ymm14,ymm4
+	VPMAXUB		ymm14,ymm5
+
+;max check finished
+
+	VPOR		ymm0,ymm6				; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx)
+	VPOR		ymm1,ymm6
+	VPOR		ymm4,ymm6
+	VPOR		ymm5,ymm6
+
+	VPCMPEQD	ymm6,ymm6				; all ONE
+
+	VPCMPEQB	ymm0,ymm8				; set to all ONE when ASCI Digit (forced bit 0bxx11_xxxx equal to orig value)
+	VPCMPEQB	ymm1,ymm9
+	VPCMPEQB	ymm4,ymm10
+	VPCMPEQB	ymm5,ymm11
+
+;start min check line0+1
+	VPANDN		ymm2,ymm0,ymm6			; set to all one for values NOT digits
+	VPANDN		ymm3,ymm1,ymm6
+
+	VPAND		ymm2,ymm8				; set to orig value when NOT ASCI Digit
+	VPAND		ymm3,ymm9
+
+	VPOR		ymm2,ymm0				; set all zero bytes to all one
+	VPOR		ymm3,ymm1
+
+	VPMINUB		ymm13,ymm2
+	VPMINUB		ymm13,ymm3
+
+
+;start min check line2+3
+
+
+	VPANDN		ymm2,ymm4,ymm6			; set to all one for values NOT digits
+	VPANDN		ymm3,ymm5,ymm6
+
+	VPAND		ymm2,ymm10				; set to orig value when NOT ASCI Digit
+	VPAND		ymm3,ymm11
+
+	VPOR		ymm2,ymm4				; set all zero bytes to all one
+	VPOR		ymm3,ymm5
+
+	VPMINUB		ymm13,ymm2
+	VPMINUB		ymm13,ymm3
+
+
+; start legal digit check
+
+	VPAND		ymm2,ymm0,ymm8			; set to orig value when ASCI Digit
+	VPAND		ymm3,ymm1,ymm9
+
+	VPAND		ymm2,ymm7				; set to lower nibble value when ASCI Digit
+	VPAND		ymm3,ymm7				; FIX: was a second "VPAND ymm2,ymm7", leaving line 1 unmasked
+
+	VPCMPGTB	ymm2,ymm15				; set to all ONE when ASCI Digit and value > 9
+	VPCMPGTB	ymm3,ymm15
+
+	VPOR		ymm12,ymm2				; accumulate illegal chars like ASCII digit and value > 9
+	VPOR		ymm12,ymm3
+
+	VPAND		ymm2,ymm4,ymm10			; FIX: use line 2/3 digit flags (was ANDing the line 0/1 compare results)
+	VPAND		ymm3,ymm5,ymm11			; set to orig value when ASCI Digit
+
+	VPAND		ymm2,ymm7				; set to lower nibble value when ASCI Digit
+	VPAND		ymm3,ymm7
+
+
+	VPCMPGTB	ymm2,ymm15				; set to all ONE when ASCI Digit and value > 9
+	VPCMPGTB	ymm3,ymm15
+
+	VPOR		ymm12,ymm2
+	VPOR		ymm12,ymm3
+
+; all (max, min and >9) checks finished
+
+
+;--										; all checks accumulated, now convert: ymm6 becomes the word mask 0x00FF
+	VPCMPEQD	ymm6,ymm6				; all ONE
+	VPSRLW		ymm6,8					; QQ0 p01____ p01____ l1
+
+	VPAND		ymm2,ymm7,ymm8			; all byte values only lower half (nibble) Line 0+1
+	VMOVDQU		ymm8,[rsi+0*32]				; line 0 of the next round
+	VPAND		ymm3,ymm7,ymm9
+	VMOVDQU		ymm9,[rsi+1*32]			; line 1 of the next round
+
+	VPANDN		ymm0,ymm15				; put 9 to every element not DIGIT
+	VPANDN		ymm1,ymm15
+
+	VPADDB		ymm2,ymm0				; add 9 to every nibble not DIGIT
+	VPADDB		ymm3,ymm1
+
+	VPSRLW		ymm0,ymm2,8				; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPSLLW		ymm2,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPSRLW		ymm1,ymm3,8				; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPSLLW		ymm3,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPOR		ymm0,ymm2				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+	VPOR		ymm1,ymm3				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+
+	VPAND		ymm0,ymm6				; line 0
+	VPAND		ymm1,ymm6				; line 1
+
+	VPACKUSWB	ymm0,ymm1				; packs per 128-bit lane: qwords = [L0.lo, L1.lo, L0.hi, L1.hi]
+	VPERMQ		ymm0,ymm0,0xD8			; FIX: reorder qwords 0,2,1,3 -> [L0.lo, L0.hi, L1.lo, L1.hi]
+; line 0 and 1 processed
+
+
+	VPAND		ymm2,ymm7,ymm10			; all byte values only lower half (nibble) Line 2+3
+	VMOVDQU		ymm10,[rsi+2*32]		; line 2 of the next round
+	VPAND		ymm3,ymm7,ymm11
+	VMOVDQU		ymm11,[rsi+3*32]		; line 3 of the next round
+
+	VPANDN		ymm4,ymm15				; put 9 to every element not DIGIT
+	VPANDN		ymm5,ymm15
+
+	VPADDB		ymm2,ymm4				; add 9 to every nibble not DIGIT
+	VPADDB		ymm3,ymm5
+
+	add 		rsi,rax					; 							add the number of processed array elements
+
+	VMOVDQU		[rdi+0*32],ymm0			; S0_ p____4_ p____4_ l1 store decoded bytes of line 0+1
+
+
+	VPSRLW		ymm4,ymm2,8				; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPSLLW		ymm2,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPSRLW		ymm5,ymm3,8				; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+	VPSLLW		ymm3,4					; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+
+
+	VPOR		ymm4,ymm2				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+	VPOR		ymm5,ymm3				; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+
+	VMOVDQA 	ymm2,[BITMASK_BIG_TO_LITTLE_ASCII]				;
+
+
+	VPAND		ymm4,ymm6				; line 2
+	VPAND		ymm5,ymm6				; line 3
+
+	VPACKUSWB	ymm4,ymm5				; packs per 128-bit lane: qwords = [L2.lo, L3.lo, L2.hi, L3.hi]
+	VPERMQ		ymm4,ymm4,0xD8			; FIX: reorder qwords 0,2,1,3 -> [L2.lo, L2.hi, L3.lo, L3.hi]
+
+	VMOVDQU		[rdi+1*32],ymm4			; S0_ p____4_ p____4_ l1 store decoded bytes of line 2+3
+
+	add 		rdi,rcx					; 						add the number of processed output bytes
+
+
+
+	cmp			rsi,rdx					; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jl			.LHEXDECODE_LOOP
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	cmp			rax,0							; rax == 0 marks that the extra round has just been done
+	je			.LFINISH
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH
+
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			rax,rax							; rsi no longer advances: the loop exits after this round
+	jmp			.LHEXDECODE_LOOP
+
+.LFINISH:
+
+	mov			rax,rdi
+	sub			rax,r11							; rax = number of output bytes
+	add			rax,rax							; rax = number of valid processed input bytes = return value
+
+%ifdef __WIN__
+
+	VMOVDQA		xmm6 ,[rsp     ]
+	VMOVDQA		xmm7 ,[rsp+1*16]
+	VMOVDQA		xmm8 ,[rsp+2*16]
+	VMOVDQA		xmm9 ,[rsp+3*16]
+	VMOVDQA		xmm10,[rsp+4*16]
+	VMOVDQA		xmm11,[rsp+5*16]
+	VMOVDQA		xmm12,[rsp+6*16]
+	VMOVDQA		xmm13,[rsp+7*16]
+	VMOVDQA		xmm14,[rsp+8*16]
+	VMOVDQA		xmm15,[rsp+9*16]
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+	VZEROUPPER								; clear the upper ymm halves before returning to non-VEX (SSE) code
+	ret
+
+
+;----------------------------------------------------------------------------------------------
+
+; k7			; compare flags lower   eq little f, initially all ONE
+; k6			; compare flags greater eq little a, initially all ONE QL0,QL1
+; k5			; CONST BITMASK_ZERO_ONE 0101010101010101 selecting lower half
+; k4			; digit flags QL3
+; k3			; digit flags QL2
+; k2			; digit flags QL1
+; k1			; digit flags QL0
+; k0			;
+
+; zmm31			; CONST ALL ZERO
+; zmm30			; CONST BITMASK_LOWER_HALF
+; zmm29			; CONST ASCII_0_OFFSET
+; zmm28			; CONST ASCII_LITTLE_A_ADD
+; zmm27			; CONST VPERM_DECODE_OFFSETS
+; zmm26			; CONST ALL bytes 9
+; zmm25			; Ouptut Line OH1	(Line 0 is stored in the source load regs zmm8-zmm23
+; zmm24			; Output Line OL1
+; zmm23			; Preload QL3
+; zmm22			; Preload QL2
+; zmm21			; Preload QL1
+; zmm20			; Preload QL0
+; zmm19			; Source Load QL3
+; zmm18			; Source Load QL2
+; zmm17			; Source Load QL1
+; zmm16			; Source Load QL0
+
+; zmm15			; QL3 little a
+; zmm14			; QL2 little a
+; zmm13			; QL1 little a
+; zmm12			; QL0 little a
+; zmm11			; QL3 masked for digit
+; zmm10			; QL2 masked for digit
+; zmm9			; QL1 masked for digit
+; zmm8			; QL0 masked for digit
+; zmm7			; lower nibble masked QL3
+; zmm6			; lower nibble masked QL2
+; zmm5			; lower nibble masked QL1
+; zmm4			; lower nibble masked QL0
+; zmm3			;
+; zmm2			;
+; zmm1			;
+; zmm0			;
+
+
+%define	NINP_BYTES_PER_ROUND	4*64
+%define	NINP_BITSHIFT			8
+
+hex_decode_avx512bw:
+; In: rdi=output buffer, rsi=input buffer, rdx=number of input elements (rcx,rdx,r8 under __WIN__). Out: rax=number of processed input bytes.
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	VMOVDQA		[rsp     ],xmm6
+	VMOVDQA		[rsp+1*16],xmm7
+	VMOVDQA		[rsp+2*16],xmm8
+	VMOVDQA		[rsp+3*16],xmm9
+	VMOVDQA		[rsp+4*16],xmm10
+	VMOVDQA		[rsp+5*16],xmm11
+	VMOVDQA		[rsp+6*16],xmm12
+	VMOVDQA		[rsp+7*16],xmm13
+	VMOVDQA		[rsp+8*16],xmm14
+	VMOVDQA		[rsp+9*16],xmm15
+
+	mov			rdi,rcx							; parameter 1 output buffer
+
+	mov			rsi,rdx							; parameter 2 input buffer
+
+	mov			rdx,r8							; parameter 3 number of elements
+
+%endif
+
+;; preload the first round (QL0-QL3) into the preload registers
+
+	VMOVDQU64	zmm20,[rsi+0*64]					; p____5 l3+ preload QL0
+	VMOVDQU64	zmm21,[rsi+1*64]					; p____5 l3+ preload QL1
+	VMOVDQU64	zmm22,[rsi+2*64]					; p____5 l3+ preload QL2
+	VMOVDQU64	zmm23,[rsi+3*64]					; p____5 l3+ preload QL3
+
+;; initialize constants
+
+	KXNORQ		k7,k7,k7							; all one (accumulates the "<= little f" result, not evaluated yet)
+
+	VPBROADCASTQ zmm31,[ALL_BYTES_9]				; p_23__ l3
+
+	VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF]			; p_23__ l3
+
+	KXNORQ		k6,k6,k6							; all one
+
+	VPBROADCASTQ zmm29,[ASCII_LETTER_LITTLE_F]		; p_23__ l3
+
+	VPBROADCASTQ zmm28,[ASCII_LETTER_LITTLE_A]		; p_23__ l3
+
+	KMOVQ		k5,[BITMASK_ZERO_ONE]				; byte mask used to keep the result byte of each word
+
+	VMOVDQA64	zmm27,[VPERM_DECODE_OFFSETS]		; p_23__ l3
+
+	VPBROADCASTQ zmm26,[BITMASK_BIG_TO_LITTLE_ASCII]; p_23__ l3
+
+	VPBROADCASTQ zmm25,[BITMASK_SELECT_DIGIT]		; p_23__ l3
+
+
+;; do page overshoot checks
+;; due to end condition handling not done here, we only process full rounds
+
+	mov			rax,NINP_BYTES_PER_ROUND
+
+	add			rdx,NINP_BYTES_PER_ROUND-1
+	shr			rdx,NINP_BITSHIFT				;
+	shl			rdx,NINP_BITSHIFT				; rdx = input length rounded up to a whole number of rounds
+
+	mov			r9,rdx							; rounded number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of (rounded) input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	mov			r11,r9
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data
+
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NINP_BYTES_PER_ROUND>>1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+;	VPUNPCKHBW	zmm1,zmm16,zmm31					; QL0 p____5 l1 QQ0 [Lin0_LeftH] [00 HL_0 00 HL_1 ...]
+;	VPUNPCKLBW	zmm3,zmm16,zmm31					; QL0 p____5 l1 QQ0 [Lin0_RghtH] [00 HL_0 00 HL_1 ...]
+
+	add 		rsi,rax							; rsi now points to the round that is prefetched inside the loop
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+
+; Vector Port info AVX512
+; ----------------------------------------
+; VPShift			p0		l1
+; VPMax/Min			p0		l1
+; VPMUL				p0		l5		; with 2FMA-Units p05 (SKX,CLX etc.)
+; VPMOVB2M			p0		l3
+; VPSUBUSB /SSB		p0		l1
+
+; VPALIGNR			p5		l1				;Shift of n*8 bits!
+; VPERM				p5		l3
+; VPERMI2x		1*p05+2*p5	l7 		; (l9 with flags)
+; VPCompare			p5		l3-l4
+; VP Pack/Unpack	p5		l1(SKX) l3(TGL)
+; VPSHUF			p5		l1
+
+
+.LHEXDECODE_LOOP:
+
+	VMOVDQA64	zmm16,zmm20							; QL0 copy preload to load
+	VMOVDQA64	zmm17,zmm21							; QL1 copy preload to load
+	VPANDQ		zmm8,zmm25,zmm20					; QL0 set bitmask for digits only
+	VPMAXUB		zmm0,zmm20,zmm21					; QL0,QL1 max from both lines
+;;;	VPCMPB		k7{k7},zmm29,zmm20,2				; QL0 compare lower_eq little f
+
+	VMOVDQA64	zmm18,zmm22							; QL2 copy preload to load
+	VMOVDQA64	zmm19,zmm23							; QL3 copy preload to load
+	VPCMPEQB	k1,zmm8,zmm20						; QL0 compare for is digit (zmm20 still holds QL0 here)
+	VPANDQ		zmm9,zmm25,zmm21					; QL1 set bitmask for digits only
+
+	VMOVDQU64	zmm20,[rsi+0*64]					; p____5 l3+ preload next QL0
+	VMOVDQU64	zmm21,[rsi+1*64]					; p____5 l3+ preload next QL1
+
+	VPANDQ		zmm10,zmm25,zmm18					; QL2 set bitmask for digits only
+	VPCMPEQB	k2,zmm9,zmm17						; QL1 compare for is digit
+
+	VMOVDQU64	zmm22,[rsi+2*64]					; p____5 l3+ preload next QL2
+	VMOVDQU64	zmm23,[rsi+3*64]					; p____5 l3+ preload next QL3
+
+	VPANDQ		zmm11,zmm25,zmm19					; QL3 set bitmask for digits only
+	VPCMPEQB	k3,zmm10,zmm18						; QL2 compare for is digit
+
+	VPMAXUB		zmm1,zmm18,zmm19					; QL2,QL3 max from both lines
+;;;	VPCMPB		k7{k7},zmm29,zmm17,2				; QL1 compare lower_eq little f
+	VPCMPEQB	k4,zmm11,zmm19						; QL3 compare for is digit
+
+	add 		rsi,rax								; advance input pointer by one round (0 in the extra last round)
+
+	VPORQ		zmm12,zmm26,zmm16					; QL0 set bit for little a (result not consumed yet)
+	VPANDQ		zmm4,zmm30,zmm16					; QL0 bitmask lower nibble
+
+	VPORQ		zmm13,zmm26,zmm17					; QL1 set bit for little a (result not consumed yet)
+	VPANDQ		zmm5,zmm30,zmm17					; QL1 bitmask lower nibble
+
+	VPMAXUB		zmm0,zmm0,zmm1						; QL0,QL1,QL2,QL3 max from 4 lines
+	VPADDB		zmm4,zmm4,zmm31						; QL0 add 9
+
+	VPORQ		zmm14,zmm26,zmm18					; QL2 set bit for little a (result not consumed yet)
+	VPANDQ		zmm6,zmm30,zmm18					; QL2 bitmask lower nibble
+
+	VPANDQ		zmm7,zmm30,zmm19					; QL3 bitmask lower nibble
+	VPCMPB		k7{k7},zmm29,zmm0,2					; QL0,QL1,QL2,QL3 compare lower_eq little f
+
+	VPADDB		zmm5,zmm5,zmm31						; QL1 add 9
+	VPORQ		zmm15,zmm26,zmm19					; QL3 set bit for little a (result not consumed yet)
+
+	VPADDB		zmm6,zmm6,zmm31						; QL2 add 9
+	VPADDB		zmm7,zmm7,zmm31						; QL3 add 9
+
+	VPSUBB		zmm4{k1},zmm4,zmm31					; QL0 sub 9 for digits
+	VPSUBB		zmm5{k2},zmm5,zmm31					; QL1 sub 9 for digits
+	VPSUBB		zmm6{k3},zmm6,zmm31					; QL2 sub 9 for digits
+	VPSUBB		zmm7{k4},zmm7,zmm31					; QL3 sub 9 for digits
+
+; combine the two nibble values of each word: first char is the high nibble, so byte = (first<<4) + second
+
+	VPSRLW		zmm0,zmm4,8							; QL0 lower nibble-value moved to low byte of word
+	VPSLLW		zmm4,zmm4,4							; QL0 upper nibble_value * 16 (was shifted by 2, giving wrong bytes)
+
+	VPADDB		zmm0{k5}{z},zmm0,zmm4				; QL0 values in lower byte of word
+
+	VPSRLW		zmm1,zmm5,8							; QL1 lower nibble-value moved to low byte of word
+	VPSLLW		zmm5,zmm5,4							; QL1 upper nibble_value * 16
+
+	VPADDB		zmm1{k5}{z},zmm1,zmm5				; QL1 values in lower byte of word
+
+	VPACKUSWB	zmm0,zmm0,zmm1						; QL0,QL1 values in single bytes (interleaved per 128 bit lane)
+
+;	(debug store of the not yet permuted QL0,QL1 bytes dropped: it was overwritten by the final store below)
+
+	VPERMQ		zmm1,zmm27,zmm0						; QL0,QL1 byte values in right order
+
+;
+
+	VPSRLW		zmm2,zmm6,8							; QL2 lower nibble-value moved to low byte of word
+	VPSLLW		zmm6,zmm6,4							; QL2 upper nibble_value * 16
+
+	VPADDB		zmm2{k5}{z},zmm2,zmm6				; QL2 values in lower byte of word
+
+	VPSRLW		zmm3,zmm7,8							; QL3 lower nibble-value moved to low byte of word
+	VPSLLW		zmm7,zmm7,4							; QL3 upper nibble_value * 16
+
+	VPADDB		zmm3{k5}{z},zmm3,zmm7				; QL3 values in lower byte of word
+
+	VPACKUSWB	zmm2,zmm2,zmm3						; QL2,QL3 values in single bytes (interleaved per 128 bit lane)
+
+;	(debug store of the not yet permuted QL2,QL3 bytes dropped: it was overwritten by the final store below)
+
+	VPERMQ		zmm3,zmm27,zmm2						; QL2,QL3 byte values in right order
+
+; -------- MISSING check for little a (k7 and zmm12-zmm15 are computed above but never evaluated)
+
+	VMOVDQU64	[rdi+0*64],zmm1						; QL0,QL1 store decoded bytes (unaligned: output buffer alignment is not guaranteed)
+	VMOVDQU64	[rdi+1*64],zmm3						; QL2,QL3 store decoded bytes (unaligned: output buffer alignment is not guaranteed)
+
+	add 		rdi,rcx							; 						add the number of processed output bytes
+
+
+	cmp			rsi,rdx								; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jb			.LHEXDECODE_LOOP					; unsigned compare: both operands are addresses
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	test		rax,rax							; rax==0 marks that the extra round has already been done
+	je			.LFINISH
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH
+
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			rax,rax
+	jmp			.LHEXDECODE_LOOP
+
+.LFINISH:
+
+	mov			rax,rdi
+	sub			rax,r11							; rax = number of output bytes
+	add			rax,rax							; rax = number of valid processed input bytes = return value
+
+%ifdef __WIN__
+
+	VMOVDQA		xmm6 ,[rsp     ]
+	VMOVDQA		xmm7 ,[rsp+1*16]
+	VMOVDQA		xmm8 ,[rsp+2*16]
+	VMOVDQA		xmm9 ,[rsp+3*16]
+	VMOVDQA		xmm10,[rsp+4*16]
+	VMOVDQA		xmm11,[rsp+5*16]
+	VMOVDQA		xmm12,[rsp+6*16]
+	VMOVDQA		xmm13,[rsp+7*16]
+	VMOVDQA		xmm14,[rsp+8*16]
+	VMOVDQA		xmm15,[rsp+9*16]
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+	VZEROUPPER										; clear upper halves of zmm0-15 before returning to non-VEX caller code (xmm low parts are kept)
+	ret
+
+
+;----------------------------------------------------------------------------------------------
+
+
+; ymm15			; Source Load QL7
+; ymm14			; Source Load QL6
+; ymm13			; Source Load QL5
+; ymm12			; Source Load QL4
+; ymm11			; Source Load QL3
+; ymm10			; Source Load QL2
+; ymm9			; Source Load QL1
+; ymm8			; Source Load QL0
+
+; ymm7			; CONST ENCODE_SHUFFLE_TO_HEX
+; ymm6			; CONST BITMASK_LOWER_HALF
+; ymm5			; Shift temp for High nibble 1
+; ymm4			; Shift temp for High nibble 0
+; ymm3			; Temp3
+; ymm2			; Temp2
+; ymm1			; Temp1
+; ymm0			; Temp0
+
+
+%define	NINP_BYTES_PER_ROUND	8*32
+%define	NINP_BITSHIFT			8
+
+hex_encode_avx2:
+; In: rdi=output buffer, rsi=input buffer, rdx=number of input elements (rcx,rdx,r8 under __WIN__). Out: rax=number of valid processed input bytes.
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	VMOVDQA		[rsp     ],xmm6
+	VMOVDQA		[rsp+1*16],xmm7
+	VMOVDQA		[rsp+2*16],xmm8
+	VMOVDQA		[rsp+3*16],xmm9
+	VMOVDQA		[rsp+4*16],xmm10
+	VMOVDQA		[rsp+5*16],xmm11
+	VMOVDQA		[rsp+6*16],xmm12
+	VMOVDQA		[rsp+7*16],xmm13
+	VMOVDQA		[rsp+8*16],xmm14
+	VMOVDQA		[rsp+9*16],xmm15
+
+	mov			rdi,rcx							; parameter 1 output buffer
+
+	mov			rsi,rdx							; parameter 2 input buffer
+
+	mov			rdx,r8							; parameter 3 number of elements
+
+%endif
+
+;; Loading QL0-QL3, prefetching QL4-QL7
+
+	VPERMQ		ymm8, [rsi+0*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL0
+	VPERMQ		ymm9, [rsi+1*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL1
+	VPERMQ		ymm10,[rsi+2*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL2
+	VPERMQ		ymm11,[rsi+3*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL3
+
+	VPERMQ		ymm12,[rsi+4*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL4
+	VPERMQ		ymm13,[rsi+5*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL5
+	VPERMQ		ymm14,[rsi+6*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL6
+	VPERMQ		ymm15,[rsi+7*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL7
+
+;; initialize constants
+
+	VMOVDQA 	ymm7,[ENCODE_SHUFFLE_TO_HEX]	; p_23__ l3 lookup table used by VPSHUFB below
+
+	VMOVDQA 	ymm6,[BITMASK_LOWER_HALF]		; p_23__ l3 mask used to keep the lower nibble of each byte
+
+;; do page overshoot checks
+
+	mov			rax,NINP_BYTES_PER_ROUND
+
+
+	mov			r9,rdx							; exact requested number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of requested input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	lea			r8,[rdx+NINP_BYTES_PER_ROUND-1]
+	shr			r8,NINP_BITSHIFT				; number of loops
+	shl			r8,NINP_BITSHIFT
+	add			r8,rsi							; r8 address of last byte+1 read in complete loops
+	add			r8,NINP_BYTES_PER_ROUND-1		; r8 address of last byte read in normal loop with overshoot
+
+	mov			r11,r8							; r11 same address, used for the second (prefetch) page check
+
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data
+
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr			r8,NSHIFT_ADDRESS_TO_PAGE		; r8  page number of last byte read after normal round
+
+	cmp			rcx,r8							; stay on same page
+	je			.LSAME_PAGE_IN_ROUND
+	sub			rdx,rax							; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NINP_BYTES_PER_ROUND<<1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+	VPSRLQ		ymm4,ymm8,4							; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+	VPSRLQ		ymm5,ymm9,4							; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+
+	VPUNPCKLBW	ymm0,ymm4,ymm8						; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPUNPCKHBW	ymm1,ymm4,ymm8						; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add 		rsi,rax								; rsi now points to the round that is prefetched inside the loop
+
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+
+.LHEXENCODE_LOOP:
+
+;; process unpacked AAA in YMM0-YMM4 and YMM8-YMM11, UNPCK BBB to YMM0-YMM1, PreLoad AAA to YMM8-YMM11
+;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
+	VPUNPCKLBW	ymm2,ymm5,ymm9						; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+	VPSRLQ		ymm4,ymm10,4						; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte
+	VPUNPCKHBW	ymm3,ymm5,ymm9						; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+	VPAND		ymm0,ymm0,ymm6						; AAA RL00 mask lower nibble
+
+	VPSRLQ		ymm5,ymm11,4						; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte
+	VPUNPCKLBW	ymm8,ymm4,ymm10						; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPAND		ymm1,ymm1,ymm6						; AAA RL01 mask lower nibble
+	VPUNPCKHBW	ymm9,ymm4,ymm10						; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPAND		ymm2,ymm2,ymm6						; AAA RL02 mask lower nibble
+	VPUNPCKLBW	ymm10,ymm5,ymm11					; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPAND		ymm3,ymm3,ymm6						; AAA RL03 mask lower nibble
+	VPUNPCKHBW	ymm11,ymm5,ymm11					; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSHUFB		ymm0,ymm7,ymm0						; AAA RL00 shuffle_to_hex_digits
+	VPAND		ymm8,ymm8,ymm6						; AAA RL04 mask lower nibble
+	VPSHUFB		ymm1,ymm7,ymm1						; AAA RL01 shuffle_to_hex_digits
+	VPAND		ymm9,ymm9,ymm6						; AAA RL05 mask lower nibble
+
+	VPSHUFB		ymm2,ymm7,ymm2						; AAA RL02 shuffle_to_hex_digits
+	VMOVDQU		[rdi+0*32],ymm0						; AAA RL00 Store Hexdump
+	VPAND		ymm10,ymm10,ymm6					; AAA RL06 mask lower nibble
+
+	VPSHUFB		ymm3,ymm7,ymm3						; AAA RL03 shuffle_to_hex_digits
+	VMOVDQU		[rdi+1*32],ymm1						; AAA RL01 Store Hexdump
+	VPAND		ymm11,ymm11,ymm6					; AAA RL07 mask lower nibble
+
+	VPSHUFB		ymm8,ymm7,ymm8						; AAA RL04 shuffle_to_hex_digits
+	VPSRLQ		ymm4,ymm12,4						; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte
+	VMOVDQU		[rdi+2*32],ymm2						; AAA RL02 Store Hexdump
+	VPSHUFB		ymm9,ymm7,ymm9						; AAA RL05 shuffle_to_hex_digits
+	VPSRLQ		ymm5,ymm13,4						; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte
+	VMOVDQU		[rdi+3*32],ymm3						; AAA RL03 Store Hexdump
+
+	VPSHUFB		ymm10,ymm7,ymm10					; AAA RL06 shuffle_to_hex_digits
+	VMOVDQU		[rdi+4*32],ymm8						; AAA RL04 Store Hexdump
+	VPERMQ		ymm8, [rsi+0*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL0 (preload next round)
+	VMOVDQU		[rdi+5*32],ymm9						; AAA RL05 Store Hexdump
+	VPERMQ		ymm9, [rsi+1*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL1 (preload next round)
+
+	VPSHUFB		ymm11,ymm7,ymm11					; AAA RL07 shuffle_to_hex_digits
+	VMOVDQU		[rdi+6*32],ymm10					; AAA RL06 Store Hexdump
+	VPUNPCKLBW	ymm0,ymm4,ymm12						; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPERMQ		ymm10,[rsi+2*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL2 (preload next round)
+
+	VMOVDQU		[rdi+7*32],ymm11					; AAA RL07 Store Hexdump
+	VPERMQ		ymm11,[rsi+3*32],VPERM_AVX2_OFFS	; AAA p_____5 p1____5 l3+ QL3 (preload next round)
+	VPUNPCKHBW	ymm1,ymm4,ymm12						; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; process unpacked BBB in YMM0-YMM4 and YMM9-YMM15, UNPCK AAA to YMM0-YMM1, PreLoad BBB to YMM12-YMM15
+;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
+	VPUNPCKLBW	ymm2,ymm5,ymm13						; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+	VPSRLQ		ymm4,ymm14,4						; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte
+	VPUNPCKHBW	ymm3,ymm5,ymm13						; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+	VPAND		ymm0,ymm0,ymm6						; BBB RL08 mask lower nibble
+
+	VPSRLQ		ymm5,ymm15,4						; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte
+	VPUNPCKLBW	ymm12,ymm4,ymm14					; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPAND		ymm1,ymm1,ymm6						; BBB RL09 mask lower nibble
+	VPUNPCKHBW	ymm13,ymm4,ymm14					; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPAND		ymm2,ymm2,ymm6						; BBB RL10 mask lower nibble
+	VPUNPCKLBW	ymm14,ymm5,ymm15					; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPAND		ymm3,ymm3,ymm6						; BBB RL11 mask lower nibble
+	VPUNPCKHBW	ymm15,ymm5,ymm15					; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSHUFB		ymm0,ymm7,ymm0						; BBB RL08 shuffle_to_hex_digits
+	VPAND		ymm12,ymm12,ymm6					; BBB RL12 mask lower nibble
+	VPSHUFB		ymm1,ymm7,ymm1						; BBB RL09 shuffle_to_hex_digits
+	VPAND		ymm13,ymm13,ymm6					; BBB RL13 mask lower nibble
+
+	VPSHUFB		ymm2,ymm7,ymm2						; BBB RL10 shuffle_to_hex_digits
+	VMOVDQU		[rdi+8*32],ymm0						; BBB RL08 Store Hexdump
+	VPAND		ymm14,ymm14,ymm6					; BBB RL14 mask lower nibble
+
+	VPSHUFB		ymm3,ymm7,ymm3						; BBB RL11 shuffle_to_hex_digits
+	VMOVDQU		[rdi+9*32],ymm1						; BBB RL09 Store Hexdump
+	VPAND		ymm15,ymm15,ymm6					; BBB RL15 mask lower nibble
+
+	VPSHUFB		ymm12,ymm7,ymm12					; BBB RL12 shuffle_to_hex_digits
+	VPSRLQ		ymm4,ymm8,4							; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+	VMOVDQU		[rdi+10*32],ymm2					; BBB RL10 Store Hexdump
+	VPSHUFB		ymm13,ymm7,ymm13					; BBB RL13 shuffle_to_hex_digits
+	VPSRLQ		ymm5,ymm9,4							; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+	VMOVDQU		[rdi+11*32],ymm3					; BBB RL11 Store Hexdump
+
+	VPSHUFB		ymm14,ymm7,ymm14					; BBB RL14 shuffle_to_hex_digits
+	VMOVDQU		[rdi+12*32],ymm12					; BBB RL12 Store Hexdump
+	VPERMQ		ymm12, [rsi+4*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL4 (preload next round)
+	VMOVDQU		[rdi+13*32],ymm13					; BBB RL13 Store Hexdump
+	VPERMQ		ymm13, [rsi+5*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL5 (preload next round)
+
+	VPSHUFB		ymm15,ymm7,ymm15					; BBB RL15 shuffle_to_hex_digits
+	VMOVDQU		[rdi+14*32],ymm14					; BBB RL14 Store Hexdump
+	VPUNPCKLBW	ymm0,ymm4,ymm8						; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPERMQ		ymm14,[rsi+6*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL6 (preload next round)
+
+	VMOVDQU		[rdi+15*32],ymm15					; BBB RL15 Store Hexdump
+	VPERMQ		ymm15,[rsi+7*32],VPERM_AVX2_OFFS	; BBB p_____5 p1____5 l3+ QL7 (preload next round)
+
+	add 		rsi,rax								; advance input pointer by one round (0 in the extra last round)
+
+	VPUNPCKHBW	ymm1,ymm4,ymm8						; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add 		rdi,rcx								; 					add the number of processed output bytes
+
+
+	cmp			rsi,rdx								; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jb			.LHEXENCODE_LOOP					; unsigned compare: both operands are addresses
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	test		rax,rax							; rax==0 marks that the extra round has already been done
+	je			.LFINISH_EXTRA
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH_NORMAL
+
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			rax,rax
+	jmp			.LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+	add			rsi,NINP_BYTES_PER_ROUND		; add the extra round to get processed bytes
+	jmp .LFINISH
+
+.LFINISH_NORMAL:
+	sub			rsi,NINP_BYTES_PER_ROUND		; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+												; r12 = address of requested input bytes+1
+												; rsi = address of processed input bytes+1
+												; now get the minimum of r12,rsi to rax
+;;	sub			rsi,rax							; for last round do nothing (rax=0), else sub increment for one round
+;;	sub			r9,rax
+
+	mov			rax,r12
+	cmp			rsi,r12							; get min from r12 (address of requested input) and rsi (address of done input)
+
+	jae			.LCALC_PROCESSED_BYTES			; unsigned compare: both operands are addresses
+	mov 		rax,rsi							; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+	sub			rax,r10							; sub the input buffer start address
+												; rax = number of valid processed input bytes = return value
+
+	cmp			rsi,rdx							; compare rdx (end pointer of the loop) and rsi (address of done input)
+	je			.LNO_ZERO_OUT
+
+	mov			r15,rax							; number of elements to process
+
+	shl			r15,1							; number of output bytes
+
+	add			r15,r11							; pointer to next byte after full valid output buffer
+
+; NOTE: the zero-out store below is disabled, so r15 and ymm0 prepared here are currently unused
+	VPXOR		ymm0,ymm0,ymm0					; all zero
+;ZERO	VMOVDQU		[r15],ymm0						; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+	VMOVDQA		xmm6 ,[rsp     ]
+	VMOVDQA		xmm7 ,[rsp+1*16]
+	VMOVDQA		xmm8 ,[rsp+2*16]
+	VMOVDQA		xmm9 ,[rsp+3*16]
+	VMOVDQA		xmm10,[rsp+4*16]
+	VMOVDQA		xmm11,[rsp+5*16]
+	VMOVDQA		xmm12,[rsp+6*16]
+	VMOVDQA		xmm13,[rsp+7*16]
+	VMOVDQA		xmm14,[rsp+8*16]
+	VMOVDQA		xmm15,[rsp+9*16]
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+
+	VZEROUPPER										; clear upper ymm halves before returning to non-VEX caller code (xmm low parts are kept)
+	ret
+
+;----------------------------------------------------------------------------------------------
+
+
+; xmm15			; Source Load QL7
+; xmm14			; Source Load QL6
+; xmm13			; Source Load QL5
+; xmm12			; Source Load QL4
+; xmm11			; Source Load QL3
+; xmm10			; Source Load QL2
+; xmm9			; Source Load QL1
+; xmm8			; Source Load QL0
+
+; xmm7			; CONST ENCODE_SHUFFLE_TO_HEX
+; xmm6			; CONST BITMASK_LOWER_HALF
+; xmm5			; Shift temp for High nibble 1
+; xmm4			; Shift temp for High nibble 0
+; xmm3			; Temp3
+; xmm2			; Temp2
+; xmm1			; Temp1
+; xmm0			; Temp0
+
+
+
+%define	NINP_BYTES_PER_ROUND	8*16
+%define	NINP_BITSHIFT			7
+
+
+hex_encode_ssse3:
+; In: rdi=output buffer, rsi=input buffer, rdx=number of input elements (rcx,rdx,r8 under __WIN__). Out: rax=number of valid processed input bytes.
+	sub			rsp,STACK_ADJ
+
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	MOVDQA		[rsp     ],xmm6
+	MOVDQA		[rsp+1*16],xmm7
+	MOVDQA		[rsp+2*16],xmm8
+	MOVDQA		[rsp+3*16],xmm9
+	MOVDQA		[rsp+4*16],xmm10
+	MOVDQA		[rsp+5*16],xmm11
+	MOVDQA		[rsp+6*16],xmm12
+	MOVDQA		[rsp+7*16],xmm13
+	MOVDQA		[rsp+8*16],xmm14
+	MOVDQA		[rsp+9*16],xmm15
+
+	mov			rdi,rcx							; parameter 1 output buffer
+
+	mov			rsi,rdx							; parameter 2 input buffer
+
+	mov			rdx,r8							; parameter 3 number of elements
+
+%endif
+
+;; Loading QL0-QL3, prefetching QL4-QL7
+
+	MOVDQU		xmm8, [rsi+0*16]			; AAA p_____5 p1____5 l3+ QL0
+	MOVDQU		xmm9, [rsi+1*16]			; AAA p_____5 p1____5 l3+ QL1
+	MOVDQU		xmm10,[rsi+2*16]			; AAA p_____5 p1____5 l3+ QL2
+	MOVDQU		xmm11,[rsi+3*16]			; AAA p_____5 p1____5 l3+ QL3
+
+	MOVDQU		xmm12,[rsi+4*16]			; BBB p_____5 p1____5 l3+ QL4
+	MOVDQU		xmm13,[rsi+5*16]			; BBB p_____5 p1____5 l3+ QL5
+	MOVDQU		xmm14,[rsi+6*16]			; BBB p_____5 p1____5 l3+ QL6
+	MOVDQU		xmm15,[rsi+7*16]			; BBB p_____5 p1____5 l3+ QL7
+
+;; initialize constants
+
+	MOVDQA 	xmm7,[ENCODE_SHUFFLE_TO_HEX]	; p_23__ l3 lookup table used by PSHUFB below
+
+	MOVDQA 	xmm6,[BITMASK_LOWER_HALF]		; p_23__ l3 mask used to keep the lower nibble of each byte
+
+;; do page overshoot checks
+
+	mov			rax,NINP_BYTES_PER_ROUND
+
+
+	mov			r9,rdx							; exact requested number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of requested input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	lea			r8,[rdx+NINP_BYTES_PER_ROUND-1]
+	shr			r8,NINP_BITSHIFT				; number of loops
+	shl			r8,NINP_BITSHIFT
+	add			r8,rsi							; r8 address of last byte+1 read in complete loops
+	add			r8,NINP_BYTES_PER_ROUND-1		; r8 address of last byte read in normal loop with overshoot
+
+	mov			r11,r8							; r11 same address, used for the second (prefetch) page check
+
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data
+
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr			r8,NSHIFT_ADDRESS_TO_PAGE		; r8  page number of last byte read after normal round
+
+	cmp			rcx,r8							; stay on same page
+	je			.LSAME_PAGE_IN_ROUND
+	sub			rdx,rax							; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NINP_BYTES_PER_ROUND<<1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+	MOVDQA		xmm4,xmm8
+	PSRLQ		xmm4,4						; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+	MOVDQA		xmm5,xmm9
+	PSRLQ		xmm5,4						; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+
+	MOVDQA		xmm0,xmm4
+	PUNPCKLBW	xmm0,xmm8					; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	MOVDQA		xmm1,xmm4
+	PUNPCKHBW	xmm1,xmm8					; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add 		rsi,rax						; rsi now points to the round that is prefetched inside the loop
+
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+.LHEXENCODE_LOOP:
+
+;; process unpacked AAA in XMM0-XMM4 and XMM8-XMM11, UNPCK BBB to XMM0-XMM1, PreLoad AAA to XMM8-XMM11
+;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
+	MOVDQA		xmm2,xmm5
+	PUNPCKLBW	xmm2,xmm9					; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+	MOVDQA		xmm4,xmm10
+	PSRLQ		xmm4,4						; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte
+	MOVDQA		xmm3,xmm5
+	PUNPCKHBW	xmm3,xmm9					; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+	PAND		xmm0,xmm6					; AAA RL00 mask lower nibble
+
+	MOVDQA		xmm5,xmm11
+	PSRLQ		xmm5,4						; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte
+	MOVDQA		xmm8,xmm4
+	PUNPCKLBW	xmm8,xmm10					; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND		xmm1,xmm6					; AAA RL01 mask lower nibble
+	MOVDQA		xmm9,xmm4
+	PUNPCKHBW	xmm9,xmm10					; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	PAND		xmm2,xmm6					; AAA RL02 mask lower nibble
+	MOVDQA		xmm10,xmm5
+	PUNPCKLBW	xmm10,xmm11					; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND		xmm3,xmm6					; AAA RL03 mask lower nibble
+	MOVDQA		xmm4,xmm5
+	PUNPCKHBW	xmm4,xmm11					; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	MOVDQA		xmm11,xmm4					; AAA RL07 back to xmm11 (xmm4 is reused as temp below)
+
+	MOVDQA		xmm4,xmm7
+	PSHUFB		xmm4,xmm0					; AAA RL00 shuffle_to_hex_digits
+	PAND		xmm8,xmm6					; AAA RL04 mask lower nibble
+	MOVDQA		xmm5,xmm7
+	PSHUFB		xmm5,xmm1					; AAA RL01 shuffle_to_hex_digits
+	PAND		xmm9,xmm6					; AAA RL05 mask lower nibble
+
+	MOVDQA		xmm0,xmm7
+	PSHUFB		xmm0,xmm2					; AAA RL02 shuffle_to_hex_digits
+	MOVDQU		[rdi+0*16],xmm4				; AAA RL00 Store Hexdump
+	PAND		xmm10,xmm6					; AAA RL06 mask lower nibble
+
+	MOVDQA		xmm1,xmm7
+	PSHUFB		xmm1,xmm3					; AAA RL03 shuffle_to_hex_digits
+	MOVDQU		[rdi+1*16],xmm5				; AAA RL01 Store Hexdump
+	PAND		xmm11,xmm6					; AAA RL07 mask lower nibble
+
+	MOVDQA		xmm2,xmm7
+	PSHUFB		xmm2,xmm8					; AAA RL04 shuffle_to_hex_digits
+	MOVDQA		xmm4,xmm12
+	PSRLQ		xmm4,4						; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte
+	MOVDQU		[rdi+2*16],xmm0				; AAA RL02 Store Hexdump
+	MOVDQA		xmm3,xmm7
+	PSHUFB		xmm3,xmm9					; AAA RL05 shuffle_to_hex_digits
+	MOVDQA		xmm5,xmm13
+	PSRLQ		xmm5,4						; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte
+	MOVDQU		[rdi+3*16],xmm1				; AAA RL03 Store Hexdump
+
+	MOVDQA		xmm0,xmm7
+	PSHUFB		xmm0,xmm10					; AAA RL06 shuffle_to_hex_digits
+	MOVDQU		[rdi+4*16],xmm2				; AAA RL04 Store Hexdump
+	MOVDQU		xmm8, [rsi+0*16]			; AAA p_____5 p1____5 l3+ QL0 (preload next round)
+	MOVDQU		[rdi+5*16],xmm3				; AAA RL05 Store Hexdump
+	MOVDQU		xmm9, [rsi+1*16]			; AAA p_____5 p1____5 l3+ QL1 (preload next round)
+
+	MOVDQA		xmm1,xmm7
+	PSHUFB		xmm1,xmm11					; AAA RL07 shuffle_to_hex_digits
+	MOVDQU		[rdi+6*16],xmm0				; AAA RL06 Store Hexdump
+	MOVDQA		xmm0,xmm4
+	PUNPCKLBW	xmm0,xmm12					; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	MOVDQU		xmm10,[rsi+2*16]			; AAA p_____5 p1____5 l3+ QL2 (preload next round)
+
+	MOVDQU		[rdi+7*16],xmm1				; AAA RL07 Store Hexdump
+	MOVDQU		xmm11,[rsi+3*16]			; AAA p_____5 p1____5 l3+ QL3 (preload next round)
+	MOVDQA		xmm1,xmm4
+	PUNPCKHBW	xmm1,xmm12					; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; process unpacked BBB in XMM0-XMM4 and XMM9-XMM15, UNPCK AAA to XMM0-XMM1, PreLoad BBB to XMM12-XMM15
+;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
+	MOVDQA		xmm2,xmm5
+	PUNPCKLBW	xmm2,xmm13					; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+	MOVDQA		xmm4,xmm14
+	PSRLQ		xmm4,4						; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte
+	MOVDQA		xmm3,xmm5
+	PUNPCKHBW	xmm3,xmm13					; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+	PAND		xmm0,xmm6					; BBB RL08 mask lower nibble
+
+	MOVDQA		xmm5,xmm15
+	PSRLQ		xmm5,4						; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte
+	MOVDQA		xmm12,xmm4
+	PUNPCKLBW	xmm12,xmm14					; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND		xmm1,xmm6					; BBB RL09 mask lower nibble
+	MOVDQA		xmm13,xmm4
+	PUNPCKHBW	xmm13,xmm14					; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	PAND		xmm2,xmm6					; BBB RL10 mask lower nibble
+	MOVDQA		xmm14,xmm5
+	PUNPCKLBW	xmm14,xmm15					; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND		xmm3,xmm6					; BBB RL11 mask lower nibble
+	MOVDQA		xmm4,xmm5
+	PUNPCKHBW	xmm4,xmm15					; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	MOVDQA		xmm15,xmm4					; BBB RL15 back to xmm15 (xmm4 is reused as temp below)
+
+	MOVDQA		xmm4,xmm7
+	PSHUFB		xmm4,xmm0					; BBB RL08 shuffle_to_hex_digits
+	PAND		xmm12,xmm6					; BBB RL12 mask lower nibble
+	MOVDQA		xmm5,xmm7
+	PSHUFB		xmm5,xmm1					; BBB RL09 shuffle_to_hex_digits
+	PAND		xmm13,xmm6					; BBB RL13 mask lower nibble
+
+	MOVDQA		xmm0,xmm7
+	PSHUFB		xmm0,xmm2					; BBB RL10 shuffle_to_hex_digits
+	MOVDQU		[rdi+8*16],xmm4				; BBB RL08 Store Hexdump
+	PAND		xmm14,xmm6					; BBB RL14 mask lower nibble
+
+	MOVDQA		xmm1,xmm7
+	PSHUFB		xmm1,xmm3					; BBB RL11 shuffle_to_hex_digits
+	MOVDQU		[rdi+9*16],xmm5				; BBB RL09 Store Hexdump
+	PAND		xmm15,xmm6					; BBB RL15 mask lower nibble
+
+	MOVDQA		xmm2,xmm7
+	PSHUFB		xmm2,xmm12					; BBB RL12 shuffle_to_hex_digits
+	MOVDQA		xmm4,xmm8
+	PSRLQ		xmm4,4						; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+	MOVDQU		[rdi+10*16],xmm0			; BBB RL10 Store Hexdump
+	MOVDQA		xmm3,xmm7
+	PSHUFB		xmm3,xmm13					; BBB RL13 shuffle_to_hex_digits
+	MOVDQA		xmm5,xmm9
+	PSRLQ		xmm5,4						; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+	MOVDQU		[rdi+11*16],xmm1			; BBB RL11 Store Hexdump
+
+	MOVDQA		xmm0,xmm7
+	PSHUFB		xmm0,xmm14					; BBB RL14 shuffle_to_hex_digits
+	MOVDQU		[rdi+12*16],xmm2			; BBB RL12 Store Hexdump
+	MOVDQU		xmm12, [rsi+4*16]			; BBB p_____5 p1____5 l3+ QL4 (preload next round)
+	MOVDQU		[rdi+13*16],xmm3			; BBB RL13 Store Hexdump
+	MOVDQU		xmm13, [rsi+5*16]			; BBB p_____5 p1____5 l3+ QL5 (preload next round)
+
+	MOVDQA		xmm1,xmm7
+	PSHUFB		xmm1,xmm15					; BBB RL15 shuffle_to_hex_digits
+	MOVDQU		[rdi+14*16],xmm0			; BBB RL14 Store Hexdump
+	MOVDQA		xmm0,xmm4
+	PUNPCKLBW	xmm0,xmm8					; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	MOVDQU		xmm14,[rsi+6*16]			; BBB p_____5 p1____5 l3+ QL6 (preload next round)
+
+	MOVDQU		[rdi+15*16],xmm1			; BBB RL15 Store Hexdump
+	MOVDQU		xmm15,[rsi+7*16]			; BBB p_____5 p1____5 l3+ QL7 (preload next round)
+
+	add 		rsi,rax						; advance input pointer by one round (0 in the extra last round)
+
+	MOVDQA		xmm1,xmm4
+	PUNPCKHBW	xmm1,xmm8					; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add 		rdi,rcx						; 					add the number of processed output bytes
+
+
+	cmp			rsi,rdx						; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jb			.LHEXENCODE_LOOP			; unsigned compare: both operands are addresses
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	test		rax,rax							; rax==0 marks that the extra round has already been done
+	je			.LFINISH_EXTRA
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH_NORMAL
+
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			rax,rax
+	jmp			.LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+	add			rsi,NINP_BYTES_PER_ROUND		; add the extra round to get processed bytes
+	jmp .LFINISH
+
+.LFINISH_NORMAL:
+	sub			rsi,NINP_BYTES_PER_ROUND		; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+												; r12 = address of requested input bytes+1
+												; rsi = address of processed input bytes+1
+												; now get the minimum of r12,rsi to rax
+;;	sub			rsi,rax							; for last round do nothing (rax=0), else sub increment for one round
+;;	sub			r9,rax
+
+	mov			rax,r12
+	cmp			rsi,r12							; get min from r12 (address of requested input) and rsi (address of done input)
+
+	jae			.LCALC_PROCESSED_BYTES			; unsigned compare: both operands are addresses
+	mov 		rax,rsi							; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+	sub			rax,r10							; sub the input buffer start address
+												; rax = number of valid processed input bytes = return value
+
+	cmp			rsi,rdx							; compare rdx (end pointer of the loop) and rsi (address of done input)
+	je			.LNO_ZERO_OUT
+
+	mov			r15,rax							; number of elements to process
+
+	shl			r15,1							; number of output bytes
+
+	add			r15,r11							; pointer to next byte after full valid output buffer
+
+; NOTE: the zero-out store below is disabled, so r15 and xmm0 prepared here are currently unused
+	PXOR		xmm0,xmm0						; all zero
+;ZERO	MOVDQU		[r15],xmm0						; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+	MOVDQA		xmm6 ,[rsp     ]
+	MOVDQA		xmm7 ,[rsp+1*16]
+	MOVDQA		xmm8 ,[rsp+2*16]
+	MOVDQA		xmm9 ,[rsp+3*16]
+	MOVDQA		xmm10,[rsp+4*16]
+	MOVDQA		xmm11,[rsp+5*16]
+	MOVDQA		xmm12,[rsp+6*16]
+	MOVDQA		xmm13,[rsp+7*16]
+	MOVDQA		xmm14,[rsp+8*16]
+	MOVDQA		xmm15,[rsp+9*16]
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+
+	ret
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; k7			; CONST BITMASK_ONE_ZERO 1010101010101010 selecting upper half
+; k6			;
+; k5			; CONST BITMASK_ZERO_ONE 0101010101010101 selecting lower half
+; k4			; digit flags QL3
+; k3			; digit flags QL2
+; k2			; digit flags QL1
+; k1			; digit flags QL0
+; k0			;
+
+; zmm31			; CONST ENCODE_SHUFFLE_TO_HEX
+; zmm30			; CONST BITMASK_LOWER_HALF (broadcast qword, ANDed with every unpacked vector)
+; zmm29			; CONST VPERM_ENCODE_OFFSETS
+; zmm28			; CONST ENCODE_SHUFFLE_TO_HIGH_LOW (loaded by hex_encode_avx512bw)
+; zmm27			; Unpack Upper RL5 RL7
+; zmm26			; Unpack Lower RL4 RL6
+; zmm25			; Unpack Upper RL1 RL3
+; zmm24			; Unpack Lower RL0 RL2
+; zmm23			; Source Load QLF
+; zmm22			; Source Load QLE
+; zmm21			; Source Load QLD
+; zmm20			; Source Load QLC
+; zmm19			; Source Load QLB
+; zmm18			; Source Load QLA
+; zmm17			; Source Load QL9
+; zmm16			; Source Load QL8
+
+; zmm15			; Source Load QL7
+; zmm14			; Source Load QL6
+; zmm13			; Source Load QL5
+; zmm12			; Source Load QL4
+; zmm11			; Source Load QL3
+; zmm10			; Source Load QL2
+; zmm9			; Source Load QL1
+; zmm8			; Source Load QL0
+; zmm7			; RL3
+; zmm6			; RL3
+; zmm5			; RL2
+; zmm4			; RL2
+; zmm3			;
+; zmm2			; RL1
+; zmm1			; CONST ALL bytes 48
+; zmm0			; RL0
+
+%define	NHALF_INP_BYTES_PER_ROUND	8*64
+%define	NINP_BYTES_PER_ROUND		2*NHALF_INP_BYTES_PER_ROUND
+%define	NINP_BITSHIFT				10
+
+hex_encode_avx512bw:
+; AVX512BW hex encoder.  In (System V): rdi = output buffer, rsi = input buffer, rdx = number of
+; input bytes; the __WIN__ block below remaps rcx/rdx/r8 accordingly.  Out: rax = number of input
+; bytes reported as processed.  One loop pass consumes NINP_BYTES_PER_ROUND input bytes and stores
+; twice as many hex digits; the next round's input is always loaded one round ahead.
+; Review fixes: rax is initialised before its first use, output stores are unaligned (VMOVDQU64),
+; pointer compares are unsigned, and VZEROUPPER is issued before returning.
+	sub			rsp,STACK_ADJ
+	mov			[rsp+STACK_FOR_XMM+0*8],rdi
+	mov			[rsp+STACK_FOR_XMM+1*8],rsi
+	mov			[rsp+STACK_FOR_XMM+2*8],r12
+	mov			[rsp+STACK_FOR_XMM+3*8],r14
+	mov			[rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+	VMOVDQA		[rsp     ],xmm6
+	VMOVDQA		[rsp+1*16],xmm7
+	VMOVDQA		[rsp+2*16],xmm8
+	VMOVDQA		[rsp+3*16],xmm9
+	VMOVDQA		[rsp+4*16],xmm10
+	VMOVDQA		[rsp+5*16],xmm11
+	VMOVDQA		[rsp+6*16],xmm12
+	VMOVDQA		[rsp+7*16],xmm13
+	VMOVDQA		[rsp+8*16],xmm14
+	VMOVDQA		[rsp+9*16],xmm15
+	mov			rdi,rcx							; parameter 1 output buffer
+	mov			rsi,rdx							; parameter 2 input buffer
+	mov			rdx,r8							; parameter 3 number of elements
+%endif
+
+	mov			rax,NHALF_INP_BYTES_PER_ROUND	; half-round stride; must be valid before the first "add rsi,rax"
+	VMOVDQA64	zmm29,[VPERM_ENCODE_OFFSETS]		; p_23__ l3
+
+;; initializer for QQ0 and QQ1
+
+	VPERMQ		zmm8 ,zmm29,[rsi+0*64]				; AAA p____5 l3+ QL0 RL00,RL01
+	VPERMQ		zmm9 ,zmm29,[rsi+1*64]				; AAA p____5 l3+ QL1 RL02,RL03
+	VPERMQ		zmm10,zmm29,[rsi+2*64]				; AAA p____5 l3+ QL2 RL04,RL05
+	VPERMQ		zmm11,zmm29,[rsi+3*64]				; AAA p____5 l3+ QL3 RL06,RL07
+
+	VPERMQ		zmm12,zmm29,[rsi+4*64]				; BBB p____5 l3+ QL4 RL08,RL09
+	VPERMQ		zmm13,zmm29,[rsi+5*64]				; BBB p____5 l3+ QL5 RL10,RL11
+	VPERMQ		zmm14,zmm29,[rsi+6*64]				; BBB p____5 l3+ QL6 RL12,RL13
+	VPERMQ		zmm15,zmm29,[rsi+7*64]				; BBB p____5 l3+ QL7 RL14,RL15
+
+	add 		rsi,rax								; advance by half a round (rax was preset above) to reach the CCC/DDD input
+
+	VPERMQ		zmm16,zmm29,[rsi+0*64]				; CCC p____5 l3+ QL8 RL16,RL17
+	VPERMQ		zmm17,zmm29,[rsi+1*64]				; CCC p____5 l3+ QL9 RL18,RL19
+	VPERMQ		zmm18,zmm29,[rsi+2*64]				; CCC p____5 l3+ QLA RL20,RL21
+	VPERMQ		zmm19,zmm29,[rsi+3*64]				; CCC p____5 l3+ QLB RL22,RL23
+
+	VPERMQ		zmm20,zmm29,[rsi+4*64]				; DDD p____5 l3+ QLC RL24,RL25
+	VPERMQ		zmm21,zmm29,[rsi+5*64]				; DDD p____5 l3+ QLD RL26,RL27
+	VPERMQ		zmm22,zmm29,[rsi+6*64]				; DDD p____5 l3+ QLE RL28,RL29
+	VPERMQ		zmm23,zmm29,[rsi+7*64]				; DDD p____5 l3+ QLF RL30,RL31
+
+;; initialize constants
+
+	KMOVQ		k7,[BITMASK_ONE_ZERO]				; NOTE(review): k7 is not referenced again in this routine
+
+	VMOVDQA64 	zmm31,[ENCODE_SHUFFLE_TO_HEX]		; p_23__ l3
+	VMOVDQA64	zmm1,zmm31							; NOTE(review): overwritten by the first VPUNPCKHBW before it is read
+
+	VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF]			; p_23__ l3
+
+
+	VMOVDQA64 	zmm28,[ENCODE_SHUFFLE_TO_HIGH_LOW]	; p_23__ l3 NOTE(review): zmm28 is not referenced again in this routine
+
+;; do page overshoot checks
+
+												; rax already holds NHALF_INP_BYTES_PER_ROUND (set before the preload above)
+												; NOTE(review): rsi is start + half a round here, so r9/r10/r12 share that bias - confirm intended
+
+	mov			r9,rdx							; exact requested number of elements to process
+	add			r9,rsi							; r9 last valid pointer +1 of requested input buffer
+
+	mov			r10,rsi							; r10 saved start of input buffer
+	mov			r12,r9							; r12 save of end of input buffer+1
+
+	lea			rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	lea			r8,[rdx+NINP_BYTES_PER_ROUND-1]
+	shr			r8,NINP_BITSHIFT				; number of loops
+	shl			r8,NINP_BITSHIFT
+	add			r8,rsi							; r8 address of last byte+1 read in complete loops
+	add			r8,NINP_BYTES_PER_ROUND-1		; r8 address of last byte read in normal loop with overshoot
+
+	mov			r11,r8
+
+; DISABLED for NO OVERSHOOT
+;	add			r11,rax							; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+	sub			r11,rax							; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+	sub			r11,rax							; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+	sub			r11,rax							; r11 address of last byte of prefetched data (ONLY HALF A ROUND!)
+
+	shr			rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte after normal round
+	shr			r8,NSHIFT_ADDRESS_TO_PAGE		; r8  page number of last byte after prefetch
+
+	cmp			rcx,r8							; stay on same page
+	je			.LSAME_PAGE_IN_ROUND
+	sub			rdx,rax							; don't overshoot in reading: do one round less (first half)
+	sub			rdx,rax							; don't overshoot in reading: do one round less (second half)
+
+.LSAME_PAGE_IN_ROUND:
+	shr			r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp			rcx,r11
+	je			.LSAME_PAGE_IN_PREFETCH
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less (first half)
+	sub			rdx,rax							; don't overshoot in prefetch reading: do one round less (second half)
+
+.LSAME_PAGE_IN_PREFETCH:
+	add			rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+												; due to prefetch add one round to end checks
+	add			rdx,rax
+	add			r9,rax
+
+	mov			r11,rdi							; r11 saved start of output buffer
+
+	mov			rcx,NHALF_INP_BYTES_PER_ROUND<<1	; increment of output buffer for each half round
+
+
+;; start preprocessing before loop
+
+	VPSRLQ		zmm2,zmm8,4							; AAA RL00+RL01 QL0 shift Hx to lower nibble in byte
+	VPSRLQ		zmm3,zmm9,4							; AAA RL02+RL03 QL1 shift Hx to lower nibble in byte
+
+	VPUNPCKLBW	zmm0,zmm2,zmm8						; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPUNPCKHBW	zmm1,zmm2,zmm8						; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSRLQ		zmm6,zmm10,4						; AAA RL04+RL05 QL2 shift Hx to lower nibble in byte
+	VPSRLQ		zmm7,zmm11,4						; AAA RL06+RL07 QL3 shift Hx to lower nibble in byte
+
+	VPUNPCKLBW	zmm2,zmm3,zmm9						; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPUNPCKHBW	zmm3,zmm3,zmm9						; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add 		rsi,rax								; 						add half the number of processed array elements
+
+	align		32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+; Vector Port info AVX512
+; ----------------------------------------
+; VPShift			p0		l1
+; VPMax/Min			p0		l1
+; VPMUL				p0		l5		; with 2FMA-Units p05 (SKX,CLX etc.)
+; VPMOVB2M			p0		l3
+; VPSUBUSB /SSB		p0		l1
+
+; VPALIGNR			p5		l1				;Shift of n*8 bits!
+; VPERM				p5		l3
+; VPERMI2x		1*p05+2*p5	l7 		; (l9 with flags)
+; VPCompare			p5		l3-l4
+; VP Pack/Unpack	p5		l1(SKX) l3(TGL)
+; VPSHUF			p5		l1
+
+
+.LHEXENCODE_LOOP:
+
+;; AAA+BBB
+; process unpacked AAA (QL0-QL3=RL00-RL07) in zmm0-zmm7 and process BBB (QL4-QL7=RL08-RL15) in zmm8-zmm15 and zmm2+zmm3
+
+	VPUNPCKLBW	zmm4,zmm6,zmm10						; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm0,zmm0,zmm30						; AAA RL00 mask lower nibble
+
+	VPUNPCKHBW	zmm5,zmm6,zmm10						; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPSRLQ		zmm10,zmm12,4						; BBB RL08+RL09 QL4 shift Hx to lower nibble in byte
+	VPUNPCKLBW	zmm8,zmm10,zmm12					; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm1,zmm1,zmm30						; AAA RL01 mask lower nibble
+
+	VPUNPCKLBW	zmm6,zmm7,zmm11						; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm2,zmm2,zmm30						; AAA RL02 mask lower nibble
+	VPUNPCKHBW	zmm7,zmm7,zmm11						; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPSRLQ		zmm11,zmm13,4						; BBB RL10+RL11 QL5 shift Hx to lower nibble in byte
+	VPANDQ		zmm8,zmm8,zmm30						; BBB RL08 mask lower nibble
+
+	VPUNPCKHBW	zmm9,zmm10,zmm12					; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPANDQ		zmm3,zmm3,zmm30						; AAA RL03 mask lower nibble
+	VPUNPCKLBW	zmm10,zmm11,zmm13					; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm9,zmm9,zmm30						; BBB RL09 mask lower nibble
+
+	VPSHUFB		zmm0,zmm31,zmm0						; AAA RL00 shuffle_to_hex_digits
+	VPANDQ		zmm4,zmm4,zmm30						; AAA RL04 mask lower nibble
+	VPUNPCKHBW	zmm11,zmm11,zmm13					; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPANDQ		zmm10,zmm10,zmm30					; BBB RL10 mask lower nibble
+
+	VPSHUFB		zmm1,zmm31,zmm1						; AAA RL01 shuffle_to_hex_digits
+	VPANDQ		zmm5,zmm5,zmm30						; AAA RL05 mask lower nibble
+	VPSHUFB		zmm8,zmm31,zmm8						; BBB RL08 shuffle_to_hex_digits
+	VPANDQ		zmm11,zmm11,zmm30					; BBB RL11 mask lower nibble
+
+	VPSHUFB		zmm2,zmm31,zmm2						; AAA RL02 shuffle_to_hex_digits
+	VMOVDQU64	[rdi+0*64],zmm0						; AAA RL00 Store Hexdump (unaligned: output buffer alignment is not guaranteed)
+	VMOVDQU64	[rdi+1*64],zmm1						; AAA RL01 Store Hexdump
+	VPANDQ		zmm6,zmm6,zmm30						; AAA RL06 mask lower nibble
+	VPSHUFB		zmm9,zmm31,zmm9						; BBB RL09 shuffle_to_hex_digits
+
+	VPSHUFB		zmm3,zmm31,zmm3						; AAA RL03 shuffle_to_hex_digits
+	VPANDQ		zmm7,zmm7,zmm30						; AAA RL07 mask lower nibble
+	VMOVDQU64	[rdi+2*64],zmm2						; AAA RL02 Store Hexdump
+	VPSRLQ		zmm2,zmm14,4						; BBB RL12+RL13 QL6 shift Hx to lower nibble in byte
+	VPSHUFB		zmm10,zmm31,zmm10					; BBB RL10 shuffle_to_hex_digits
+	VMOVDQU64	[rdi+3*64],zmm3						; AAA RL03 Store Hexdump
+
+	VPSRLQ		zmm3,zmm15,4						; BBB RL14+RL15 QL7 shift Hx to lower nibble in byte
+	VPUNPCKLBW	zmm12,zmm2,zmm14					; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPSHUFB		zmm4,zmm31,zmm4						; AAA RL04 shuffle_to_hex_digits
+	VMOVDQU64	[rdi+4*64],zmm4						; AAA RL04 Store Hexdump
+	VPSHUFB		zmm11,zmm31,zmm11					; BBB RL11 shuffle_to_hex_digits
+	VPUNPCKHBW	zmm13,zmm2,zmm14					; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSHUFB		zmm5,zmm31,zmm5						; AAA RL05 shuffle_to_hex_digits
+	VPANDQ		zmm12,zmm12,zmm30					; BBB RL12 mask lower nibble
+	VPUNPCKLBW	zmm14,zmm3,zmm15					; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VMOVDQU64	[rdi+5*64],zmm5						; AAA RL05 Store Hexdump
+
+	VPSHUFB		zmm6,zmm31,zmm6						; AAA RL06 shuffle_to_hex_digits
+	VPANDQ		zmm13,zmm13,zmm30					; BBB RL13 mask lower nibble
+	VPUNPCKHBW	zmm15,zmm3,zmm15					; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSHUFB		zmm7,zmm31,zmm7						; AAA RL07 shuffle_to_hex_digits
+	VPANDQ		zmm14,zmm14,zmm30					; BBB RL14 mask lower nibble
+	VMOVDQU64	[rdi+6*64],zmm6						; AAA RL06 Store Hexdump
+	VMOVDQU64	[rdi+7*64],zmm7						; AAA RL07 Store Hexdump
+	VPSHUFB		zmm12,zmm31,zmm12					; BBB RL12 shuffle_to_hex_digits
+	VPANDQ		zmm15,zmm15,zmm30					; BBB RL15 mask lower nibble
+;
+	VMOVDQU64	[rdi+8*64],zmm8						; BBB RL08 Store Hexdump
+	VPERMQ		zmm8 ,zmm29,[rsi+0*64]				; AAA p____5 l3+ QL0 RL00,RL01
+	VMOVDQU64	[rdi+9*64],zmm9						; BBB RL09 Store Hexdump
+	VPERMQ		zmm9 ,zmm29,[rsi+1*64]				; AAA p____5 l3+ QL1 RL02,RL03
+	VPSHUFB		zmm13,zmm31,zmm13					; BBB RL13 shuffle_to_hex_digits
+
+	VMOVDQU64	[rdi+10*64],zmm10					; BBB RL10 Store Hexdump
+	VPERMQ		zmm10,zmm29,[rsi+2*64]				; AAA p____5 l3+ QL2 RL04,RL05
+	VPSRLQ		zmm2,zmm16,4						; CCC RL16+RL17 QL8 shift Hx to lower nibble in byte
+	VMOVDQU64	[rdi+11*64],zmm11					; BBB RL11 Store Hexdump
+	VPERMQ		zmm11,zmm29,[rsi+3*64]				; AAA p____5 l3+ QL3 RL06,RL07
+
+	VPSHUFB		zmm14,zmm31,zmm14					; BBB RL14 shuffle_to_hex_digits
+	VPSRLQ		zmm3,zmm17,4						; CCC RL18+RL19 QL9 shift Hx to lower nibble in byte
+	VPSHUFB		zmm15,zmm31,zmm15					; BBB RL15 shuffle_to_hex_digits
+
+	VPUNPCKLBW	zmm0,zmm2,zmm16						; CCC RL16 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPSRLQ		zmm6,zmm18,4						; CCC RL20+RL21 QLA shift Hx to lower nibble in byte
+	VMOVDQU64	[rdi+12*64],zmm12					; BBB RL12 Store Hexdump
+	VPERMQ		zmm12,zmm29,[rsi+4*64]				; BBB p____5 l3+ QL4 RL08,RL09
+
+	VPUNPCKHBW	zmm1,zmm2,zmm16						; CCC RL17 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPSRLQ		zmm7,zmm19,4						; CCC RL22+RL23 QLB shift Hx to lower nibble in byte
+	VMOVDQU64	[rdi+13*64],zmm13					; BBB RL13 Store Hexdump
+	VPERMQ		zmm13,zmm29,[rsi+5*64]				; BBB p____5 l3+ QL5 RL10,RL11
+
+	VMOVDQU64	[rdi+14*64],zmm14					; BBB RL14 Store Hexdump
+	VPERMQ		zmm14,zmm29,[rsi+6*64]				; BBB p____5 l3+ QL6 RL12,RL13
+	VPUNPCKLBW	zmm2,zmm3,zmm17						; CCC RL18 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VMOVDQU64	[rdi+15*64],zmm15					; BBB RL15 Store Hexdump
+
+	add 		rdi,rcx								; 						add half the number of processed output bytes
+
+	VPERMQ		zmm15,zmm29,[rsi+7*64]				; BBB p____5 l3+ QL7 RL14,RL15
+	VPUNPCKHBW	zmm3,zmm3,zmm17						; CCC RL19 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; CCC+DDD
+; process unpacked CCC (QL8-QLB=RL16-RL23) in zmm0-zmm7 and process DDD (QLC-QLF=RL24-RL31) in zmm16-zmm23 and zmm2+zmm3
+	add 		rsi,rax								; 						add half the number of processed array elements
+
+	VPUNPCKLBW	zmm4,zmm6,zmm18						; CCC RL20 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm0,zmm0,zmm30						; CCC RL16 mask lower nibble
+
+	VPUNPCKHBW	zmm5,zmm6,zmm18						; CCC RL21 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPSRLQ		zmm18,zmm20,4						; DDD RL24+RL25 QLC shift Hx to lower nibble in byte
+	VPUNPCKLBW	zmm16,zmm18,zmm20					; DDD RL24 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm1,zmm1,zmm30						; CCC RL17 mask lower nibble
+
+	VPUNPCKLBW	zmm6,zmm7,zmm19						; CCC RL22 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm2,zmm2,zmm30						; CCC RL18 mask lower nibble
+	VPUNPCKHBW	zmm7,zmm7,zmm19						; CCC RL23 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPSRLQ		zmm19,zmm21,4						; DDD RL26+RL27 QLD shift Hx to lower nibble in byte
+	VPANDQ		zmm16,zmm16,zmm30					; DDD RL24 mask lower nibble
+
+	VPUNPCKHBW	zmm17,zmm18,zmm20					; DDD RL25 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPANDQ		zmm3,zmm3,zmm30						; CCC RL19 mask lower nibble
+	VPUNPCKLBW	zmm18,zmm19,zmm21					; DDD RL26 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPANDQ		zmm17,zmm17,zmm30					; DDD RL25 mask lower nibble
+
+	VPSHUFB		zmm0,zmm31,zmm0						; CCC RL16 shuffle_to_hex_digits
+	VPANDQ		zmm4,zmm4,zmm30						; CCC RL20 mask lower nibble
+	VPUNPCKHBW	zmm19,zmm19,zmm21					; DDD RL27 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPANDQ		zmm18,zmm18,zmm30					; DDD RL26 mask lower nibble
+
+	VPSHUFB		zmm1,zmm31,zmm1						; CCC RL17 shuffle_to_hex_digits
+	VPANDQ		zmm5,zmm5,zmm30						; CCC RL21 mask lower nibble
+	VPSHUFB		zmm16,zmm31,zmm16					; DDD RL24 shuffle_to_hex_digits
+	VPANDQ		zmm19,zmm19,zmm30					; DDD RL27 mask lower nibble
+
+	VPSHUFB		zmm2,zmm31,zmm2						; CCC RL18 shuffle_to_hex_digits
+	VMOVDQU64	[rdi+0*64],zmm0						; CCC RL16 Store Hexdump
+	VMOVDQU64	[rdi+1*64],zmm1						; CCC RL17 Store Hexdump
+	VPANDQ		zmm6,zmm6,zmm30						; CCC RL22 mask lower nibble
+	VPSHUFB		zmm17,zmm31,zmm17					; DDD RL25 shuffle_to_hex_digits
+
+	VPSHUFB		zmm3,zmm31,zmm3						; CCC RL19 shuffle_to_hex_digits
+	VPANDQ		zmm7,zmm7,zmm30						; CCC RL23 mask lower nibble
+	VMOVDQU64	[rdi+2*64],zmm2						; CCC RL18 Store Hexdump
+	VPSRLQ		zmm2,zmm22,4						; DDD RL28+RL29 QLE shift Hx to lower nibble in byte
+	VPSHUFB		zmm18,zmm31,zmm18					; DDD RL26 shuffle_to_hex_digits
+	VMOVDQU64	[rdi+3*64],zmm3						; CCC RL19 Store Hexdump
+
+	VPSRLQ		zmm3,zmm23,4						; DDD RL30+RL31 QLF shift Hx to lower nibble in byte
+	VPUNPCKLBW	zmm20,zmm2,zmm22					; DDD RL28 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPSHUFB		zmm4,zmm31,zmm4						; CCC RL20 shuffle_to_hex_digits
+	VMOVDQU64	[rdi+4*64],zmm4						; CCC RL20 Store Hexdump
+	VPSHUFB		zmm19,zmm31,zmm19					; DDD RL27 shuffle_to_hex_digits
+	VPUNPCKHBW	zmm21,zmm2,zmm22					; DDD RL29 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSHUFB		zmm5,zmm31,zmm5						; CCC RL21 shuffle_to_hex_digits
+	VPANDQ		zmm20,zmm20,zmm30					; DDD RL28 mask lower nibble
+	VPUNPCKLBW	zmm22,zmm3,zmm23					; DDD RL30 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VMOVDQU64	[rdi+5*64],zmm5						; CCC RL21 Store Hexdump
+
+	VPSHUFB		zmm6,zmm31,zmm6						; CCC RL22 shuffle_to_hex_digits
+	VPANDQ		zmm21,zmm21,zmm30					; DDD RL29 mask lower nibble
+	VPUNPCKHBW	zmm23,zmm3,zmm23					; DDD RL31 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	VPSHUFB		zmm7,zmm31,zmm7						; CCC RL23 shuffle_to_hex_digits
+	VPANDQ		zmm22,zmm22,zmm30					; DDD RL30 mask lower nibble
+	VMOVDQU64	[rdi+6*64],zmm6						; CCC RL22 Store Hexdump
+	VMOVDQU64	[rdi+7*64],zmm7						; CCC RL23 Store Hexdump
+	VPSHUFB		zmm20,zmm31,zmm20					; DDD RL28 shuffle_to_hex_digits
+	VPANDQ		zmm23,zmm23,zmm30					; DDD RL31 mask lower nibble
+;
+	VMOVDQU64	[rdi+8*64],zmm16					; DDD RL24 Store Hexdump
+	VPERMQ		zmm16,zmm29,[rsi+0*64]				; CCC p____5 l3+ QL8 RL16,RL17
+	VMOVDQU64	[rdi+9*64],zmm17					; DDD RL25 Store Hexdump
+	VPERMQ		zmm17,zmm29,[rsi+1*64]				; CCC p____5 l3+ QL9 RL18,RL19
+	VPSHUFB		zmm21,zmm31,zmm21					; DDD RL29 shuffle_to_hex_digits
+
+	VMOVDQU64	[rdi+10*64],zmm18					; DDD RL26 Store Hexdump
+	VPERMQ		zmm18,zmm29,[rsi+2*64]				; CCC p____5 l3+ QLA RL20,RL21
+	VPSRLQ		zmm2,zmm8,4							; AAA RL00+RL01 QL0 shift Hx to lower nibble in byte
+	VMOVDQU64	[rdi+11*64],zmm19					; DDD RL27 Store Hexdump
+	VPERMQ		zmm19,zmm29,[rsi+3*64]				; CCC p____5 l3+ QLB RL22,RL23
+
+	VPSHUFB		zmm22,zmm31,zmm22					; DDD RL30 shuffle_to_hex_digits
+	VPSRLQ		zmm3,zmm9,4							; AAA RL02+RL03 QL1 shift Hx to lower nibble in byte
+	VPSHUFB		zmm23,zmm31,zmm23					; DDD RL31 shuffle_to_hex_digits
+
+	VPUNPCKLBW	zmm0,zmm2,zmm8						; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VPSRLQ		zmm6,zmm10,4						; AAA RL04+RL05 QL2 shift Hx to lower nibble in byte
+	VMOVDQU64	[rdi+12*64],zmm20					; DDD RL28 Store Hexdump
+	VPERMQ		zmm20,zmm29,[rsi+4*64]				; DDD p____5 l3+ QLC RL24,RL25
+
+	VPUNPCKHBW	zmm1,zmm2,zmm8						; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	VPSRLQ		zmm7,zmm11,4						; AAA RL06+RL07 QL3 shift Hx to lower nibble in byte
+	VMOVDQU64	[rdi+13*64],zmm21					; DDD RL29 Store Hexdump
+	VPERMQ		zmm21,zmm29,[rsi+5*64]				; DDD p____5 l3+ QLD RL26,RL27
+
+	VMOVDQU64	[rdi+14*64],zmm22					; DDD RL30 Store Hexdump
+	VPERMQ		zmm22,zmm29,[rsi+6*64]				; DDD p____5 l3+ QLE RL28,RL29
+	VPUNPCKLBW	zmm2,zmm3,zmm9						; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	VMOVDQU64	[rdi+15*64],zmm23					; DDD RL31 Store Hexdump
+	VPERMQ		zmm23,zmm29,[rsi+7*64]				; DDD p____5 l3+ QLF RL30,RL31
+
+	add 		rsi,rax								; 						add half the number of processed array elements
+
+	VPUNPCKHBW	zmm3,zmm3,zmm9						; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add 		rdi,rcx								; 						add half the number of processed output bytes
+
+	cmp			rsi,rdx								; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jb			.LHEXENCODE_LOOP					; unsigned: both operands are addresses
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	test		rax,rax							; rax == 0 marks that the extra round has already run
+	je			.LFINISH_EXTRA
+
+	cmp			rdx,r9							; input buffer length was not reduced when equal
+	je			.LFINISH_NORMAL
+
+	add			rax,rax							; rax is only half the bytes of input round, so double it
+	sub			rsi,rax							; for prefetching the last round, load the last round again
+	sub			rdx,rax							; adapt end condition for last round also
+	xor			eax,eax							; stride 0: the extra pass must not advance rsi
+	jmp			.LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+	add			rsi,NINP_BYTES_PER_ROUND		; add the extra round to get processed bytes
+	jmp 		.LFINISH
+
+.LFINISH_NORMAL:
+	sub			rsi,NINP_BYTES_PER_ROUND		; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+												; r12 = address of requested input bytes+1 (saved copy of r9)
+												; rsi = address of processed input bytes+1
+												; now get the minimum of r12,rsi to rax
+;;	sub			rsi,rax							; for last round do nothing (rax=0), else sub increment for one round
+;;	sub			r9,rax
+
+	mov			rax,r12
+	cmp			rsi,r12							; get min from r12 (address of requested input) and rsi (address of done input)
+
+	jae			.LCALC_PROCESSED_BYTES			; unsigned: both operands are addresses
+	mov 		rax,rsi							; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+	sub			rax,r10							; sub the input buffer start address
+												; rax = number of valid processed input bytes = return value
+
+	cmp			rsi,rdx							; compare rdx (loop end pointer) and rsi (address of done input)
+	je			.LNO_ZERO_OUT
+
+	mov			r15,rax							; number of elements to process
+
+	shl			r15,1							; number of output bytes
+
+	add			r15,r11							; pointer to next byte after full valid output buffer
+
+												; NOTE(review): r15/zmm0 are unused while the ;ZERO store below stays disabled
+	VPXORQ		zmm0,zmm0,zmm0						; all zero
+;ZERO	VMOVDQU64	[r15],zmm0							; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+	VZEROUPPER									; clear upper vector state before returning to non-AVX code
+%ifdef __WIN__
+
+	VMOVDQA		xmm6 ,[rsp     ]
+	VMOVDQA		xmm7 ,[rsp+1*16]
+	VMOVDQA		xmm8 ,[rsp+2*16]
+	VMOVDQA		xmm9 ,[rsp+3*16]
+	VMOVDQA		xmm10,[rsp+4*16]
+	VMOVDQA		xmm11,[rsp+5*16]
+	VMOVDQA		xmm12,[rsp+6*16]
+	VMOVDQA		xmm13,[rsp+7*16]
+	VMOVDQA		xmm14,[rsp+8*16]
+	VMOVDQA		xmm15,[rsp+9*16]
+
+%endif
+
+	mov			rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov			rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov			r12,[rsp+STACK_FOR_XMM+2*8]
+	mov			r14,[rsp+STACK_FOR_XMM+3*8]
+	mov			r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add			rsp,STACK_ADJ
+
+	ret
+
+;----------------------------------------------------------------------------------------------
+
+%endif
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/varlena.c b/postgresql-15devel/src/backend/utils/adt/varlena.c
index bd3091b..183f67f 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/varlena.c
+++ b/postgresql-15devel/src/backend/utils/adt/varlena.c
@@ -397,7 +397,7 @@ byteaout(PG_FUNCTION_ARGS)
 	if (bytea_output == BYTEA_OUTPUT_HEX)
 	{
 		/* Print hex format */
-		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
+		rp = result = palloc(hex_enc_len(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena)) + 2 + 1);
 		*rp++ = '\\';
 		*rp++ = 'x';
 		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
diff --git a/postgresql-15devel_orig/src/include/utils/builtins.h b/postgresql-15devel/src/include/utils/builtins.h
index b07eefa..e6efb73 100644
--- a/postgresql-15devel_orig/src/include/utils/builtins.h
+++ b/postgresql-15devel/src/include/utils/builtins.h
@@ -35,6 +35,9 @@ extern int	errdomainconstraint(Oid datatypeOid, const char *conname);
 extern uint64 hex_encode(const char *src, size_t len, char *dst);
 extern uint64 hex_decode(const char *src, size_t len, char *dst);
 
+extern uint64 hex_enc_len(const char *src, size_t srclen);
+extern uint64 hex_dec_len(const char *src, size_t srclen);
+
 /* int.c */
 extern int2vector *buildint2vector(const int16 *int2s, int n);