diff --git a/postgresql-15devel_orig/src/Makefile.global.in b/postgresql-15devel/src/Makefile.global.in
index 05c54b2..ea5c785 100644
--- a/postgresql-15devel_orig/src/Makefile.global.in
+++ b/postgresql-15devel/src/Makefile.global.in
@@ -270,6 +270,10 @@ LLVM_CPPFLAGS = @LLVM_CPPFLAGS@
 LLVM_CFLAGS = @LLVM_CFLAGS@
 LLVM_CXXFLAGS = @LLVM_CXXFLAGS@
 
+# TODO: nasm and the output format handed to "nasm -f" are hard-coded here; both should be detected by configure
+NASM = nasm
+NASMFLAGS = elf64
+
 # Kind-of compilers
 
 BISON = @BISON@
@@ -782,6 +786,10 @@ endif
 %.bz2: %
 	$(BZIP2) -c $< >$@
 
+%.o: %.asm
+	$(NASM) -f $(NASMFLAGS) -g -o $@ $<
+
+
 # Direct builds of foo.c -> foo are disabled to avoid generating
 # *.dSYM junk on Macs.  All builds should normally go through the
 # foo.c -> foo.o -> foo steps.  This also ensures that dependency
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/Makefile b/postgresql-15devel/src/backend/utils/adt/Makefile
index 41b486b..fa74e69 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/Makefile
+++ b/postgresql-15devel/src/backend/utils/adt/Makefile
@@ -25,6 +25,7 @@ OBJS = \
 	bool.o \
 	cash.o \
 	char.o \
+	cpu_capabilities_x86_64.o \
 	cryptohashfuncs.o \
 	date.o \
 	datetime.o \
@@ -42,6 +43,7 @@ OBJS = \
 	geo_ops.o \
 	geo_selfuncs.o \
 	geo_spgist.o \
+	hex_x86_64.o \
 	inet_cidr_ntop.o \
 	inet_net_pton.o \
 	int.o \
diff --git a/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm b/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm
new file mode 100644
index 0000000..bcb7db3
--- /dev/null
+++ b/postgresql-15devel/src/backend/utils/adt/cpu_capabilities_x86_64.asm
@@ -0,0 +1,630 @@
+%ifdef __NASM_MAJOR__
+%ifdef COMPILE_C_STYLE_COMMENTS
+/*-------------------------------------------------------------------------
+ *
+ * cpu_capabilities_x86_64.asm
+ *	  Assembler routines for fetching the cpu_capabilities in a convenient int64
+ *	  and selecting the maximum possible implementation for all valid algorithms
+ *
+ * Copyright (c) 2021-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/cpu_capabilities_x86_64.asm
+ *
+ *-------------------------------------------------------------------------
+ */
+%endif
+
+
+
+
+
+; cpu_capabilities_x86_64.asm
+; Assembler routines for fetching the CPU capabilities into one 64-bit word (get_instr_info)
+; and selecting the maximum possible implementation id per algorithm (apply_cpu_capabilities) on Intel X64
+
+
+; nasm -f WIN64 -g cpu_capabilities_x86_64.asm -l cpu_capabilities_x86_64.lis
+
+; golink /console hexdump.obj cpu_capabilities_x86_64.obj hex_x86_64.obj base64_x86_64.obj /files
+
+; Linux register order: 	%rdi, %rsi, %rdx, %rcx, %r8 and %r9
+; Windows register order:	rcx, rdx, r8, r9
+
+; Windows non volatile registers:	rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
+; Linux non volatile registers:		rbx,rbp, rsp, r12,r13,r14,r15
+
+; nasm -f elf64 -g cpu_capabilities_x86_64.asm -l cpu_capabilities_x86_64_elf64.lis
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define __WIN__ 1
+%elifidn __OUTPUT_FORMAT__, elf64
+%define __ELF__ 1
+%endif
+
+
+global apply_cpu_capabilities
+
+global get_instr_info
+
+;;global get_highest_impl_id
+
+
+
+default rel
+
+section .rdata align=64
+
+; these are the id defines for the different algorithms implemented or planned;
+; every implementation knows its own IMPL_ID, which should never change
+;
+
+%define ALGORITHM_ID_HEX_ENCODE 0
+%define ALGORITHM_ID_HEX_DECODE 1
+%define ALGORITHM_ID_BASE64_ENCODE 2
+%define ALGORITHM_ID_BASE64_DECODE 3
+%define ALGORITHM_ID_CECKSUM 4
+%define ALGORITHM_ID_CECKSUM_COPY 5
+
+
+; layout of the capability word built by get_instr_info: CPU_IS_ARCH_X86_64 is
+; loaded as a value (bit 0), every CPU_HAS_* define is a bit position in that word
+%define CPU_IS_ARCH_X86_64 1
+%define CPU_HAS_SSE2 8
+%define CPU_HAS_SSE3 9
+%define CPU_HAS_SSSE3 10
+%define CPU_HAS_SSE4_1 11
+%define CPU_HAS_SSE4_2 12
+%define CPU_HAS_AVX 13
+%define CPU_HAS_F16C 14
+%define CPU_HAS_AVX2 15
+%define CPU_HAS_AVX512_F 16
+%define CPU_HAS_AVX512_VL 17
+%define CPU_HAS_AVX512_DQ 18
+%define CPU_HAS_AVX512_BW 19
+%define CPU_HAS_AVX512_IFMA 20
+%define CPU_HAS_AVX512_VBMI 21
+%define CPU_HAS_AVX512_VBMI2 22
+%define CPU_HAS_AVX512_VNNI 23 +%define CPU_HAS_AVX512_BITALG 24 +%define CPU_HAS_AVX512_VPOPCNTDQ 25 +%define CPU_HAS_AVX512_VP2INTERSECT 26 +%define CPU_HAS_AVX512_FP16 27 +%define CPU_HAS_AMX_TILE 28 +%define CPU_HAS_AMX_BF16 29 +%define CPU_HAS_AMX_INT8 31 + + +REQUIREMENTS_ARR: +HEX_ENC_CPU_REQUIREMENTS_ARR: + dq 0 + dq CPU_IS_ARCH_X86_64 + (1< goto end + je .skip_algorithm + + lea rax,[CPU_REQUIREMENTS_OFFS_ARR_X86_64] ; start offset of requirement_arr of current algorithm + mov rax,[rax+8*r15] ; start offset of requirement_arr of current algorithm + lea r10,[REQUIREMENTS_ARR] ; r10 pointer to requirement_arr for current algorithm + add r10,rax + +.check_requirements: + mov rax,r9 ; rax temp for current capabilities + and rax,[8*rcx+r10] + cmp rax,[8*rcx+r10] + je .max_index_found + sub rcx,1 + jnz .check_requirements + +.max_index_found: + mov rax,r8 ; rax temp for bitmask of current algorithm + test rax,rdx + jnz .skip_algorithm + mov [rsi+8*r15],rcx + + +.skip_algorithm: + add r8,r8 ; shift bitmask of current algorithm 1 to the right + add r15,1 + jmp .loop_algorithm + +.end_loop_algorithm: + +.return: + + mov rdi,[rsp+0*8] + mov rsi,[rsp+1*8] + mov r9 ,[rsp+2*8] + mov r15,[rsp+3*8] + + add rsp,STACK_ADJ + + ret + +;---------------------------------------------------------------------------------------------- + + ; CPUID Input EAX=01h + ; Feature Information Returned in the ECX Register (according to Intel Instruction Manual) + ;ECX bit + ;-> 0 SSE3 Streaming SIMD Extensions 3 + ; 1 PCLMULQDQ + ; 2 DTES64 64-bit DS Area. + ; 3 MONITOR MONITOR/MWAIT. + ; 4 DS-CPL CPL Qualified Debug Store. + ; 5 VMX Virtual Machine Extensions. + ; 6 SMX Safer Mode Extensions. + ; 7 EIST Enhanced Intel SpeedStep® technology. + ; 8 TM2 Thermal Monitor 2. + ;-> 9 SSSE3 + ; 10 CNXT-ID L1 Context ID. + ; 11 SDBG + ; 12 FMA + ; 13 CMPXCHG16B + ; 14 xTPR Update Control + ; 15 PDCM Perfmon and Debug Capability. + ; 16 Reserved + ; 17 PCID Process-context identifiers. 
+ ; 18 DCA + ;-> 19 SSE4_1 + ;-> 20 SSE4_2 + ; 21 x2APIC + ; 22 MOVBE + ; 23 POPCNT + ; 24 TSC-Deadline + ; 25 AESNI + ; 26 XSAVE + ; 27 OSXSAVE + ;-> 28 AVX + ;-> 29 F16C + ; 30 RDRAND + ; 31 Not Used + + ; CPUID Input EAX=01h + ; Feature Information Returned in the EDX Register (according to Intel Instruction Manual) + ; EDX bit + ; 0 FPU Floating Point Unit On-Chip. + ; 1 VME Virtual 8086 Mode Enhancements. + ; 2 DE Debugging Extensions. + ; 3 PSE Page Size Extension. + ; 4 TSC Time Stamp Counter. + ; 5 MSR Model Specific Registers RDMSR and WRMSR Instructions. + ; 6 PAE Physical Address Extension. + ; 7 MCE Machine Check Exception. + ; 8 CX8 CMPXCHG8B Instruction. Th + ; 9 APIC APIC On-Chip. + ; 10 Reserved + ; 11 SEP SYSENTER and SYSEXIT Instructions. + ; 12 MTRR Memory Type Range Registers + ; 13 PGE Page Global Bit. + ; 14 MCA Machine Check Architecture. + ; 15 CMOV Conditional Move Instructions. + ; 16 PAT Page Attribute Table. + ; 17 PSE-36 36-Bit Page Size Extension. + ; 18 PSN Processor Serial Number. + ; 19 CLFSH CLFLUSH Instruction. + ; 20 Reserved + ; 21 DS Debug Store. + ; 22 ACPI Thermal Monitor and Software Controlled Clock Facilities. + ; 23 MMX Intel MMX Technology. + ; 24 FXSR FXSAVE and FXRSTOR Instructions. + ; 25 SSE SSE. + ;-> 26 SSE2 SSE2. + ; 27 SS Self Snoop. + ; 28 HTT Max APIC IDs reserved field is Valid. + ; 29 TM Thermal Monitor. + ; 30 Reserved + ; 31 PBE Pending Break Enable. + ; + + ; CPUID Input EAX=07H + ; Feature Information returned in the EAX-EDX Registers (according to Intel Instruction Set extension Manual) + +; EBX bits + ; EBX Bit00: FSGSBASE. Supports RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE if 1. + ; EBX Bit01: IA32_TSC_ADJUST MSR is supported if 1. + ; EBX Bit02: SGX + ; EBX Bit03: BMI1 + ; EBX Bit04: HLE + ;-> EBX Bit05: Intel® AVX2 + ; EBX Bit06: FDP_EXCPTN_ONLY. x87 FPU Data Pointer updated only on x87 exceptions if 1. + ; EBX Bit07: SMEP. Supports Supervisor Mode Execution Protection if 1. 
+ ; EBX Bit08: BMI2 + ; EBX Bit09: Supports Enhanced REP MOVSB/STOSB if 1. + ; EBX Bit10: INVPCID + ; EBX Bit11: RTM + ; EBX Bit12: RDT-M. Supports Intel® Resource Director Technology (Intel® RDT) Monitoring capability if 1. + ; EBX Bit13: Deprecates FPU CS and FPU DS values if 1. + ; EBX Bit14: Intel® Memory Protection Extensions + ; EBX Bit15: RDT-A. Supports Intel® Resource Director Technology (Intel® RDT) Allocation capability if 1. + ;-> EBX Bit16: AVX512F + ;-> EBX Bit17: AVX512DQ + ; EBX Bit18: RDSEED + ; EBX Bit19: ADX + ; EBX Bit20: SMAP + ;-> EBX Bit21: AVX512_IFMA + ; EBX Bit22: Reserved + ; EBX Bit23: CLFLUSHOPT + ; EBX Bit24: CLWB + ; EBX Bit25: Intel Processor Trace + ; EBX Bit26: AVX512PF (Intel® Xeon Phi™ only.) + ; EBX Bit27: AVX512ER (Intel® Xeon Phi™ only.) + ; EBX Bit28: AVX512CD + ; EBX Bit29: SHA + ;-> EBX Bit30: AVX512BW + ;-> EBX Bit31: AVX512VL + + +; ECX bits + ; ECX Bit00: PREFETCHWT1 (Intel® Xeon Phi™ only.) + ;-> ECX Bit01: AVX512_VBMI + ; ECX Bit02: UMIP. Supports user-mode instruction prevention if 1. + ; ECX Bit03: PKU. Supports protection keys for user-mode pages if 1. + ; ECX Bit04: OSPKE. If 1, OS has set CR4.PKE to enable protection keys (and the RDPKRU/WRPKRU instructions). + ; ECX Bit05: WAITPKG + ;-> ECX Bit06: AVX512_VBMI2 + ; ECX Bit07: CET_SS. Supports CET shadow stack features if 1. + ; ECX Bit08: GFNI + ; ECX Bit09: VAES + ; ECX Bit10: VPCLMULQDQ + ;-> ECX Bit11: AVX512_VNNI + ;-> ECX Bit12: AVX512_BITALG + ; ECX Bit13: TME_EN. + ;-> ECX Bit14: AVX512_VPOPCNTDQ + ; ECX Bit15: Reserved + ; ECX Bit16: LA57. Supports 57-bit linear addresses and five-level paging if 1. + ; ECX Bits 21-17: The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode. + ; ECX Bit22: RDPID and IA32_TSC_AUX are available if 1. + ; ECX Bit23: KL. Supports Key Locker if 1. + ; ECX Bit24: Reserved + ; ECX Bit25: CLDEMOTE. Supports cache line demote if 1. + ; ECX Bit26: Reserved + ; ECX Bit27: MOVDIRI. Supports MOVDIRI if 1. 
+ ; ECX Bit28: MOVDIR64B. Supports MOVDIR64B if 1. + ; ECX Bit29: ENQCMD: Supports Enqueue Stores if 1. + ; ECX Bit30: SGX_LC. Supports SGX Launch Configuration if 1. + ; ECX Bit31: PKS. Supports protection keys for supervisor-mode pages if 1. + +; EDX bits + ; EDX Bits 01-00: Reserved + ; EDX Bit02: AVX512_4VNNIW (Intel® Xeon Phi™ only.) + ; EDX Bit03: AVX512_4FMAPS (Intel® Xeon Phi™ only.) + ; EDX Bit04: Fast Short REP MOV + ; EDX Bit05: UINTR. If 1, the processor supports user interrupts. + ; EDX Bits 07-06: Reserved + ;-> EDX Bit08: AVX512_VP2INTERSECT + ; EDX Bit09: Reserved + ; EDX Bit10: MD_CLEAR supported. + ; EDX Bits 13-11: Reserved + ; EDX Bit14: SERIALIZE + ; EDX Bit15: Hybrid. If 1, the processor is identified as a hybrid part. + ; EDX Bit16: TSXLDTRK. If 1, the processor supports Intel TSX suspend load address tracking. + ; EDX Bit17: Reserved + ; EDX Bit18: PCONFIG + ; EDX Bit19: Reserved + ; EDX Bit20: CET_IBT. Supports CET indirect branch tracking features if 1. + ; EDX Bit21: Reserved + ;-> EDX Bit22: AMX-BF16. If 1, the processor supports tile computational operations on bfloat16 numbers. + ;-> EDX Bit23: AVX512_FP16 + ;-> EDX Bit24: AMX-TILE. If 1, the processor supports tile architecture + ;-> EDX Bit25: AMX-INT8. If 1, the processor supports tile computational operations on 8-bit integers. + ; EDX Bit26: Enumerates support for indirect branch restricted speculation (IBRS) and the indirect branch predictor barrier (IBPB). + ; EDX Bit27: Enumerates support for single thread indirect branch predictors (STIBP). + ; EDX Bit29: Enumerates support for the IA32_ARCH_CAPABILITIES MSR. + ; EDX Bit30: Enumerates support for the IA32_CORE_CAPABILITIES MSR. + ; EDX Bit31: Enumerates support for Speculative Store Bypass Disable (SSBD). 
+; get_instr_info: builds the capability word from CPUID leaf 01H and leaf 07H (ecx=0)
+; and returns it in rax. It reads no arguments; rbx, rcx, rdx, r8, r9 and r15 are saved
+; on entry and restored before ret (r15 is saved although this routine does not use it).
+%define STACK_ADJ 0x28+6*8
+
+get_instr_info:
+
+	sub rsp,STACK_ADJ
+
+	mov [rsp+0*8],rbx
+	mov [rsp+1*8],rcx
+	mov [rsp+2*8],rdx
+	mov [rsp+3*8],r8
+	mov [rsp+4*8],r9
+	mov [rsp+5*8],r15
+	; NOTE(review): only CPUID feature bits are read; no OSXSAVE/XGETBV test of OS-enabled AVX/AVX-512 state is visible here - confirm
+	; NOTE(review): leaf 07H is queried without first reading the maximum supported leaf (CPUID leaf 0) - confirm this is acceptable
+	; NOTE: the upper bits 32-63 of the corresponding 64bit register are zeroed on 32bit movs!
+	; so it is easy to adapt the scheme to more CPU-features occupying the upper 32 bits
+	mov r9d,CPU_IS_ARCH_X86_64
+
+;LEAF_01H
+	mov eax,0x01
+	cpuid
+
+;ECX (each block below isolates one CPUID bit and shifts it to its CPU_HAS_* position in r9)
+	mov r8d,ecx
+	and r8d,1<<0
+	shl r8d,CPU_HAS_SSE3-0
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<9
+	shl r8d,CPU_HAS_SSSE3-9
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<19
+	shr r8d,19-CPU_HAS_SSE4_1
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<20
+	shr r8d,20-CPU_HAS_SSE4_2
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<28
+	shr r8d,28-CPU_HAS_AVX
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<29
+	shr r8d,29-CPU_HAS_F16C
+	or r9d,r8d
+
+
+;EDX
+	mov r8d,edx
+	and r8d,1<<26
+	shr r8d,26-CPU_HAS_SSE2
+	or r9d,r8d
+
+
+;LEAF_07H
+	mov eax,0x07
+	mov ecx,0
+	cpuid
+
+;EBX
+	mov r8d,ebx
+	and r8d,1<<5
+	shl r8d,CPU_HAS_AVX2-5
+	or r9d,r8d
+
+	mov r8d,ebx
+	and r8d,1<<16
+	shl r8d,CPU_HAS_AVX512_F-16
+	or r9d,r8d
+
+	mov r8d,ebx
+	and r8d,1<<17
+	shl r8d,CPU_HAS_AVX512_DQ-17
+	or r9d,r8d
+
+	mov r8d,ebx
+	and r8d,1<<21
+	shr r8d,21-CPU_HAS_AVX512_IFMA
+	or r9d,r8d
+
+	mov r8d,ebx
+	and r8d,1<<30
+	shr r8d,30-CPU_HAS_AVX512_BW
+	or r9d,r8d
+
+	mov r8d,ebx
+	and r8d,1<<31
+	shr r8d,31-CPU_HAS_AVX512_VL
+	or r9d,r8d
+
+;ECX
+	mov r8d,ecx
+	and r8d,1<<1
+	shl r8d,CPU_HAS_AVX512_VBMI-1
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<6
+	shl r8d,CPU_HAS_AVX512_VBMI2-6
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<11
+	shl r8d,CPU_HAS_AVX512_VNNI-11
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<12
+	shl r8d,CPU_HAS_AVX512_BITALG-12
+	or r9d,r8d
+
+	mov r8d,ecx
+	and r8d,1<<14
+	shl r8d,CPU_HAS_AVX512_VPOPCNTDQ-14
+	or r9d,r8d
+
+;EDX
+	mov r8d,edx
+	and r8d,1<<8
+	shl r8d,CPU_HAS_AVX512_VP2INTERSECT-8
+	or r9d,r8d
+
+	mov r8d,edx
+	and r8d,1<<22
+	shl r8d,CPU_HAS_AMX_BF16-22
+	or r9d,r8d
+
+	mov r8d,edx
+	and r8d,1<<23
+	shl r8d,CPU_HAS_AVX512_FP16-23
+	or r9d,r8d
+
+	mov r8d,edx
+	and r8d,1<<24
+	shl r8d,CPU_HAS_AMX_TILE-24
+	or r9d,r8d
+
+	mov r8d,edx
+	and r8,1<<25
+	shl r8,CPU_HAS_AMX_INT8-25
+	or r9,r8
+
+; example for CPU_HAS_property_GT_31
+;	mov r8d,edx
+;	and r8,1<<26
+;	shl r8,CPU_HAS_PROPERTY_GT_31-26
+;	or r9,r8
+
+
+	; return value: the collected capability word
+	mov rax,r9
+
+	mov rbx,[rsp+0*8]
+	mov rcx,[rsp+1*8]
+	mov rdx,[rsp+2*8]
+	mov r8 ,[rsp+3*8]
+	mov r9 ,[rsp+4*8]
+	mov r15,[rsp+5*8]
+
+	add rsp,STACK_ADJ
+
+	ret
+
+;----------------------------------------------------------------------------------------------
+%endif
diff --git a/postgresql-15devel_orig/src/backend/utils/adt/encode.c b/postgresql-15devel/src/backend/utils/adt/encode.c
index 6dd93f9..7c37989 100644
--- a/postgresql-15devel_orig/src/backend/utils/adt/encode.c
+++ b/postgresql-15devel/src/backend/utils/adt/encode.c
@@ -19,6 +19,7 @@
 #include "utils/builtins.h"
 #include "utils/memutils.h"
 
+#define ALGORITHM_ID_HEX_ENCODE 0	/* same value as ALGORITHM_ID_HEX_ENCODE in cpu_capabilities_x86_64.asm */
 
 /*
  * Encoding conversion API.
@@ -39,6 +40,28 @@ struct pg_encoding
 
 static const struct pg_encoding *pg_find_encoding(const char *name);
 
+
+	/* TODO BEGIN of block which should be moved to global initialization */
+/* cpu_capabilities stays negative until hex_enc_len() has run the detection once */
+static int64 cpu_capabilities_unmasked = -1;
+static int64 cpu_capabilities = -1;
+static int64 cpu_capabilities_mask = -1;
+static int64 algorithm_disable_mask = 0;
+/* filled by apply_cpu_capabilities(); indexed by ALGORITHM_ID_* */
+static int64 valid_impl_id_arr[64];
+
+extern size_t apply_cpu_capabilities (int64 *capabilities, int64 *impl_id_arr, int64 mask);
+
+/* the following routines are exported by hex_x86_64.asm */
+extern size_t get_hex_encode_alloc_addon (size_t srclen, int64 impl_id);
+
+extern size_t get_hex_decode_alloc_addon (size_t srclen, int64 impl_id);
+
+extern size_t hex_encode_fast (char *dst, const char *src, size_t srclen, int64 impl_id);
+
+	/* END init */
+
+
 /*
  * SQL functions.
*/
@@ -161,14 +184,46 @@ uint64
 hex_encode(const char *src, size_t len, char *dst)
 {
 	const char *end = src + len;
+	size_t		n_done = 0;
 
-	while (src < end)
+	/* TODO BEGIN of block which should be moved to global initialization */
+
+	/*
+	 * Check the CPU capabilities only once.  hex_enc_len() performs the
+	 * detection as a side effect, so call it here in case it has not been
+	 * called before; its return value is not needed.
+	 */
+	if (cpu_capabilities < 0)
 	{
-		*dst++ = hextbl[(*src >> 4) & 0xF];
-		*dst++ = hextbl[*src & 0xF];
-		src++;
+		(void) hex_enc_len(src, len);
+	}
+
+	/* END init */
+
+#if defined(__x86_64__) || defined(_M_AMD64)
+	if (len >= 512)
+	{
+		/* number of trailing input bytes left to the scalar loop below */
+		size_t		len_reduce = 256;
+
+		n_done = hex_encode_fast(dst, src, len - len_reduce,
+								 valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
 	}
-	return (uint64) len * 2;
+#endif
+
+	if (n_done < len)
+	{
+		src += n_done;
+		dst += n_done << 1;
+		while (src < end)
+		{
+			*dst++ = hextbl[(*src >> 4) & 0xF];
+			*dst++ = hextbl[*src & 0xF];
+			src++;
+		}
+	}
+
+	return (uint64) len << 1;
 }
 
 static inline char
@@ -223,13 +278,44 @@ hex_decode(const char *src, size_t len, char *dst)
 	return p - dst;
 }
 
-static uint64
+uint64
 hex_enc_len(const char *src, size_t srclen)
 {
-	return (uint64) srclen << 1;
+	/* TODO BEGIN of block which should be moved to global initialization */
+
+	/*
+	 * Check the CPU capabilities only once.
+	 * When cpu_capabilities is not set (is < 0) we call the architecture-
+	 * dependent instruction information.
+	 * An architecture supported for ASM/SIMD acceleration returns a positive
+	 * value, for all other (not yet) supported architectures we set it to 0.
+	 */
+#if defined(__x86_64__) || defined(_M_AMD64)
+	if (cpu_capabilities < 0)
+	{
+		apply_cpu_capabilities(&cpu_capabilities_unmasked, valid_impl_id_arr, algorithm_disable_mask);
+		cpu_capabilities = cpu_capabilities_unmasked & cpu_capabilities_mask;
+		elog(DEBUG1, "post_apply cpu_capabilities = " INT64_FORMAT,
+			 cpu_capabilities);
+		elog(DEBUG1, "post_apply valid_impl_id_0 = " INT64_FORMAT,
+			 valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
+	}
+#else
+	cpu_capabilities = 0;
+#endif
+
+	/* END init */
+
+#if defined(__x86_64__) || defined(_M_AMD64)
+	return (uint64) (srclen << 1) +
+		get_hex_encode_alloc_addon(srclen, valid_impl_id_arr[ALGORITHM_ID_HEX_ENCODE]);
+#else
+	/* the assembler helpers exist for x86-64 only, so nothing is added */
+	return (uint64) srclen << 1;
+#endif
 }
 
-static uint64
+uint64
 hex_dec_len(const char *src, size_t srclen)
 {
 	return (uint64) srclen >> 1;
diff --git a/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm b/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm
new file mode 100644
index 0000000..c2fd0c6
--- /dev/null
+++ b/postgresql-15devel/src/backend/utils/adt/hex_x86_64.asm
@@ -0,0 +1,2915 @@
+%ifdef __NASM_MAJOR__
+%ifdef COMPILE_C_STYLE_COMMENTS
+/*-------------------------------------------------------------------------
+ *
+ * hex_x86_64.asm
+ *	  Assembler routines for converting a buffer to hex (hex_encode_xxx)
+ *	  and restore the binary from hex code (hex_decode_xxx) on Intel X64
+ *
+ * Copyright (c) 2021-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/hex_x86_64.asm
+ *
+ *-------------------------------------------------------------------------
+ */
+%endif
+
+
+
+
+
+; hex_x86_64.asm
+; Assembler routines for converting a buffer to hex (hex_encode_xxx)
+; and restore the binary from hex_code (hex_decode_xxx) on Intel X64
+
+
+; nasm -f WIN64 -g hex_x86_64.asm -l hex_x86_64.lis
+
+; golink /console hexdump.obj hex_x86_64.obj base64_x86_64.obj /files
+
+; Linux register order: 	%rdi, %rsi, %rdx, %rcx, %r8 and %r9
+; Windows register order:	rcx, rdx, r8, r9
+
+; Windows non volatile registers:	rbx,rbp,rdi,rsi,rsp, r12,r13,r14,r15 and xmm6-xmm15
+; Linux non 
volatile registers:		rbx,rbp, rsp, r12,r13,r14,r15
+
+; nasm -f elf64 -g hex_x86_64.asm -l hex_x86_64_elf64.lis
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define __WIN__ 1
+%elifidn __OUTPUT_FORMAT__, elf64
+%define __ELF__ 1
+%endif
+
+%define NSHIFT_ADDRESS_TO_PAGE	12
+
+%define N_BYTES_PER_SSE2		16
+%define N_BYTES_PER_AVX2		32
+%define N_BYTES_PER_AVX512		64
+
+global get_hex_encode_alloc_addon
+global get_hex_decode_alloc_addon
+
+global hex_encode_fast
+
+global hex_encode_sse2
+global hex_encode_ssse3
+global hex_encode_avx2
+global hex_encode_avx512bw
+
+global hex_decode_sse2
+global hex_decode_avx2
+global hex_decode_avx512bw
+
+
+default rel
+
+section .rdata align=64
+
+; values loaded with VMOVDQA64 in AVX512, so 64 bytes needed
+
+%define VPERM_AVX2_OFFS	0b11_01_10_00
+
+VPERM_ENCODE_OFFSETS		dq 0,4,1,5,2,6,3,7
+VPERM_DECODE_OFFSETS		dq 0,2,4,6,1,3,5,7
+
+ENCODE_SHUFFLE_TO_HEX		times 4 db '0123456789abcdef'
+
+ENCODE_SHUFFLE_TO_HIGH_LOW	times 4 db 8,0,9,1, 10,2,11,3, 12,4,13,5, 14,6,15,7
+
+
+; from here on values used with VPBROADCASTQ in AVX512 / VMOVDQA in AVX2, so only 16/32 bytes needed
+
+;BITMASK_UPPER_HALF			times 32 db 0b1111_0000
+BITMASK_LOWER_HALF			times 32 db 0b0000_1111
+
+BITMASK_NIBBLE_3_IN_WORD	times 16 dw 0x0F00
+
+BITMASK_LITTLE_TO_BIG_ASCII	times 32 db 0b1101_1111
+BITMASK_BIG_TO_LITTLE_ASCII	times 32 db 0b0010_0000
+
+BITMASK_ZERO_ONE			times 32 db 0b0101_0101
+
+BITMASK_ONE_ZERO			times 32 db 0b1010_1010
+
+BITMASK_SELECT_DIGIT		times 32 db 0b0011_1111
+
+ALL_BYTES_9					times 32 db 9
+
+ASCII_LITTLE_A_ADD:
+ALL_BYTES_39				times 32 db 39
+
+ASCII_0_OFFSET:
+ALL_BYTES_48				times 32 db 48
+
+;ASCII_DIGIT_9				times 32 db 48+9
+
+ASCII_LETTER_LITTLE_A		times 32 db 'a'
+ASCII_LETTER_LITTLE_F		times 32 db 'f'
+; encode tables, indexed by impl_id: 0 = none, 1 = sse2, 2 = ssse3, 3 = avx2, 4 = avx512bw
+HEX_ENCODE_ARRAYS:
+HEX_ENC_MIN_SRC_LEN_ARR:
+	dq 0
+	dq 128
+	dq 512
+	dq 512
+	dq 1024
+
+HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR:
+	dq 0
+	dq 64
+	dq 128
+	dq 128
+	dq 256
+
+HEX_ENC_IMPL_ROUTINE_ARR:
+	dq 0
+	dq hex_encode_sse2
+	dq hex_encode_ssse3
+	dq hex_encode_avx2
+	dq hex_encode_avx512bw
+
+; decode tables, indexed by impl_id: 0 = none, 1 = sse2, 2 = avx2, 3 = avx512bw
+HEX_DECODE_ARRAYS:
+HEX_DEC_MIN_SRC_LEN_ARR:
+	dq 0
+	dq 128
+	dq 512
+	dq 1024
+
+HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR:
+	dq 0
+	dq 64
+	dq 128
+	dq 256
+
+HEX_DEC_IMPL_ROUTINE_ARR:
+	dq 0
+	dq hex_decode_sse2
+	dq hex_decode_avx2
+	dq hex_decode_avx512bw
+
+
+
+section .text align=32
+
+
+%use smartalign
+
+	ALIGNMODE 	p6
+
+%ifdef __WIN__
+%define	STACK_FOR_XMM	10*16
+%else
+%define	STACK_FOR_XMM	0
+%endif
+
+;----------------------------------------------------------------------------------------------
+
+; get_hex_encode_alloc_addon returns the tail-handling-required allocation addon
+; according to the request length and the maximum valid impl_id
+; it looks for the correct values in the hex_enc_tables indexed by impl_id
+; (the impl_id is lowered until the request reaches that entry's minimum source length)
+get_hex_encode_alloc_addon:
+
+	sub rsp,0x28
+
+%ifdef __WIN__
+;	mov rcx,rcx							; WIN parameter 1 requested source len
+;	mov rdx,rdx							; WIN parameter 2 maximum valid impl_id
+%else
+	mov rcx,rdi							; LINUX parameter 1 requested source len
+	mov rdx,rsi							; LINUX parameter 2 maximum valid impl_id
+%endif
+
+	lea r8,[HEX_ENC_MIN_SRC_LEN_ARR]
+.loop_search:
+	cmp rcx,[r8+8*rdx]					; compare requested length with current impl_id
+	jae .offset_found					; lengths are unsigned, so compare unsigned
+	sub rdx,1							; lower impl_id
+	jnz .loop_search
+.offset_found:
+	lea r8,[HEX_ENC_RESULT_BUFFER_OVERFLOW_ARR]
+	mov rax,[r8+8*rdx]					; return the alloc_overflow
+
+	add rsp,0x28
+
+	ret
+
+
+;----------------------------------------------------------------------------------------------
+
+; get_hex_decode_alloc_addon returns the tail-handling-required allocation addon
+; according to the request length and the maximum valid impl_id
+; It looks for the correct values in the hex_dec_tables indexed by impl_id
+; (the impl_id is lowered until the request reaches that entry's minimum source length)
+get_hex_decode_alloc_addon:
+
+	sub rsp,0x28
+
+%ifdef __WIN__
+;	mov rcx,rcx							; WIN parameter 1 requested source len
+;	mov rdx,rdx							; WIN parameter 2 maximum valid impl_id
+%else
+	mov rcx,rdi							; LINUX parameter 1 requested source len
+	mov rdx,rsi							; LINUX parameter 2 maximum valid impl_id
+%endif
+
+	lea r8,[HEX_DEC_MIN_SRC_LEN_ARR]
+.loop_search:
+	cmp rcx,[r8+8*rdx]					; compare requested length with current impl_id
+	jae .offset_found					; lengths are unsigned, so compare unsigned
+	sub rdx,1							; lower impl_id
+	jnz .loop_search
+.offset_found:
+	lea r8,[HEX_DEC_RESULT_BUFFER_OVERFLOW_ARR]
+	mov rax,[r8+8*rdx]					; return the alloc_overflow
+
+	add rsp,0x28
+
+	ret
+
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; hex_encode_fast is the dispatcher routine according to the cpu capabilities and
+; the length of the encode request.
+;
+; Parameter 4 (moved to r15) is the maximum valid impl_id fulfilling the cpu requirements
+; (determined at program initialization time outside this routine)
+; The index into the HEX_ENCODE_ARRAYS is set to the maximum supported requirements.
+; When r15 == 0 no fast encode is supported and a zero length is returned.
+
+%define STACK_ADJ 0x28+2*8
+
+hex_encode_fast:
+
+	sub rsp,STACK_ADJ
+
+	mov [rsp+0x20+0*8],r9				; save slots lie above the first 0x20 bytes at rsp,
+	mov [rsp+0x20+1*8],r15				; which the Windows x64 convention hands to the callee
+
+
+	; r15 = checked highest valid index
+%ifdef __WIN__
+	mov rax,r8							; WIN parameter 3 number of elements
+	mov r15,r9							; WIN parameter 4 maximum valid impl_id
+%else
+	mov rax,rdx							; LINUX parameter 3 number of elements
+	mov r15,rcx							; LINUX parameter 4 maximum valid impl_id
+%endif
+
+	lea r10,[HEX_ENC_MIN_SRC_LEN_ARR]
+
+.check_length:
+	cmp rax,[r10+8*r15]
+	jae .max_length_found				; lengths are unsigned, so compare unsigned
+	sub r15,1
+	jnz .check_length
+
+.max_length_found:
+	xor rax,rax
+	cmp r15,0
+	jz .return
+
+	lea r10,[HEX_ENC_IMPL_ROUTINE_ARR]
+	call [r10+8*r15]					; parameter registers 1-3 are passed through unchanged
+
+.return:
+	mov r9,[rsp+0x20+0*8]
+	mov r15,[rsp+0x20+1*8]
+
+	add rsp,STACK_ADJ
+
+	ret
+
+
+
+%define	STACK_ADJ	0x28+6*8+STACK_FOR_XMM
+
+
+
+;----------------------------------------------------------------------------------------------
+
+
+; xmm15			; CONST ALL bytes 9
+; xmm14			; CONST BITMASK_NIBBLE_3_IN_WORD
+; xmm13			; CONST ASCII_0_OFFSET
+; xmm12			; CONST ASCII_LITTLE_A_ADD
+; xmm11			; Prefetch Input line 3
+; xmm10			; Prefetch Input line 2
+; 
xmm9			; Input Line 1
+; xmm8			; Input Line 0
+
+; xmm7			; Unpack RL1 				Rght Half  low bits  secnd line
+; xmm6			; Unpack RH1 				Rght Half high bits  secnd line
+; xmm5			; Unpack LL1 				Left Half  low bits  secnd line
+; xmm4			; Unpack LH1 				Left Half high bits  secnd line
+; xmm3			; Unpack RL0 				Rght Half  low bits  first line
+; xmm2			; Unpack RH0 				Rght Half high bits  first line
+; xmm1			; Unpack LL0 				Left Half  low bits  first line
+; xmm0			; Unpack LH0 				Left Half high bits  first line
+; hex_encode_sse2: rdi = output buffer, rsi = input buffer, rdx = number of input bytes
+; (taken from rcx/rdx/r8 under __WIN__); returns the number of processed input bytes in rax.
+; Each loop round reads NINP_BYTES_PER_ROUND input bytes and stores twice as many output bytes.
+%define	NINP_BYTES_PER_ROUND	2*16
+%define	NINP_BITSHIFT			5
+
+hex_encode_sse2:
+
+	sub rsp,STACK_ADJ
+
+	mov [rsp+STACK_FOR_XMM+0*8],rdi
+	mov [rsp+STACK_FOR_XMM+1*8],rsi
+	mov [rsp+STACK_FOR_XMM+2*8],r12
+	mov [rsp+STACK_FOR_XMM+3*8],r14
+	mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	MOVDQA	[rsp     ],xmm6
+	MOVDQA	[rsp+1*16],xmm7
+	MOVDQA	[rsp+2*16],xmm8
+	MOVDQA	[rsp+3*16],xmm9
+	MOVDQA	[rsp+4*16],xmm10
+	MOVDQA	[rsp+5*16],xmm11
+	MOVDQA	[rsp+6*16],xmm12
+	MOVDQA	[rsp+7*16],xmm13
+	MOVDQA	[rsp+8*16],xmm14
+	MOVDQA	[rsp+9*16],xmm15
+
+	mov rdi,rcx							; parameter 1 output buffer
+
+	mov rsi,rdx							; parameter 2 input buffer
+
+	mov rdx,r8							; parameter 3 number of elements
+
+%endif
+
+
+;; initializer for QQ0 and QQ1
+
+	MOVDQU		xmm8,[rsi+0*16]			; QQ0 p__23__ p__23__ l8 QL0
+	MOVDQU		xmm9,[rsi+1*16]			; QQ1 p__23__ p__23__ l8 QL0
+; NOTE(review): xmm10/xmm11 are not loaded before the loop, yet the first round copies them into xmm8/xmm9 - verify the input of the second round
+;; initialize constants
+
+	MOVDQA xmm15,[ALL_BYTES_9]			; p_23__ l3
+
+	MOVDQA xmm14,[BITMASK_NIBBLE_3_IN_WORD]	; p_23__ l3
+
+;	MOVDQA xmm13,[ALL_BYTES_48]			; p_23__ l3
+
+	MOVDQA xmm12,[ALL_BYTES_39]			; p_23__ l3
+
+
+	MOVDQA		xmm13,xmm12
+	PADDB		xmm13,xmm15				; 48 = 39+9
+
+
+;; do page overshoot checks
+
+	mov rax,NINP_BYTES_PER_ROUND
+
+
+	mov r9,rdx							; exact requested number of elements to process
+	add r9,rsi							; r9 last valid pointer +1 of requested input buffer
+
+	mov r10,rsi							; r10 saved start of input buffer
+	mov r12,r9							; r12 save of end of input buffer+1
+
+	lea rcx,[rsi+rdx-1]					; rcx address of last byte requested to read
+
+	lea r8,[rdx+NINP_BYTES_PER_ROUND-1]
+	shr r8,NINP_BITSHIFT				; number of loops
+	shl r8,NINP_BITSHIFT
+	add r8,rsi							; r8 address of last byte+1 read in complete loops
+	add r8,NINP_BYTES_PER_ROUND-1		; r8 address of last byte read in normal loop with overshoot
+
+	mov r11,r8
+
+; DISABLED for NO OVERSHOOT
+;	add r11,rax							; r11 address of last byte of prefetched data
+
+	shr rcx,NSHIFT_ADDRESS_TO_PAGE		; rcx page number of last byte requested input
+	shr r8,NSHIFT_ADDRESS_TO_PAGE		; r8  page number of last byte read after normal round
+
+	cmp rcx,r8							; stay on same page
+	je .LSAME_PAGE_IN_ROUND
+	sub rdx,rax							; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+	shr r11,NSHIFT_ADDRESS_TO_PAGE		; r11 page number of byte after prefetched data
+	cmp rcx,r11
+	je .LSAME_PAGE_IN_PREFETCH
+	sub rdx,rax							; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add rdx,rsi							; rdx last valid pointer+1 for normal loop
+
+										; due to prefetch add one round to end checks
+	add rdx,rax
+	add r9,rax
+
+	mov r11,rdi							; r11 saved start of output buffer
+
+	mov rcx,NINP_BYTES_PER_ROUND<<1		; increment of output buffer for each round
+
+;; start preprocessing before loop
+
+	add 		rsi,rax						; 							add the  number of processed array elements
+
+	align 32
+
+; ;IACA START_MARKER
+; mov ebx, 111
+; db 0x64, 0x67, 0x90
+
+.LHEXENCODE_LOOP:
+
+	MOVDQA		xmm6,xmm8
+	PUNPCKLBW	xmm6,xmm9					; QL0 p____5 l1 QQ0 [Lin0_RghtH] [00 HL_0 00 HL_1 ...]
+
+
+	MOVDQA		xmm7,xmm8
+	PUNPCKHBW	xmm7,xmm9					; QL0 p____5 l1 QQ0 [Lin0_LeftH] [00 HL_0 00 HL_1 ...]
+
+	MOVDQA		xmm8,xmm10
+
+	MOVDQU		xmm10,[rsi+0*16]			; QL0 p_____5 p1____5 l3+ QL0
+
+
+;;
+	MOVDQA		xmm4,xmm6
+	PSRLW		xmm4,12						; RL2 shift RL2 Hx to lower byte in word
+	MOVDQA		xmm5,xmm6
+	PAND		xmm5,xmm14					; RL2 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm9,xmm11
+
+	MOVDQU		xmm11,[rsi+1*16]			; QL1 p_____5 p1____5 l3+ QL0
+
+	add 		rsi,rax						; 							add the  number of processed array elements
+
+
+	PSLLW		xmm6,8						; RL0 rotate (shift) RL0 1 byte to left
+
+	MOVDQA		xmm0,xmm6
+	PSRLW		xmm0,4+8					; RL0 shift RL0 Hx to lower byte in word
+	POR			xmm4,xmm5					; RL2 low nibble, high nibble at correct position (0L0H)
+
+
+
+	MOVDQA		xmm1,xmm6
+	PAND		xmm1,xmm14					; RL0 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm6,xmm7
+	PSLLW		xmm6,8						; RL1 rotate (shift) RL1 1 byte to left
+
+
+	MOVDQA		xmm5,xmm4
+	PCMPGTB		xmm5,xmm15					; RL2 all letters set to 0xFF, all digits to 0
+
+
+	POR			xmm0,xmm1					; RL0 low nibble, high nibble at correct position (0L0H)
+
+	PADDB		xmm4,xmm13					; RL2 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+
+	MOVDQA		xmm2,xmm6
+	PSRLW		xmm2,4+8					; RL1 shift RL1 Hx to lower byte in word
+	MOVDQA		xmm3,xmm6
+	PAND		xmm3,xmm14					; RL1 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm1,xmm0
+	PCMPGTB		xmm1,xmm15					; RL0 all letters set to 0xFF, all digits to 0
+
+
+	PAND		xmm5,xmm12					; RL2 for all letters set to 39, else 0 (
+											; RL2 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+
+	POR			xmm2,xmm3					; RL1 low nibble, high nibble at correct position (0L0H)
+
+	PAND		xmm1,xmm12					; RL0 for all letters set to 39, else 0 (
+											; RL0 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+	PADDB		xmm4,xmm5					; RL2 final result line RL2
+
+	PADDB		xmm0,xmm13					; RL0 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+	MOVDQA		xmm3,xmm2
+	PCMPGTB		xmm3,xmm15					; RL1 all letters set to 0xFF, all digits to 0
+
+	PADDB		xmm2,xmm13					; RL1 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+
+	PADDB		xmm0,xmm1					; RL0 final result line RL0
+
+	MOVDQA		xmm1,xmm7
+	PSRLW		xmm1,12						; RL3 shift RL3 Hx to lower byte in word
+
+	PAND		xmm3,xmm12					; RL1 for all letters set to 39, else 0 (
+											; RL1 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+	PADDB		xmm2,xmm3					; RL1 final result line RL1
+
+	MOVDQU		[rdi+0*16],xmm0				; RL0 RL0 p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump
+
+	PAND		xmm7,xmm14					; RL3 mask nibble 3 in word (lower nibble shifted 8 bits left)
+
+	MOVDQA		xmm6,xmm7
+	POR			xmm6,xmm1					; RL3 low nibble, high nibble at correct position (0L0H)
+
+
+	MOVDQU		[rdi+1*16],xmm2				; RL1 RL1 p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump
+
+
+	MOVDQA		xmm7,xmm6
+	PCMPGTB		xmm7,xmm15					; RL3 all letters set to 0xFF, all digits to 0
+
+	PADDB		xmm6,xmm13					; RL3 add ASCII_0_OFFSET to all bytes, digits OK, letters yet missing +39
+
+	PAND		xmm7,xmm12					; RL3 for all letters set to 39, else 0 (
+											; RL3 example for 102 (letter f) - 48 (bit45) - 15 (value) = 39 (amount to add for letters)
+	MOVDQU		[rdi+2*16],xmm4				; RL2 RL2 p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump
+
+
+	PADDB		xmm6,xmm7					; RL3 final result line RL2
+
+	MOVDQU		[rdi+3*16],xmm6				; RL3 RL3 p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump
+
+
+	add 		rdi,rcx						; 							add the number of processed output bytes
+
+	cmp			rsi,rdx						; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jl			.LHEXENCODE_LOOP
+
+												; end of normal loop reached
+												; we can do one more round when original count has been reduced by one round
+	cmp			rax,0
+	je			.LFINISH_EXTRA
+
+	cmp			rdx,r9						; input buffer length was not reduced when equal
+	je			.LFINISH_NORMAL
+
+	sub			rsi,rax						; for prefetching the last round, load the last round again
+	sub			rdx,rax						; adopt and condition for last round also
+	xor			rax,rax
+	jmp			.LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+	add			rsi,NINP_BYTES_PER_ROUND	; add the extra round to get processed bytes
+	jmp .LFINISH
+
+.LFINISH_NORMAL:
+	sub			rsi,NINP_BYTES_PER_ROUND	; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+												; r9 = address of requested input bytes+1
+												; rsi = address of processed input bytes+1
+												; now get the minimum of rdx,rsi to rax
+;;	sub			rsi,rax						; for last round do nothing (rax=0), else sub increment for one round
+;;	sub			r9,rax
+
+	mov			rax,r12
+	cmp			rsi,r12						; get min from r12 (address of requested input) and rsi (address of done input)
+
+	jge			.LCALC_PROCESSED_BYTES
+	mov 		rax,rsi						; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+	sub 		rax,r10						; sub the input buffer start address
+												; rax = number of valid processed input bytes = return value
+
+	cmp			rsi,rdx						; compare rdx (address of requested input) and rsi (address of done input)
+	je			.LNO_ZERO_OUT
+
+	mov 		r15,rax						; number of elements to process
+
+	shl			r15,1						; number of output bytes
+
+	add			r15,r11						; pointer to next byte after full valid output buffer
+
+	PXOR		xmm0,xmm0					; all zero
+;ZERO	MOVDQU		[r15],xmm0				; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+	MOVDQA	xmm6 ,[rsp     ]
+	MOVDQA	xmm7 ,[rsp+1*16]
+	MOVDQA	xmm8 ,[rsp+2*16]
+	MOVDQA	xmm9 ,[rsp+3*16]
+	MOVDQA	xmm10,[rsp+4*16]
+	MOVDQA	xmm11,[rsp+5*16]
+	MOVDQA	xmm12,[rsp+6*16]
+	MOVDQA	xmm13,[rsp+7*16]
+	MOVDQA	xmm14,[rsp+8*16]
+	MOVDQA	xmm15,[rsp+9*16]
+
+
+%endif
+
+	mov rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov r12,[rsp+STACK_FOR_XMM+2*8]
+	mov r14,[rsp+STACK_FOR_XMM+3*8]
+	mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add rsp,STACK_ADJ
+
+	ret
+
+
+;----------------------------------------------------------------------------------------------
+
+
+
+
+; xmm15			; CONST ALL bytes 9
+; xmm14			; MAX byte value of all lines		(Init all to letter 'F' (0x46)	;CONST BITMASK_LOWER_HALF
+; xmm13			; MIN byte value of non-ascii-digit values (not 3x)	(Init all to letter 'A' (0x41)	;CONST BITMASK_ASCII_0
+; xmm12			; ORed compare of all digit-values cmp > 9	(Init all zero)			;CONST BITMASK_WORD_LOWER_BYTE
+; 
xmm11 ; Input line 3 +; xmm10 ; Input line 2 +; xmm9 ; Input line 1 +; xmm8 ; Input Line 0 + +; xmm7 ; CONST BITMASK_LOWER_HALF (nibble mask) +; xmm6 ; Temp: bytes 0b0001_0000, then all ONE, then word mask 0x00FF +; xmm5 ; Line 3: lower-cased copy, then digit mask, then decoded words +; xmm4 ; Line 2: lower-cased copy, then digit mask, then decoded words +; xmm3 ; Temp for checks / nibble values of line 1 and line 3 +; xmm2 ; Temp for checks / nibble values of line 0 and line 2 (bytes 0b0010_0000 at loop entry) +; xmm1 ; Line 1: lower-cased copy, then digit mask, then decoded words +; xmm0 ; Line 0: lower-cased copy, then digit mask, then decoded words + + +%define NINP_BYTES_PER_ROUND 4*16 +%define NINP_BITSHIFT 6 + +; hex_decode_sse2: convert pairs of ASCII hex digits to bytes with SSE2, +; NINP_BYTES_PER_ROUND input bytes -> NINP_BYTES_PER_ROUND>>1 output bytes per loop round. +; Arguments (after the __WIN__ register remap below): +; rdi = output buffer, rsi = input buffer, rdx = number of elements (rounded up to full rounds) +; Returns rax = 2 * (number of output bytes written). +; NOTE(review): xmm12/xmm13/xmm14 only accumulate the validity checks; +; nothing in this routine evaluates them before ret. +hex_decode_sse2: + + sub rsp,STACK_ADJ + + mov [rsp+STACK_FOR_XMM+0*8],rdi + mov [rsp+STACK_FOR_XMM+1*8],rsi + mov [rsp+STACK_FOR_XMM+2*8],r12 + mov [rsp+STACK_FOR_XMM+3*8],r14 + mov [rsp+STACK_FOR_XMM+4*8],r15 + +%ifdef __WIN__ + + MOVDQA [rsp ],xmm6 + MOVDQA [rsp+1*16],xmm7 + MOVDQA [rsp+2*16],xmm8 + MOVDQA [rsp+3*16],xmm9 + MOVDQA [rsp+4*16],xmm10 + MOVDQA [rsp+5*16],xmm11 + MOVDQA [rsp+6*16],xmm12 + MOVDQA [rsp+7*16],xmm13 + MOVDQA [rsp+8*16],xmm14 + MOVDQA [rsp+9*16],xmm15 + + mov rdi,rcx ; parameter 1 output buffer + + mov rsi,rdx ; parameter 2 input buffer + + mov rdx,r8 ; parameter 3 number of elements + +%endif + +;; initializer for QQ0 and QQ1 + + MOVDQU xmm8,[rsi] ; + MOVDQU xmm9,[rsi+1*16] ; + + MOVDQU xmm10,[rsi+2*16] ; + MOVDQU xmm11,[rsi+3*16] ; + +;; initialize constants + + mov r15,[BITMASK_BIG_TO_LITTLE_ASCII] + + MOVDQA xmm7,[BITMASK_LOWER_HALF] + + MOVDQA xmm15,[ALL_BYTES_9] ; p_23__ l3 + + MOVDQA xmm14,[ASCII_LETTER_LITTLE_F] ; p_23__ l3 + + MOVDQA xmm13,[ASCII_LETTER_LITTLE_A] ; p_23__ l3 + + PXOR xmm12,xmm12 ; all zero + + MOVQ xmm2,r15 ; 0b0010_0000 + + +;; do page overshoot checks +;; due to end condition handling not done here, we only process full rounds + + mov rax,NINP_BYTES_PER_ROUND + + add rdx,NINP_BYTES_PER_ROUND-1 + shr rdx,NINP_BITSHIFT ; + shl rdx,NINP_BITSHIFT ; rdx number of bytes read in normal loop equiv to xxx full loops + + mov r9,rdx ; exact requested number of
elements to process + add r9,rsi ; r9 last valid pointer +1 of requested input buffer + + mov r10,rsi ; r10 saved start of input buffer + mov r12,r9 ; r12 save of end of input buffer+1 + + lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read + + mov r11,r9 +; DISABLED for NO OVERSHOOT +; add r11,rax ; r11 address of last byte of prefetched data + + shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input + shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data + cmp rcx,r11 + je .LSAME_PAGE_IN_PREFETCH + sub rdx,rax ; don't overshoot in prefetch reading: do one round less + +.LSAME_PAGE_IN_PREFETCH: + add rdx,rsi ; rdx last valid pointer+1 for normal loop + + ; due to prefetch add one round to end checks + add rdx,rax + add r9,rax + + mov r11,rdi ; r11 saved start of output buffer + + mov rcx,NINP_BYTES_PER_ROUND>>1 ; increment of output buffer for each round + +;; start preprocessing before loop + + PUNPCKLQDQ xmm2,xmm2 ; all bytes 0b0010_0000 + +; PUNPCKLQDQ xmm7,xmm7 ; all bytes 0b0000_1111 + + MOVDQA xmm0,xmm2 + MOVDQA xmm1,xmm2 + + MOVDQA xmm4,xmm2 + MOVDQA xmm5,xmm2 + + add rsi,rax ; add the number of processed array elements + + align 32 + +; ;IACA START_MARKER +; mov ebx, 111 +; db 0x64, 0x67, 0x90 + +.LHEXDECODE_LOOP: + + MOVDQA xmm6,xmm2 + + + PSRAD xmm6,1 ; all bytes 0b0001_0000 + + POR xmm0,xmm8 ; line 0 all letters set to little ASCII a-f + POR xmm1,xmm9 + POR xmm4,xmm10 + POR xmm5,xmm11 + + PMAXUB xmm14,xmm0 + PMAXUB xmm14,xmm1 + PMAXUB xmm14,xmm4 + PMAXUB xmm14,xmm5 + +;max check finished + + POR xmm0,xmm6 ; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx) + POR xmm1,xmm6 + POR xmm4,xmm6 + POR xmm5,xmm6 + + PCMPEQD xmm6,xmm6 ; all ONE + + PCMPEQB xmm0,xmm8 ; set to all ONE when ASCII digit (forced bit 0bxx11_xxxx equal to orig value) + PCMPEQB xmm1,xmm9 + PCMPEQB xmm4,xmm10 + PCMPEQB xmm5,xmm11 + +;start min check line0+1 + MOVDQA xmm2,xmm0 ; copy all one when digit + MOVDQA xmm3,xmm1 + +
PANDN xmm2,xmm6 ; set to all one for values NOT digits + PANDN xmm3,xmm6 + + PAND xmm2,xmm8 ; set to orig value when NOT ASCII digit + PAND xmm3,xmm9 + + POR xmm2,xmm0 ; set all zero bytes to all one + POR xmm3,xmm1 + + PMINUB xmm13,xmm2 + PMINUB xmm13,xmm3 + + +;start min check line2+3 + MOVDQA xmm2,xmm4 ; copy all one when digit + MOVDQA xmm3,xmm5 + + + PANDN xmm2,xmm6 ; set to all one for values NOT digits + PANDN xmm3,xmm6 + + PAND xmm2,xmm10 ; set to orig value when NOT ASCII digit + PAND xmm3,xmm11 + + POR xmm2,xmm4 ; set all zero bytes to all one + POR xmm3,xmm5 + + PMINUB xmm13,xmm2 + PMINUB xmm13,xmm3 + + +; start legal digit check + + MOVDQA xmm2,xmm0 ; copy all one when digit (line 0+1) + MOVDQA xmm3,xmm1 + + PAND xmm2,xmm8 ; set to orig value when ASCII digit + PAND xmm3,xmm9 + + PAND xmm2,xmm7 ; set to lower nibble value when ASCII digit + PAND xmm3,xmm7 ; line 1 must be nibble-masked too, else every digit compares > 9 + + PCMPGTB xmm2,xmm15 ; set to all ONE when ASCII digit and value > 9 + PCMPGTB xmm3,xmm15 + + POR xmm12,xmm2 ; accumulate illegal chars like ASCII digit and value > 9 + POR xmm12,xmm3 + + MOVDQA xmm2,xmm4 ; copy all one when digit (line 2+3); xmm2/xmm3 still hold the compare result of line 0+1 + MOVDQA xmm3,xmm5 + + PAND xmm2,xmm10 ; set to orig value when ASCII digit + PAND xmm3,xmm11 + + PAND xmm2,xmm7 ; set to lower nibble value when ASCII digit + PAND xmm3,xmm7 + + + PCMPGTB xmm2,xmm15 ; set to all ONE when ASCII digit and value > 9 + PCMPGTB xmm3,xmm15 + + POR xmm12,xmm2 ; accumulate illegal chars of line 2+3 + POR xmm12,xmm3 + + + +;-- ; all checks accumulated, now build the nibble values of lines 0+1 in xmm2,xmm3 + PCMPEQD xmm6,xmm6 ; all ONE + PSRLW xmm6,8 ; QQ0 p01____ p01____ l1 + + MOVDQA xmm2,xmm7 ; all bytes 0b0000_1111 + MOVDQA xmm3,xmm7 + + PAND xmm2,xmm8 ; all byte values only lower half (nibble) Line 0+1 + MOVDQU xmm8,[rsi+0*16] ; + PAND xmm3,xmm9 + MOVDQU xmm9,[rsi+1*16] ; + + PANDN xmm0,xmm15 ; put 9 to every element not DIGIT + PANDN xmm1,xmm15 + + PADDB xmm2,xmm0 ; add 9 to every nibble not DIGIT + PADDB xmm3,xmm1 + + MOVDQA xmm0,xmm2 + PSRLW xmm0,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...]
+ + PSLLW xmm2,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + MOVDQA xmm1,xmm3 + PSRLW xmm1,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + PSLLW xmm3,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + POR xmm0,xmm2 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + POR xmm1,xmm3 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + + PAND xmm0,xmm6 ; line 0 + PAND xmm1,xmm6 ; line 1 + + PACKUSWB xmm0,xmm1 ; QQ0 p_____5 p_1___5 l1 QQ0 [Lin0_LeftH] [HL_0 00 HL_1 00 ...] + +; line 0 and 1 processed + + + MOVDQA xmm2,xmm7 ; all bytes 0b0000_1111 + MOVDQA xmm3,xmm7 + + PAND xmm2,xmm10 ; all byte values only lower half (nibble) Line 2+3 + MOVDQU xmm10,[rsi+2*16] ; + PAND xmm3,xmm11 + MOVDQU xmm11,[rsi+3*16] ; + + PANDN xmm4,xmm15 ; put 9 to every element not DIGIT + PANDN xmm5,xmm15 + + PADDB xmm2,xmm4 ; add 9 to every nibble not DIGIT + PADDB xmm3,xmm5 + + add rsi,rax ; add the number of processed array elements + + MOVDQU [rdi+0*16],xmm0 ; S0_ p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump + + + MOVDQA xmm4,xmm2 + PSRLW xmm4,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + PSLLW xmm2,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + MOVDQA xmm5,xmm3 + PSRLW xmm5,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + PSLLW xmm3,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + + POR xmm4,xmm2 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + POR xmm5,xmm3 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + + MOVQ xmm2,r15 ; + PUNPCKLQDQ xmm2,xmm2 ; all bytes 0b0010_0000 + + MOVDQA xmm0,xmm2 + MOVDQA xmm1,xmm2 + +; MOVQ xmm7,rcx ; + + PAND xmm4,xmm6 ; + PAND xmm5,xmm6 ; line 1 + + PACKUSWB xmm4,xmm5 ; QQ0 p_____5 p_1___5 l1 QQ0 [Lin0_LeftH] [HL_0 00 HL_1 00 ...]
+ +; MOVDQA xmm1,xmm11 + + + MOVDQU [rdi+1*16],xmm4 ; S0_ p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump + + MOVDQA xmm4,xmm2 + MOVDQA xmm5,xmm2 + +; PUNPCKLQDQ xmm7,xmm7 ; all bytes 0b0000_1111 + + + add rdi,rcx ; add the number of processed output bytes + + cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1 + jl .LHEXDECODE_LOOP + + ; end of normal loop reached + ; we can do one more round when original count has been reduced by one round + cmp rax,0 + je .LFINISH + + cmp rdx,r9 ; input buffer length was not reduced when equal + je .LFINISH + + sub rsi,rax ; for prefetching the last round, load the last round again + sub rdx,rax ; adapt end condition for last round also + xor rax,rax + jmp .LHEXDECODE_LOOP + +.LFINISH: + + mov rax,rdi + sub rax,r11 ; rax = number of output bytes + add rax,rax ; rax = number of valid processed input bytes = return value + +%ifdef __WIN__ + + MOVDQA xmm6 ,[rsp ] + MOVDQA xmm7 ,[rsp+1*16] + MOVDQA xmm8 ,[rsp+2*16] + MOVDQA xmm9 ,[rsp+3*16] + MOVDQA xmm10,[rsp+4*16] + MOVDQA xmm11,[rsp+5*16] + MOVDQA xmm12,[rsp+6*16] + MOVDQA xmm13,[rsp+7*16] + MOVDQA xmm14,[rsp+8*16] + MOVDQA xmm15,[rsp+9*16] + +%endif + + mov rdi,[rsp+STACK_FOR_XMM+0*8] + mov rsi,[rsp+STACK_FOR_XMM+1*8] + mov r12,[rsp+STACK_FOR_XMM+2*8] + mov r14,[rsp+STACK_FOR_XMM+3*8] + mov r15,[rsp+STACK_FOR_XMM+4*8] + + add rsp,STACK_ADJ + + ret + + + +;---------------------------------------------------------------------------------------------- + + + + +; ymm15 ; CONST ALL bytes 9 +; ymm14 ; MAX byte value of all lines (Init all to letter 'F' (0x46) ;CONST BITMASK_LOWER_HALF +; ymm13 ; MIN byte value of non-ascii-digit values (not 3x) (Init all to letter 'A' (0x41) ;CONST BITMASK_ASCII_0 +; ymm12 ; ORed compare of all digit-values cmp > 9 (Init all zero) ;CONST BITMASK_WORD_LOWER_BYTE +; ymm11 ; Input line 3 +; ymm10 ; Input line 2 +; ymm9 ; Input line 1 +; ymm8 ; Input Line 0 + +; ymm7 ; CONST BITMASK_LOWER_HALF ;Unpack RL1 Rght Half
low bits secnd line +; ymm6 ; Temp: bytes 0b0001_0000, then all ONE, then word mask 0x00FF +; ymm5 ; Line 3: lower-cased copy, then digit mask, then decoded words +; ymm4 ; Line 2: lower-cased copy, then digit mask, then decoded words +; ymm3 ; Temp for checks / nibble values of line 1 and line 3 +; ymm2 ; Temp for checks / nibble values of line 0 and line 2 (BITMASK_BIG_TO_LITTLE_ASCII at loop entry) +; ymm1 ; Line 1: lower-cased copy, then digit mask, then decoded words +; ymm0 ; Line 0: lower-cased copy, then digit mask, then decoded words + + +%define NINP_BYTES_PER_ROUND 4*32 +%define NINP_BITSHIFT 7 + +; hex_decode_avx2: convert pairs of ASCII hex digits to bytes with AVX2, +; NINP_BYTES_PER_ROUND input bytes -> NINP_BYTES_PER_ROUND>>1 output bytes per loop round. +; Arguments (after the __WIN__ register remap below): +; rdi = output buffer, rsi = input buffer, rdx = number of elements (rounded up to full rounds) +; Returns rax = 2 * (number of output bytes written). +; NOTE(review): ymm12/ymm13/ymm14 only accumulate the validity checks; +; nothing in this routine evaluates them before ret. +hex_decode_avx2: + + sub rsp,STACK_ADJ + + mov [rsp+STACK_FOR_XMM+0*8],rdi + mov [rsp+STACK_FOR_XMM+1*8],rsi + mov [rsp+STACK_FOR_XMM+2*8],r12 + mov [rsp+STACK_FOR_XMM+3*8],r14 + mov [rsp+STACK_FOR_XMM+4*8],r15 + +%ifdef __WIN__ + + VMOVDQA [rsp ],xmm6 + VMOVDQA [rsp+1*16],xmm7 + VMOVDQA [rsp+2*16],xmm8 + VMOVDQA [rsp+3*16],xmm9 + VMOVDQA [rsp+4*16],xmm10 + VMOVDQA [rsp+5*16],xmm11 + VMOVDQA [rsp+6*16],xmm12 + VMOVDQA [rsp+7*16],xmm13 + VMOVDQA [rsp+8*16],xmm14 + VMOVDQA [rsp+9*16],xmm15 + + mov rdi,rcx ; parameter 1 output buffer + + mov rsi,rdx ; parameter 2 input buffer + + mov rdx,r8 ; parameter 3 number of elements + +%endif + +;; initializer for QQ0 and QQ1 + + VMOVDQU ymm8,[rsi+0*32] ; + VMOVDQU ymm9,[rsi+1*32] ; + + VMOVDQU ymm10,[rsi+2*32] ; + VMOVDQU ymm11,[rsi+3*32] ; + +;; initialize constants + + VMOVDQA ymm15,[ALL_BYTES_9] ; p_23__ l3 + + VMOVDQA ymm14,[ASCII_LETTER_LITTLE_F] ; p_23__ l3 + + VMOVDQA ymm13,[ASCII_LETTER_LITTLE_A] ; p_23__ l3 + + VMOVDQA ymm7,[BITMASK_LOWER_HALF] ; 0b0000_1111 + + VPXOR ymm12,ymm12 ; all zero + + VMOVDQA ymm2,[BITMASK_BIG_TO_LITTLE_ASCII] ; 0b0010_0000 + + + +;; do page overshoot checks +;; due to end condition handling not done here, we only process full rounds + + mov rax,NINP_BYTES_PER_ROUND + + add rdx,NINP_BYTES_PER_ROUND-1 + shr rdx,NINP_BITSHIFT ; + shl rdx,NINP_BITSHIFT ; rdx number of bytes read in normal loop equiv to xxx full loops + + mov r9,rdx ; exact requested number of elements to process + add r9,rsi ; r9 last valid pointer +1 of requested input buffer + + mov
r10,rsi ; r10 saved start of input buffer + mov r12,r9 ; r12 save of end of input buffer+1 + + lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read + + mov r11,r9 +; DISABLED for NO OVERSHOOT +; add r11,rax ; r11 address of last byte of prefetched data + + shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input + shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data + cmp rcx,r11 + je .LSAME_PAGE_IN_PREFETCH + sub rdx,rax ; don't overshoot in prefetch reading: do one round less + +.LSAME_PAGE_IN_PREFETCH: + add rdx,rsi ; rdx last valid pointer+1 for normal loop + + ; due to prefetch add one round to end checks + add rdx,rax + add r9,rax + + mov r11,rdi ; r11 saved start of output buffer + + mov rcx,NINP_BYTES_PER_ROUND>>1 ; increment of output buffer for each round + +;; start preprocessing before loop + + add rsi,rax ; add the number of processed array elements + + align 32 + +; ;IACA START_MARKER +; mov ebx, 111 +; db 0x64, 0x67, 0x90 + +.LHEXDECODE_LOOP: + + VMOVDQA ymm6,ymm2 + + + VPSRAD ymm6,1 ; all bytes 0b0001_0000 + + VPOR ymm0,ymm2,ymm8 ; line 0 all letters set to little ASCII a-f + VPOR ymm1,ymm2,ymm9 + VPOR ymm4,ymm2,ymm10 + VPOR ymm5,ymm2,ymm11 + + VPMAXUB ymm14,ymm0 + VPMAXUB ymm14,ymm1 + VPMAXUB ymm14,ymm4 + VPMAXUB ymm14,ymm5 + +;max check finished + + VPOR ymm0,ymm6 ; line 0 with bits for ASCII_0 set (Byte OR 0bxx11_xxxx) + VPOR ymm1,ymm6 + VPOR ymm4,ymm6 + VPOR ymm5,ymm6 + + VPCMPEQD ymm6,ymm6 ; all ONE + + VPCMPEQB ymm0,ymm8 ; set to all ONE when ASCII digit (forced bit 0bxx11_xxxx equal to orig value) + VPCMPEQB ymm1,ymm9 + VPCMPEQB ymm4,ymm10 + VPCMPEQB ymm5,ymm11 + +;start min check line0+1 + VPANDN ymm2,ymm0,ymm6 ; set to all one for values NOT digits + VPANDN ymm3,ymm1,ymm6 + + VPAND ymm2,ymm8 ; set to orig value when NOT ASCII digit + VPAND ymm3,ymm9 + + VPOR ymm2,ymm0 ; set all zero bytes to all one + VPOR ymm3,ymm1 + + VPMINUB ymm13,ymm2 + VPMINUB ymm13,ymm3 + + +;start min check
line2+3 + + + VPANDN ymm2,ymm4,ymm6 ; set to all one for values NOT digits + VPANDN ymm3,ymm5,ymm6 + + VPAND ymm2,ymm10 ; set to orig value when NOT ASCII digit + VPAND ymm3,ymm11 + + VPOR ymm2,ymm4 ; set all zero bytes to all one + VPOR ymm3,ymm5 + + VPMINUB ymm13,ymm2 + VPMINUB ymm13,ymm3 + + +; start legal digit check + + VPAND ymm2,ymm0,ymm8 ; set to orig value when ASCII digit (line 0+1) + VPAND ymm3,ymm1,ymm9 + + VPAND ymm2,ymm7 ; set to lower nibble value when ASCII digit + VPAND ymm3,ymm7 ; line 1 must be nibble-masked too, else every digit compares > 9 + + VPCMPGTB ymm2,ymm15 ; set to all ONE when ASCII digit and value > 9 + VPCMPGTB ymm3,ymm15 + + VPOR ymm12,ymm2 ; accumulate illegal chars like ASCII digit and value > 9 + VPOR ymm12,ymm3 + + VPAND ymm2,ymm4,ymm10 ; set to orig value when ASCII digit (line 2+3); start from the digit masks, not from the previous compare result + VPAND ymm3,ymm5,ymm11 + + VPAND ymm2,ymm7 ; set to lower nibble value when ASCII digit + VPAND ymm3,ymm7 + + + VPCMPGTB ymm2,ymm15 ; set to all ONE when ASCII digit and value > 9 + VPCMPGTB ymm3,ymm15 + + VPOR ymm12,ymm2 ; accumulate illegal chars of line 2+3 + VPOR ymm12,ymm3 + +; all (max, min and >9) checks finished + + +;-- ; all checks accumulated, now build the nibble values of lines 0+1 in ymm2,ymm3 + VPCMPEQD ymm6,ymm6 ; all ONE + VPSRLW ymm6,8 ; QQ0 p01____ p01____ l1 + + VPAND ymm2,ymm7,ymm8 ; all byte values only lower half (nibble) Line 0+1 + VMOVDQU ymm8,[rsi+0*32] ; + VPAND ymm3,ymm7,ymm9 + VMOVDQU ymm9,[rsi+1*32] ; + + VPANDN ymm0,ymm15 ; put 9 to every element not DIGIT + VPANDN ymm1,ymm15 + + VPADDB ymm2,ymm0 ; add 9 to every nibble not DIGIT + VPADDB ymm3,ymm1 + + VPSRLW ymm0,ymm2,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPSLLW ymm2,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPSRLW ymm1,ymm3,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPSLLW ymm3,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPOR ymm0,ymm2 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...]
+ VPOR ymm1,ymm3 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + + VPAND ymm0,ymm6 ; line 0 + VPAND ymm1,ymm6 ; line 1 + + VPACKUSWB ymm0,ymm1 ; QQ0 p_____5 p_1___5 l1 QQ0 [Lin0_LeftH] [HL_0 00 HL_1 00 ...] + VPERMQ ymm0,ymm0,0xD8 ; VPACKUSWB packs per 128-bit lane: reorder qwords to 0,2,1,3 so the bytes are in input order + +; line 0 and 1 processed + + + VPAND ymm2,ymm7,ymm10 ; all byte values only lower half (nibble) Line 2+3 + VMOVDQU ymm10,[rsi+2*32] ; + VPAND ymm3,ymm7,ymm11 + VMOVDQU ymm11,[rsi+3*32] ; + + VPANDN ymm4,ymm15 ; put 9 to every element not DIGIT + VPANDN ymm5,ymm15 + + VPADDB ymm2,ymm4 ; add 9 to every nibble not DIGIT + VPADDB ymm3,ymm5 + + add rsi,rax ; add the number of processed array elements + + VMOVDQU [rdi+0*32],ymm0 ; S0_ p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump + + + VPSRLW ymm4,ymm2,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPSLLW ymm2,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPSRLW ymm5,ymm3,8 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + VPSLLW ymm3,4 ; QQ0 p01____ p01____ l1 Q03 [Lin0_LeftH] [00 0H_0 00 0H_1 ...] + + + VPOR ymm4,ymm2 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + VPOR ymm5,ymm3 ; QQ0 p01___5 p01___5 l1 QQ3 [Lin0_LeftH] [AH AL_0 AH AL_1 ...] + + VMOVDQA ymm2,[BITMASK_BIG_TO_LITTLE_ASCII] ; + + + VPAND ymm4,ymm6 ; + VPAND ymm5,ymm6 ; line 1 + + VPACKUSWB ymm4,ymm5 ; QQ0 p_____5 p_1___5 l1 QQ0 [Lin0_LeftH] [HL_0 00 HL_1 00 ...] + VPERMQ ymm4,ymm4,0xD8 ; VPACKUSWB packs per 128-bit lane: reorder qwords to 0,2,1,3 so the bytes are in input order
+ + + VMOVDQU [rdi+1*32],ymm4 ; S0_ p____4_ p____4_ l1 [Lin0 Left Half] Store Hexdump + + add rdi,rcx ; add the number of processed output bytes + + + + cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1 + jl .LHEXDECODE_LOOP + + ; end of normal loop reached + ; we can do one more round when original count has been reduced by one round + cmp rax,0 + je .LFINISH + + cmp rdx,r9 ; input buffer length was not reduced when equal + je .LFINISH + + sub rsi,rax ; for prefetching the last round, load the last round again + sub rdx,rax ; adapt end condition for last round also + xor rax,rax + jmp .LHEXDECODE_LOOP + +.LFINISH: + + mov rax,rdi + sub rax,r11 ; rax = number of output bytes + add rax,rax ; rax = number of valid processed input bytes = return value + +%ifdef __WIN__ + + VMOVDQA xmm6 ,[rsp ] + VMOVDQA xmm7 ,[rsp+1*16] + VMOVDQA xmm8 ,[rsp+2*16] + VMOVDQA xmm9 ,[rsp+3*16] + VMOVDQA xmm10,[rsp+4*16] + VMOVDQA xmm11,[rsp+5*16] + VMOVDQA xmm12,[rsp+6*16] + VMOVDQA xmm13,[rsp+7*16] + VMOVDQA xmm14,[rsp+8*16] + VMOVDQA xmm15,[rsp+9*16] + +%endif + + mov rdi,[rsp+STACK_FOR_XMM+0*8] + mov rsi,[rsp+STACK_FOR_XMM+1*8] + mov r12,[rsp+STACK_FOR_XMM+2*8] + mov r14,[rsp+STACK_FOR_XMM+3*8] + mov r15,[rsp+STACK_FOR_XMM+4*8] + + add rsp,STACK_ADJ + + VZEROUPPER ; clear upper ymm halves before returning to non-VEX code + + ret + + +;---------------------------------------------------------------------------------------------- + +; k7 ; compare flags lower eq little f, initially all ONE +; k6 ; compare flags greater eq little a, initially all ONE QL0,QL1 +; k5 ; CONST BITMASK_ZERO_ONE 0101010101010101 selecting lower half +; k4 ; digit flags QL3 +; k3 ; digit flags QL2 +; k2 ; digit flags QL1 +; k1 ; digit flags QL0 +; k0 ; + +; zmm31 ; CONST ALL ZERO +; zmm30 ; CONST BITMASK_LOWER_HALF +; zmm29 ; CONST ASCII_0_OFFSET +; zmm28 ; CONST ASCII_LITTLE_A_ADD +; zmm27 ; CONST VPERM_DECODE_OFFSETS +; zmm26 ; CONST ALL bytes 9 +; zmm25 ; Ouptut Line OH1 (Line 0 is stored in the source load regs zmm8-zmm23 +; zmm24 ; Output
Line OL1 +; zmm23 ; Preload QL3 +; zmm22 ; Preload QL2 +; zmm21 ; Preload QL1 +; zmm20 ; Preload QL0 +; zmm19 ; Source Load QL3 +; zmm18 ; Source Load QL2 +; zmm17 ; Source Load QL1 +; zmm16 ; Source Load QL0 + +; zmm15 ; QL3 little a +; zmm14 ; QL2 little a +; zmm13 ; QL1 little a +; zmm12 ; QL0 little a +; zmm11 ; QL3 masked for digit +; zmm10 ; QL2 masked for digit +; zmm9 ; QL1 masked for digit +; zmm8 ; QL0 masked for digit +; zmm7 ; lower nibble masked QL3 +; zmm6 ; lower nibble masked QL2 +; zmm5 ; lower nibble masked QL1 +; zmm4 ; lower nibble masked QL0 +; zmm3 ; +; zmm2 ; +; zmm1 ; +; zmm0 ; + + +%define NINP_BYTES_PER_ROUND 4*64 +%define NINP_BITSHIFT 8 + +; hex_decode_avx512bw: convert pairs of ASCII hex digits to bytes with AVX512BW, +; NINP_BYTES_PER_ROUND input bytes -> NINP_BYTES_PER_ROUND>>1 output bytes per loop round. +; Arguments (after the __WIN__ register remap below): +; rdi = output buffer, rsi = input buffer, rdx = number of elements (rounded up to full rounds) +; Returns rax = 2 * (number of output bytes written). +; Constants as loaded below: zmm31 = ALL_BYTES_9, zmm30 = BITMASK_LOWER_HALF, +; zmm29 = ASCII_LETTER_LITTLE_F, zmm28 = ASCII_LETTER_LITTLE_A, zmm27 = VPERM_DECODE_OFFSETS, +; zmm26 = BITMASK_BIG_TO_LITTLE_ASCII, zmm25 = BITMASK_SELECT_DIGIT, k5 = BITMASK_ZERO_ONE. +; NOTE(review): k6/k7 and zmm12-zmm15 are written but never evaluated in this routine +; (see the MISSING check marker in the loop). +hex_decode_avx512bw: + + sub rsp,STACK_ADJ + + mov [rsp+STACK_FOR_XMM+0*8],rdi + mov [rsp+STACK_FOR_XMM+1*8],rsi + mov [rsp+STACK_FOR_XMM+2*8],r12 + mov [rsp+STACK_FOR_XMM+3*8],r14 + mov [rsp+STACK_FOR_XMM+4*8],r15 + +%ifdef __WIN__ + + VMOVDQA [rsp ],xmm6 + VMOVDQA [rsp+1*16],xmm7 + VMOVDQA [rsp+2*16],xmm8 + VMOVDQA [rsp+3*16],xmm9 + VMOVDQA [rsp+4*16],xmm10 + VMOVDQA [rsp+5*16],xmm11 + VMOVDQA [rsp+6*16],xmm12 + VMOVDQA [rsp+7*16],xmm13 + VMOVDQA [rsp+8*16],xmm14 + VMOVDQA [rsp+9*16],xmm15 + + mov rdi,rcx ; parameter 1 output buffer + + mov rsi,rdx ; parameter 2 input buffer + + mov rdx,r8 ; parameter 3 number of elements + +%endif + +;; initializer for QQ0 and QQ1 + + VMOVDQU64 zmm20,[rsi+0*64] ; QQ0 p____5 l3+ QL4 + VMOVDQU64 zmm21,[rsi+1*64] ; QQ0 p____5 l3+ QL5 + VMOVDQU64 zmm22,[rsi+2*64] ; QQ0 p____5 l3+ QL6 + VMOVDQU64 zmm23,[rsi+3*64] ; QQ0 p____5 l3+ QL7 + +;; initialize constants + + KXNORQ k7,k7,k7 ; all one + + VPBROADCASTQ zmm31,[ALL_BYTES_9] ; p_23__ l3 + + VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF] ; p_23__ l3 + + KXNORQ k6,k6,k6 ; all one + + VPBROADCASTQ zmm29,[ASCII_LETTER_LITTLE_F] ; p_23__ l3 + + VPBROADCASTQ zmm28,[ASCII_LETTER_LITTLE_A] ; p_23__ l3 + + KMOVQ k5,[BITMASK_ZERO_ONE] + + VMOVDQA64 zmm27,[VPERM_DECODE_OFFSETS] ; p_23__ l3 + + VPBROADCASTQ
zmm26,[BITMASK_BIG_TO_LITTLE_ASCII]; p_23__ l3 + + VPBROADCASTQ zmm25,[BITMASK_SELECT_DIGIT] ; p_23__ l3 + + +;; do page overshoot checks +;; due to end condition handling not done here, we only process full rounds + + mov rax,NINP_BYTES_PER_ROUND + + add rdx,NINP_BYTES_PER_ROUND-1 + shr rdx,NINP_BITSHIFT ; + shl rdx,NINP_BITSHIFT ; rdx number of bytes read in normal loop equiv to xxx full loops + + mov r9,rdx ; exact requested number of elements to process + add r9,rsi ; r9 last valid pointer +1 of requested input buffer + + mov r10,rsi ; r10 saved start of input buffer + mov r12,r9 ; r12 save of end of input buffer+1 + + lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read + + mov r11,r9 +; DISABLED for NO OVERSHOOT +; add r11,rax ; r11 address of last byte of prefetched data + + shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input + shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data + cmp rcx,r11 + je .LSAME_PAGE_IN_PREFETCH + sub rdx,rax ; don't overshoot in prefetch reading: do one round less + +.LSAME_PAGE_IN_PREFETCH: + add rdx,rsi ; rdx last valid pointer+1 for normal loop + + ; due to prefetch add one round to end checks + add rdx,rax + add r9,rax + + mov r11,rdi ; r11 saved start of output buffer + + mov rcx,NINP_BYTES_PER_ROUND>>1 ; increment of output buffer for each round + +;; start preprocessing before loop + +; VPUNPCKHBW zmm1,zmm16,zmm31 ; QL0 p____5 l1 QQ0 [Lin0_LeftH] [00 HL_0 00 HL_1 ...] +; VPUNPCKLBW zmm3,zmm16,zmm31 ; QL0 p____5 l1 QQ0 [Lin0_RghtH] [00 HL_0 00 HL_1 ...] + + add rsi,rax ; add the number of processed array elements + + align 32 + +; ;IACA START_MARKER +; mov ebx, 111 +; db 0x64, 0x67, 0x90 + + +; Vector Port info AVX512 +; ---------------------------------------- +; VPShift p0 l1 +; VPMax/Min p0 l1 +; VPMUL p0 l5 ; with 2FMA-Units p05 (SKX,CLX etc.) +; VPMOVB2M p0 l3 +; VPSUBUSB /SSB p0 l1 + +; VPALIGNR p5 l1 ;Shift of n*8 bits!
+; VPERM p5 l3 +; VPERMI2x 1*p05+2*p5 l7 ; (l9 with flags) +; VPCompare p5 l3-l4 +; VP Pack/Unpack p5 l1(SKX) l3(TGL) +; VPSHUF p5 l1 + + +.LHEXDECODE_LOOP: + + VMOVDQA64 zmm16,zmm20 ; QL0 copy preload to load + VMOVDQA64 zmm17,zmm21 ; QL1 copy preload to load + VPANDQ zmm8,zmm25,zmm20 ; QL0 set bitmask for digits only + VPMAXUB zmm0,zmm20,zmm21 ; QL0,QL1 max from both lines +;;; VPCMPB k7{k7},zmm29,zmm20,2 ; QL0 compare lower_eq little f + + VMOVDQA64 zmm18,zmm22 ; QL2 + VMOVDQA64 zmm19,zmm23 ; QL3 + VPCMPEQB k1,zmm8,zmm20 ; QL0 compare for is digit + VPANDQ zmm9,zmm25,zmm21 ; QL1 set bitmask for digits only + + VMOVDQU64 zmm20,[rsi+0*64] ; QQ0 p____5 l3+ QL4 + VMOVDQU64 zmm21,[rsi+1*64] ; QL1 p____5 l3+ QL5 + + VPANDQ zmm10,zmm25,zmm18 ; QL2 set bitmask for digits only + VPCMPEQB k2,zmm9,zmm17 ; QL1 compare for is digit + + VMOVDQU64 zmm22,[rsi+2*64] ; QQ0 p____5 l3+ QL6 + VMOVDQU64 zmm23,[rsi+3*64] ; QQ0 p____5 l3+ QL7 + + VPANDQ zmm11,zmm25,zmm19 ; QL3 set bitmask for digits only + VPCMPEQB k3,zmm10,zmm18 ; QL2 compare for is digit + + VPMAXUB zmm1,zmm18,zmm19 ; QL2,QL3 max from both lines +;;; VPCMPB k7{k7},zmm29,zmm17,2 ; QL1 compare lower_eq little f + VPCMPEQB k4,zmm11,zmm19 ; QL3 compare for is digit + + add rsi,rax ; add the number of processed array elements + + VPORQ zmm12,zmm26,zmm16 ; QL0 set bit for little a + VPANDQ zmm4,zmm30,zmm16 ; QL0 bitmask lower nibble + + VPORQ zmm13,zmm26,zmm17 ; QL1 set bit for little a + VPANDQ zmm5,zmm30,zmm17 ; QL1 bitmask lower nibble + + VPMAXUB zmm0,zmm0,zmm1 ; QL0,QL1,QL2,QL3 max from 4 lines + VPADDB zmm4,zmm4,zmm31 ; QL0 add 9 + + VPORQ zmm14,zmm26,zmm18 ; QL2 set bit for little a + VPANDQ zmm6,zmm30,zmm18 ; QL2 bitmask lower nibble + + VPANDQ zmm7,zmm30,zmm19 ; QL3 bitmask lower nibble + VPCMPB k7{k7},zmm29,zmm0,2 ; QL0,QL1,QL2,QL3 compare lower_eq little f + + VPADDB zmm5,zmm5,zmm31 ; QL1 add 9 + VPORQ zmm15,zmm26,zmm19 ; QL3 set bit for little a + + VPADDB zmm6,zmm6,zmm31 ; QL2 add 9 + VPADDB zmm7,zmm7,zmm31 ;
QL3 add 9 + + VPSUBB zmm4{k1},zmm4,zmm31 ; QL0 sub 9 for digits + VPSUBB zmm5{k2},zmm5,zmm31 ; QL1 sub 9 for digits + VPSUBB zmm6{k3},zmm6,zmm31 ; QL2 sub 9 for digits + VPSUBB zmm7{k4},zmm7,zmm31 ; QL3 sub 9 for digits + +; + + VPSRLW zmm0,zmm4,8 ; QL0 lower nibble-value + VPSLLW zmm4,zmm4,4 ; QL0 upper nibble_value: first digit of each pair moved to bits 4-7 (value*16) + + VPADDB zmm0{k5}{z},zmm0,zmm4 ; QL0 values in lower byte of word + + VPSRLW zmm1,zmm5,8 ; QL1 lower nibble-value + VPSLLW zmm5,zmm5,4 ; QL1 upper nibble_value: first digit of each pair moved to bits 4-7 (value*16) + + VPADDB zmm1{k5}{z},zmm1,zmm5 ; QL1 values in lower byte of word + + VPACKUSWB zmm0,zmm0,zmm1 ; QL0,QL1 values in single bytes + + VPERMQ zmm1,zmm27,zmm0 ; QL0,QL1 byte values in right order + +; + + VPSRLW zmm2,zmm6,8 ; QL2 lower nibble-value + VPSLLW zmm6,zmm6,4 ; QL2 upper nibble_value: first digit of each pair moved to bits 4-7 (value*16) + + VPADDB zmm2{k5}{z},zmm2,zmm6 ; QL2 values in lower byte of word + + VPSRLW zmm3,zmm7,8 ; QL3 lower nibble-value + VPSLLW zmm7,zmm7,4 ; QL3 upper nibble_value: first digit of each pair moved to bits 4-7 (value*16) + + VPADDB zmm3{k5}{z},zmm3,zmm7 ; QL3 values in lower byte of word + + VPACKUSWB zmm2,zmm2,zmm3 ; QL2,QL3 values in single bytes + + VPERMQ zmm3,zmm27,zmm2 ; QL2,QL3 byte values in right order + +; -------- MISSING check for little a + + VMOVDQU64 [rdi+0*64],zmm1 ; QL0,QL1 store decoded bytes; unaligned store, rdi is the caller's output buffer + VMOVDQU64 [rdi+1*64],zmm3 ; QL2,QL3 store decoded bytes; unaligned store, rdi is the caller's output buffer + + add rdi,rcx ; add the number of processed output bytes + + + cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1 + jl .LHEXDECODE_LOOP + + ; end of normal loop reached + ; we can do one more round when original count has been reduced by one round + cmp rax,0 + je .LFINISH + + cmp rdx,r9 ; input buffer length was not reduced when equal + je .LFINISH + + sub rsi,rax ; for prefetching the last round, load
the last round again + sub rdx,rax ; adapt end condition for last round also + xor rax,rax + jmp .LHEXDECODE_LOOP + +.LFINISH: + + mov rax,rdi + sub rax,r11 ; rax = number of output bytes + add rax,rax ; rax = number of valid processed input bytes = return value + +%ifdef __WIN__ + + VMOVDQA xmm6 ,[rsp ] + VMOVDQA xmm7 ,[rsp+1*16] + VMOVDQA xmm8 ,[rsp+2*16] + VMOVDQA xmm9 ,[rsp+3*16] + VMOVDQA xmm10,[rsp+4*16] + VMOVDQA xmm11,[rsp+5*16] + VMOVDQA xmm12,[rsp+6*16] + VMOVDQA xmm13,[rsp+7*16] + VMOVDQA xmm14,[rsp+8*16] + VMOVDQA xmm15,[rsp+9*16] + +%endif + + mov rdi,[rsp+STACK_FOR_XMM+0*8] + mov rsi,[rsp+STACK_FOR_XMM+1*8] + mov r12,[rsp+STACK_FOR_XMM+2*8] + mov r14,[rsp+STACK_FOR_XMM+3*8] + mov r15,[rsp+STACK_FOR_XMM+4*8] + + add rsp,STACK_ADJ + + VZEROUPPER ; clear upper vector halves before returning to non-VEX code + + ret + + +;---------------------------------------------------------------------------------------------- + + +; ymm15 ; Source Load QL7 +; ymm14 ; Source Load QL6 +; ymm13 ; Source Load QL5 +; ymm12 ; Source Load QL4 +; ymm11 ; Source Load QL3 +; ymm10 ; Source Load QL2 +; ymm9 ; Source Load QL1 +; ymm8 ; Source Load QL0 + +; ymm7 ; CONST ENCODE_SHUFFLE_TO_HEX +; ymm6 ; CONST BITMASK_NIBBLE_3_IN_WORD +; ymm5 ; Shift temp for High nibble 1 +; ymm4 ; Shift temp for High nibble 0 +; ymm3 ; Temp3 +; ymm2 ; Temp2 +; ymm1 ; Temp1 +; ymm0 ; Temp0 + + +%define NINP_BYTES_PER_ROUND 8*32 +%define NINP_BITSHIFT 8 + +hex_encode_avx2: + + sub rsp,STACK_ADJ + + mov [rsp+STACK_FOR_XMM+0*8],rdi + mov [rsp+STACK_FOR_XMM+1*8],rsi + mov [rsp+STACK_FOR_XMM+2*8],r12 + mov [rsp+STACK_FOR_XMM+3*8],r14 + mov [rsp+STACK_FOR_XMM+4*8],r15 + +%ifdef __WIN__ + + VMOVDQA [rsp ],xmm6 + VMOVDQA [rsp+1*16],xmm7 + VMOVDQA [rsp+2*16],xmm8 + VMOVDQA [rsp+3*16],xmm9 + VMOVDQA [rsp+4*16],xmm10 + VMOVDQA [rsp+5*16],xmm11 + VMOVDQA [rsp+6*16],xmm12 + VMOVDQA [rsp+7*16],xmm13 + VMOVDQA [rsp+8*16],xmm14 + VMOVDQA [rsp+9*16],xmm15 + + mov rdi,rcx ; parameter 1 output buffer + + mov rsi,rdx ; parameter 2 input buffer + + mov rdx,r8 ; parameter 3 number of elements +
+%endif + +;; Loading QL0-QL3, prefetching QL4-QL7 + + VPERMQ ymm8, [rsi+0*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL0 + VPERMQ ymm9, [rsi+1*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL1 + VPERMQ ymm10,[rsi+2*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL2 + VPERMQ ymm11,[rsi+3*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL3 + + VPERMQ ymm12,[rsi+4*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL4 + VPERMQ ymm13,[rsi+5*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL5 + VPERMQ ymm14,[rsi+6*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL6 + VPERMQ ymm15,[rsi+7*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL7 + +;; initialize constants + + VMOVDQA ymm7,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3 + + VMOVDQA ymm6,[BITMASK_LOWER_HALF] ; p_23__ l3 + +;; do page overshoot checks + + mov rax,NINP_BYTES_PER_ROUND + + + mov r9,rdx ; exact requested number of elements to process + add r9,rsi ; r9 last valid pointer +1 of requested input buffer + + mov r10,rsi ; r10 saved start of input buffer + mov r12,r9 ; r12 save of end of input buffer+1 + + lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read + + lea r8,[rdx+NINP_BYTES_PER_ROUND-1] + shr r8,NINP_BITSHIFT ; number of loops + shl r8,NINP_BITSHIFT + add r8,rsi ; r8 address of last byte+1 read in complete loops + add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot + + mov r11,r8 + +; DISABLED for NO OVERSHOOT +; add r11,rax ; r11 address of last byte of prefetched data + + shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input + shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte read after normal round + + cmp rcx,r8 ; stay on same page + je .LSAME_PAGE_IN_ROUND + sub rdx,rax ; don't overshoot in reading: do one round less + +.LSAME_PAGE_IN_ROUND: + shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data + cmp rcx,r11 + je .LSAME_PAGE_IN_PREFETCH + sub rdx,rax ; don't overshoot in prefetch reading: do one round 
less + +.LSAME_PAGE_IN_PREFETCH: + add rdx,rsi ; rdx last valid pointer+1 for normal loop + + ; due to prefetch add one round to end checks + add rdx,rax + add r9,rax + + mov r11,rdi ; r11 saved start of output buffer + + mov rcx,NINP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round + +;; start preprocessing before loop + + VPSRLQ ymm4,ymm8,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte + VPSRLQ ymm5,ymm9,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte + + VPUNPCKLBW ymm0,ymm4,ymm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPUNPCKHBW ymm1,ymm4,ymm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + add rsi,rax ; add the number of processed array elements + + + align 32 + +; ;IACA START_MARKER +; mov ebx, 111 +; db 0x64, 0x67, 0x90 + + +.LHEXENCODE_LOOP: + +;; process unpacked AAA in YMM0-YMM4 and YMM8-YMM11, UNPCK BBB to YMM0-YMM1, PreLoad AAA to YMM8-YMM11 +;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07 + VPUNPCKLBW ymm2,ymm5,ymm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0] + VPSRLQ ymm4,ymm10,4 ; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte + VPUNPCKHBW ymm3,ymm5,ymm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0] + VPAND ymm0,ymm0,ymm6 ; AAA RL00 mask lower nibble + + VPSRLQ ymm5,ymm11,4 ; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte + VPUNPCKLBW ymm8,ymm4,ymm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPAND ymm1,ymm1,ymm6 ; AAA RL01 mask lower nibble + VPUNPCKHBW ymm9,ymm4,ymm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPAND ymm2,ymm2,ymm6 ; AAA RL02 mask lower nibble + VPUNPCKLBW ymm10,ymm5,ymm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPAND ymm3,ymm3,ymm6 ; AAA RL03 mask lower nibble + VPUNPCKHBW ymm11,ymm5,ymm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... 
R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSHUFB ymm0,ymm7,ymm0 ; AAA RL00 shuffle_to_hex_digits + VPAND ymm8,ymm8,ymm6 ; AAA RL04 mask lower nibble + VPSHUFB ymm1,ymm7,ymm1 ; AAA RL01 shuffle_to_hex_digits + VPAND ymm9,ymm9,ymm6 ; AAA RL05 mask lower nibble + + VPSHUFB ymm2,ymm7,ymm2 ; AAA RL02 shuffle_to_hex_digits + VMOVDQU [rdi+0*32],ymm0 ; AAA RL00 Store Hexdump + VPAND ymm10,ymm10,ymm6 ; AAA RL06 mask lower nibble + + VPSHUFB ymm3,ymm7,ymm3 ; AAA RL03 shuffle_to_hex_digits + VMOVDQU [rdi+1*32],ymm1 ; AAA RL01 Store Hexdump + VPAND ymm11,ymm11,ymm6 ; AAA RL07 mask lower nibble + + VPSHUFB ymm8,ymm7,ymm8 ; AAA RL04 shuffle_to_hex_digits + VPSRLQ ymm4,ymm12,4 ; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte + VMOVDQU [rdi+2*32],ymm2 ; AAA RL02 Store Hexdump + VPSHUFB ymm9,ymm7,ymm9 ; AAA RL05 shuffle_to_hex_digits + VPSRLQ ymm5,ymm13,4 ; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte + VMOVDQU [rdi+3*32],ymm3 ; AAA RL03 Store Hexdump + + VPSHUFB ymm10,ymm7,ymm10 ; AAA RL06 shuffle_to_hex_digits + VMOVDQU [rdi+4*32],ymm8 ; AAA RL04 Store Hexdump + VPERMQ ymm8, [rsi+0*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL0 + VMOVDQU [rdi+5*32],ymm9 ; AAA RL05 Store Hexdump + VPERMQ ymm9, [rsi+1*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL1 + + VPSHUFB ymm11,ymm7,ymm11 ; AAA RL07 shuffle_to_hex_digits + VMOVDQU [rdi+6*32],ymm10 ; AAA RL06 Store Hexdump + VPUNPCKLBW ymm0,ymm4,ymm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPERMQ ymm10,[rsi+2*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL2 + + VMOVDQU [rdi+7*32],ymm11 ; AAA RL07 Store Hexdump + VPERMQ ymm11,[rsi+3*32],VPERM_AVX2_OFFS ; AAA p_____5 p1____5 l3+ QL3 + VPUNPCKHBW ymm1,ymm4,ymm12 ; AAA RL09 p____5 l1 QQ0 [Lin0_LeftH] [... 
R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + +;; process unpacked BBB in YMM0-YMM4 and YMM9-YMM15, UNPCK AAA to YMM0-YMM1, PreLoad BBB to YMM12-YMM15 +;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15 + VPUNPCKLBW ymm2,ymm5,ymm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0] + VPSRLQ ymm4,ymm14,4 ; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte + VPUNPCKHBW ymm3,ymm5,ymm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0] + VPAND ymm0,ymm0,ymm6 ; BBB RL08 mask lower nibble + + VPSRLQ ymm5,ymm15,4 ; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte + VPUNPCKLBW ymm12,ymm4,ymm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPAND ymm1,ymm1,ymm6 ; BBB RL09 mask lower nibble + VPUNPCKHBW ymm13,ymm4,ymm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPAND ymm2,ymm2,ymm6 ; BBB RL10 mask lower nibble + VPUNPCKLBW ymm14,ymm5,ymm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPAND ymm3,ymm3,ymm6 ; BBB RL11 mask lower nibble + VPUNPCKHBW ymm15,ymm5,ymm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... 
R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSHUFB ymm0,ymm7,ymm0 ; BBB RL08 shuffle_to_hex_digits + VPAND ymm12,ymm12,ymm6 ; BBB RL12 mask lower nibble + VPSHUFB ymm1,ymm7,ymm1 ; BBB RL09 shuffle_to_hex_digits + VPAND ymm13,ymm13,ymm6 ; BBB RL13 mask lower nibble + + VPSHUFB ymm2,ymm7,ymm2 ; BBB RL10 shuffle_to_hex_digits + VMOVDQU [rdi+8*32],ymm0 ; BBB RL08 Store Hexdump + VPAND ymm14,ymm14,ymm6 ; BBB RL14 mask lower nibble + + VPSHUFB ymm3,ymm7,ymm3 ; BBB RL11 shuffle_to_hex_digits + VMOVDQU [rdi+9*32],ymm1 ; BBB RL09 Store Hexdump + VPAND ymm15,ymm15,ymm6 ; BBB RL15 mask lower nibble + + VPSHUFB ymm12,ymm7,ymm12 ; BBB RL12 shuffle_to_hex_digits + VPSRLQ ymm4,ymm8,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte + VMOVDQU [rdi+10*32],ymm2 ; BBB RL10 Store Hexdump + VPSHUFB ymm13,ymm7,ymm13 ; BBB RL13 shuffle_to_hex_digits + VPSRLQ ymm5,ymm9,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte + VMOVDQU [rdi+11*32],ymm3 ; BBB RL11 Store Hexdump + + VPSHUFB ymm14,ymm7,ymm14 ; BBB RL14 shuffle_to_hex_digits + VMOVDQU [rdi+12*32],ymm12 ; BBB RL12 Store Hexdump + VPERMQ ymm12, [rsi+4*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL0 + VMOVDQU [rdi+13*32],ymm13 ; BBB RL13 Store Hexdump + VPERMQ ymm13, [rsi+5*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL1 + + VPSHUFB ymm15,ymm7,ymm15 ; BBB RL15 shuffle_to_hex_digits + VMOVDQU [rdi+14*32],ymm14 ; BBB RL14 Store Hexdump + VPUNPCKLBW ymm0,ymm4,ymm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPERMQ ymm14,[rsi+6*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL2 + + VMOVDQU [rdi+15*32],ymm15 ; BBB RL15 Store Hexdump + VPERMQ ymm15,[rsi+7*32],VPERM_AVX2_OFFS ; BBB p_____5 p1____5 l3+ QL3 + + add rsi,rax ; add the number of processed array elements + + VPUNPCKHBW ymm1,ymm4,ymm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... 
R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + add rdi,rcx ; add the number of processed output bytes + + + cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1 + jl .LHEXENCODE_LOOP + + ; end of normal loop reached + ; we can do one more round when original count has been reduced by one round + cmp rax,0 + je .LFINISH_EXTRA + + cmp rdx,r9 ; input buffer length was not reduced when equal + je .LFINISH_NORMAL + + sub rsi,rax ; for prefetching the last round, load the last round again + sub rdx,rax ; adopt and condition for last round also + xor rax,rax + jmp .LHEXENCODE_LOOP + + +.LFINISH_EXTRA: + add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes + jmp .LFINISH + +.LFINISH_NORMAL: + sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes + +.LFINISH: + ; r9 = address of requested input bytes+1 + ; rsi = address of processed input bytes+1 + ; now get the minimum of rdx,rsi to rax +;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round +;; sub r9,rax + + mov rax,r12 + cmp rsi,r12 ; get min from r12 (address of requested input) and rsi (address of done input) + + jge .LCALC_PROCESSED_BYTES + mov rax,rsi ; rax=address of last valid input byte+1 + +.LCALC_PROCESSED_BYTES: + sub rax,r10 ; sub the input buffer start address + ; rax = number of valid processed input bytes = return value + + cmp rsi,rdx ; compare rdx (address of requested input) and rsi (address of done input) + je .LNO_ZERO_OUT + + mov r15,rax ; number of elements to process + + shl r15,1 ; number of output bytes + + add r15,r11 ; pointer to next byte after full valid output buffer + + + VPXOR ymm0,ymm0,ymm0 ; all zero +;ZERO VMOVDQU [r15],ymm0 ; zero out one register width after last output + +.LNO_ZERO_OUT: + +%ifdef __WIN__ + + VMOVDQA xmm6 ,[rsp ] + VMOVDQA xmm7 ,[rsp+1*16] + VMOVDQA xmm8 ,[rsp+2*16] + VMOVDQA xmm9 ,[rsp+3*16] + VMOVDQA xmm10,[rsp+4*16] + VMOVDQA xmm11,[rsp+5*16] + VMOVDQA 
xmm12,[rsp+6*16] + VMOVDQA xmm13,[rsp+7*16] + VMOVDQA xmm14,[rsp+8*16] + VMOVDQA xmm15,[rsp+9*16] + +%endif + + mov rdi,[rsp+STACK_FOR_XMM+0*8] + mov rsi,[rsp+STACK_FOR_XMM+1*8] + mov r12,[rsp+STACK_FOR_XMM+2*8] + mov r14,[rsp+STACK_FOR_XMM+3*8] + mov r15,[rsp+STACK_FOR_XMM+4*8] + + add rsp,STACK_ADJ + + + ret + +;---------------------------------------------------------------------------------------------- + + +; xmm15 ; Source Load QL7 +; xmm14 ; Source Load QL6 +; xmm13 ; Source Load QL5 +; xmm12 ; Source Load QL4 +; xmm11 ; Source Load QL3 +; xmm10 ; Source Load QL2 +; xmm9 ; Source Load QL1 +; xmm8 ; Source Load QL0 + +; xmm7 ; CONST ENCODE_SHUFFLE_TO_HEX +; xmm6 ; CONST BITMASK_NIBBLE_3_IN_WORD +; xmm5 ; Shift temp for High nibble 1 +; xmm4 ; Shift temp for High nibble 0 +; xmm3 ; Temp3 +; xmm2 ; Temp2 +; xmm1 ; Temp1 +; xmm0 ; Temp0 + + + +%define NINP_BYTES_PER_ROUND 8*16 +%define NINP_BITSHIFT 7 + + +hex_encode_ssse3: + + sub rsp,STACK_ADJ + + mov [rsp+STACK_FOR_XMM+0*8],rdi + mov [rsp+STACK_FOR_XMM+1*8],rsi + mov [rsp+STACK_FOR_XMM+2*8],r12 + mov [rsp+STACK_FOR_XMM+3*8],r14 + mov [rsp+STACK_FOR_XMM+4*8],r15 + +%ifdef __WIN__ + + MOVDQA [rsp ],xmm6 + MOVDQA [rsp+1*16],xmm7 + MOVDQA [rsp+2*16],xmm8 + MOVDQA [rsp+3*16],xmm9 + MOVDQA [rsp+4*16],xmm10 + MOVDQA [rsp+5*16],xmm11 + MOVDQA [rsp+6*16],xmm12 + MOVDQA [rsp+7*16],xmm13 + MOVDQA [rsp+8*16],xmm14 + MOVDQA [rsp+9*16],xmm15 + + mov rdi,rcx ; parameter 1 output buffer + + mov rsi,rdx ; parameter 2 input buffer + + mov rdx,r8 ; parameter 3 number of elements + +%endif + +;; Loading QL0-QL3, prefetching QL4-QL7 + + MOVDQU xmm8, [rsi+0*16] ; AAA p_____5 p1____5 l3+ QL0 + MOVDQU xmm9, [rsi+1*16] ; AAA p_____5 p1____5 l3+ QL1 + MOVDQU xmm10,[rsi+2*16] ; AAA p_____5 p1____5 l3+ QL2 + MOVDQU xmm11,[rsi+3*16] ; AAA p_____5 p1____5 l3+ QL3 + + MOVDQU xmm12,[rsi+4*16] ; BBB p_____5 p1____5 l3+ QL4 + MOVDQU xmm13,[rsi+5*16] ; BBB p_____5 p1____5 l3+ QL5 + MOVDQU xmm14,[rsi+6*16] ; BBB p_____5 p1____5 l3+ QL6 + MOVDQU 
xmm15,[rsi+7*16] ; BBB p_____5 p1____5 l3+ QL7
+
+;; initialize constants
+
+	MOVDQA xmm7,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3
+
+	MOVDQA xmm6,[BITMASK_LOWER_HALF] ; p_23__ l3
+
+;; do page overshoot checks
+;; rax holds the input increment per round; it is set to 0 for the extra last round
+
+	mov rax,NINP_BYTES_PER_ROUND
+
+
+	mov r9,rdx ; exact requested number of elements to process
+	add r9,rsi ; r9 last valid pointer +1 of requested input buffer
+
+	mov r10,rsi ; r10 saved start of input buffer
+	mov r12,r9 ; r12 save of end of input buffer+1
+
+	lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read
+
+	; r8 = rsi + (count rounded up to whole rounds) + one round - 1
+	lea r8,[rdx+NINP_BYTES_PER_ROUND-1]
+	shr r8,NINP_BITSHIFT ; number of loops
+	shl r8,NINP_BITSHIFT
+	add r8,rsi ; r8 address of last byte+1 read in complete loops
+	add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot
+
+	mov r11,r8
+
+; DISABLED for NO OVERSHOOT
+;	add r11,rax ; r11 address of last byte of prefetched data
+; NOTE(review): with the add above disabled r11 equals r8, so the two page
+; comparisons below see the same values; when the pages differ rdx is reduced
+; by two rounds
+
+	shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte requested input
+	shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte read after normal round
+
+	cmp rcx,r8 ; stay on same page
+	je .LSAME_PAGE_IN_ROUND
+	sub rdx,rax ; don't overshoot in reading: do one round less
+
+.LSAME_PAGE_IN_ROUND:
+	shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data
+	cmp rcx,r11
+	je .LSAME_PAGE_IN_PREFETCH
+	sub rdx,rax ; don't overshoot in prefetch reading: do one round less
+
+.LSAME_PAGE_IN_PREFETCH:
+	add rdx,rsi ; rdx last valid pointer+1 for normal loop
+
+	; due to prefetch add one round to end checks
+	add rdx,rax
+	add r9,rax
+
+	mov r11,rdi ; r11 saved start of output buffer
+
+	mov rcx,NINP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round
+
+;; start preprocessing before loop
+;; (two-operand instructions overwrite their first operand, so each source is
+;; copied with MOVDQA before it is shifted or unpacked)
+
+	MOVDQA xmm4,xmm8
+	PSRLQ xmm4,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+	MOVDQA xmm5,xmm9
+	PSRLQ xmm5,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+
+	MOVDQA xmm0,xmm4
+	PUNPCKLBW xmm0,xmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	MOVDQA xmm1,xmm4
+	PUNPCKHBW xmm1,xmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add rsi,rax ; add the number of processed array elements
+
+
+	align 32
+
+;	;IACA START_MARKER
+;	mov ebx, 111
+;	db 0x64, 0x67, 0x90
+
+.LHEXENCODE_LOOP:
+
+;; process unpacked AAA in XMM0-XMM4 and XMM8-XMM11, UNPCK BBB to XMM0-XMM1, PreLoad AAA to XMM8-XMM11
+;;RL00,RL01,RL02,RL03,RL04,RL05,RL06,RL07
+	MOVDQA xmm2,xmm5
+	PUNPCKLBW xmm2,xmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+	MOVDQA xmm4,xmm10
+	PSRLQ xmm4,4 ; AAA RL04,RL05 QL2 shift Hx to lower nibble in byte
+	MOVDQA xmm3,xmm5
+	PUNPCKHBW xmm3,xmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+	PAND xmm0,xmm6 ; AAA RL00 mask lower nibble
+
+	MOVDQA xmm5,xmm11
+	PSRLQ xmm5,4 ; AAA RL06,RL07 QL3 shift Hx to lower nibble in byte
+	MOVDQA xmm8,xmm4
+	PUNPCKLBW xmm8,xmm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND xmm1,xmm6 ; AAA RL01 mask lower nibble
+	MOVDQA xmm9,xmm4
+	PUNPCKHBW xmm9,xmm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	PAND xmm2,xmm6 ; AAA RL02 mask lower nibble
+	MOVDQA xmm10,xmm5
+	PUNPCKLBW xmm10,xmm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND xmm3,xmm6 ; AAA RL03 mask lower nibble
+	; RL07 is built in xmm4 first because xmm11 is still needed as the source operand
+	MOVDQA xmm4,xmm5
+	PUNPCKHBW xmm4,xmm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	MOVDQA xmm11,xmm4
+
+	; PSHUFB overwrites its first operand, so the table in xmm7 is copied for every lookup
+	MOVDQA xmm4,xmm7
+	PSHUFB xmm4,xmm0 ; AAA RL00 shuffle_to_hex_digits
+	PAND xmm8,xmm6 ; AAA RL04 mask lower nibble
+	MOVDQA xmm5,xmm7
+	PSHUFB xmm5,xmm1 ; AAA RL01 shuffle_to_hex_digits
+	PAND xmm9,xmm6 ; AAA RL05 mask lower nibble
+
+	MOVDQA xmm0,xmm7
+	PSHUFB xmm0,xmm2 ; AAA RL02 shuffle_to_hex_digits
+	MOVDQU [rdi+0*16],xmm4 ; AAA RL00 Store Hexdump
+	PAND xmm10,xmm6 ; AAA RL06 mask lower nibble
+
+	MOVDQA xmm1,xmm7
+	PSHUFB xmm1,xmm3 ; AAA RL03 shuffle_to_hex_digits
+	MOVDQU [rdi+1*16],xmm5 ; AAA RL01 Store Hexdump
+	PAND xmm11,xmm6 ; AAA RL07 mask lower nibble
+
+	MOVDQA xmm2,xmm7
+	PSHUFB xmm2,xmm8 ; AAA RL04 shuffle_to_hex_digits
+	MOVDQA xmm4,xmm12
+	PSRLQ xmm4,4 ; BBB RL08,RL09 QL4 shift Hx to lower nibble in byte
+	MOVDQU [rdi+2*16],xmm0 ; AAA RL02 Store Hexdump
+	MOVDQA xmm3,xmm7
+	PSHUFB xmm3,xmm9 ; AAA RL05 shuffle_to_hex_digits
+	MOVDQA xmm5,xmm13
+	PSRLQ xmm5,4 ; BBB RL10,RL11 QL5 shift Hx to lower nibble in byte
+	MOVDQU [rdi+3*16],xmm1 ; AAA RL03 Store Hexdump
+
+	MOVDQA xmm0,xmm7
+	PSHUFB xmm0,xmm10 ; AAA RL06 shuffle_to_hex_digits
+	MOVDQU [rdi+4*16],xmm2 ; AAA RL04 Store Hexdump
+	MOVDQU xmm8, [rsi+0*16] ; AAA p_____5 p1____5 l3+ QL0 (reload for next round)
+	MOVDQU [rdi+5*16],xmm3 ; AAA RL05 Store Hexdump
+	MOVDQU xmm9, [rsi+1*16] ; AAA p_____5 p1____5 l3+ QL1 (reload for next round)
+
+	MOVDQA xmm1,xmm7
+	PSHUFB xmm1,xmm11 ; AAA RL07 shuffle_to_hex_digits
+	MOVDQU [rdi+6*16],xmm0 ; AAA RL06 Store Hexdump
+	MOVDQA xmm0,xmm4
+	PUNPCKLBW xmm0,xmm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	MOVDQU xmm10,[rsi+2*16] ; AAA p_____5 p1____5 l3+ QL2 (reload for next round)
+
+	MOVDQU [rdi+7*16],xmm1 ; AAA RL07 Store Hexdump
+	MOVDQU xmm11,[rsi+3*16] ; AAA p_____5 p1____5 l3+ QL3 (reload for next round)
+	MOVDQA xmm1,xmm4
+	PUNPCKHBW xmm1,xmm12 ; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+
+;; process unpacked BBB in XMM0-XMM4 and XMM12-XMM15, UNPCK AAA to XMM0-XMM1, PreLoad BBB to XMM12-XMM15
+;;RL08,RL09,RL10,RL11,RL12,RL13,RL14,RL15
+	MOVDQA xmm2,xmm5
+	PUNPCKLBW xmm2,xmm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R06_HL1 R04_HL1 R06_HL0 R04_HL0]
+	MOVDQA xmm4,xmm14
+	PSRLQ xmm4,4 ; BBB RL12,RL13 QL6 shift Hx to lower nibble in byte
+	MOVDQA xmm3,xmm5
+	PUNPCKHBW xmm3,xmm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R07_HL1 R05_HL1 R07_HL0 R05_HL0]
+	PAND xmm0,xmm6 ; BBB RL08 mask lower nibble
+
+	MOVDQA xmm5,xmm15
+	PSRLQ xmm5,4 ; BBB RL14,RL15 QL7 shift Hx to lower nibble in byte
+	MOVDQA xmm12,xmm4
+	PUNPCKLBW xmm12,xmm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND xmm1,xmm6 ; BBB RL09 mask lower nibble
+	MOVDQA xmm13,xmm4
+	PUNPCKHBW xmm13,xmm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	PAND xmm2,xmm6 ; BBB RL10 mask lower nibble
+	MOVDQA xmm14,xmm5
+	PUNPCKLBW xmm14,xmm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	PAND xmm3,xmm6 ; BBB RL11 mask lower nibble
+	; RL15 is built in xmm4 first because xmm15 is still needed as the source operand
+	MOVDQA xmm4,xmm5
+	PUNPCKHBW xmm4,xmm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+	MOVDQA xmm15,xmm4
+
+	MOVDQA xmm4,xmm7
+	PSHUFB xmm4,xmm0 ; BBB RL08 shuffle_to_hex_digits
+	PAND xmm12,xmm6 ; BBB RL12 mask lower nibble
+	MOVDQA xmm5,xmm7
+	PSHUFB xmm5,xmm1 ; BBB RL09 shuffle_to_hex_digits
+	PAND xmm13,xmm6 ; BBB RL13 mask lower nibble
+
+	MOVDQA xmm0,xmm7
+	PSHUFB xmm0,xmm2 ; BBB RL10 shuffle_to_hex_digits
+	MOVDQU [rdi+8*16],xmm4 ; BBB RL08 Store Hexdump
+	PAND xmm14,xmm6 ; BBB RL14 mask lower nibble
+
+	MOVDQA xmm1,xmm7
+	PSHUFB xmm1,xmm3 ; BBB RL11 shuffle_to_hex_digits
+	MOVDQU [rdi+9*16],xmm5 ; BBB RL09 Store Hexdump
+	PAND xmm15,xmm6 ; BBB RL15 mask lower nibble
+
+	MOVDQA xmm2,xmm7
+	PSHUFB xmm2,xmm12 ; BBB RL12 shuffle_to_hex_digits
+	MOVDQA xmm4,xmm8
+	PSRLQ xmm4,4 ; AAA RL00,RL01 QL0 shift Hx to lower nibble in byte
+	MOVDQU [rdi+10*16],xmm0 ; BBB RL10 Store Hexdump
+	MOVDQA xmm3,xmm7
+	PSHUFB xmm3,xmm13 ; BBB RL13 shuffle_to_hex_digits
+	MOVDQA xmm5,xmm9
+	PSRLQ xmm5,4 ; AAA RL02,RL03 QL1 shift Hx to lower nibble in byte
+	MOVDQU [rdi+11*16],xmm1 ; BBB RL11 Store Hexdump
+
+	MOVDQA xmm0,xmm7
+	PSHUFB xmm0,xmm14 ; BBB RL14 shuffle_to_hex_digits
+	MOVDQU [rdi+12*16],xmm2 ; BBB RL12 Store Hexdump
+	MOVDQU xmm12, [rsi+4*16] ; BBB p_____5 p1____5 l3+ QL4 (reload for next round)
+	MOVDQU [rdi+13*16],xmm3 ; BBB RL13 Store Hexdump
+	MOVDQU xmm13, [rsi+5*16] ; BBB p_____5 p1____5 l3+ QL5 (reload for next round)
+
+	MOVDQA xmm1,xmm7
+	PSHUFB xmm1,xmm15 ; BBB RL15 shuffle_to_hex_digits
+	MOVDQU [rdi+14*16],xmm0 ; BBB RL14 Store Hexdump
+	MOVDQA xmm0,xmm4
+	PUNPCKLBW xmm0,xmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0]
+	MOVDQU xmm14,[rsi+6*16] ; BBB p_____5 p1____5 l3+ QL6 (reload for next round)
+
+	MOVDQU [rdi+15*16],xmm1 ; BBB RL15 Store Hexdump
+	MOVDQU xmm15,[rsi+7*16] ; BBB p_____5 p1____5 l3+ QL7 (reload for next round)
+
+	add rsi,rax ; add the number of processed array elements
+
+	MOVDQA xmm1,xmm4
+	PUNPCKHBW xmm1,xmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0]
+
+	add rdi,rcx ; add the number of processed output bytes
+
+
+	; NOTE(review): jl is a signed comparison of two addresses - confirm this is intended
+	cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1
+	jl .LHEXENCODE_LOOP
+
+	; end of normal loop reached
+	; we can do one more round when original count has been reduced by one round
+	; rax == 0 marks that this extra round has already been done
+	cmp rax,0
+	je .LFINISH_EXTRA
+
+	cmp rdx,r9 ; input buffer length was not reduced when equal
+	je .LFINISH_NORMAL
+
+	; extra round: store the data that is already loaded; with rax = 0 the
+	; reloads inside the loop read the same input again instead of reading further
+	sub rsi,rax ; for prefetching the last round, load the last round again
+	sub rdx,rax ; adapt the end condition for the last round as well
+	xor rax,rax
+	jmp .LHEXENCODE_LOOP
+
+
+.LFINISH_EXTRA:
+	add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes
+	jmp .LFINISH
+
+.LFINISH_NORMAL:
+	sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes
+
+.LFINISH:
+	; r12 = address of requested input bytes+1 (r9 additionally has one round added)
+	; rsi = address of processed input bytes+1
+	; now get the minimum of r12,rsi to rax
+;;	sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round
+;;	sub r9,rax
+
+	mov rax,r12
+	cmp rsi,r12 ; get min from r12 (address of requested input) and rsi (address of done input)
+
+	jge .LCALC_PROCESSED_BYTES ; signed comparison, see the note at the loop end
+	mov rax,rsi ; rax=address of last valid input byte+1
+
+.LCALC_PROCESSED_BYTES:
+	sub rax,r10 ; sub the input buffer start address
+	; rax = number of valid processed input bytes = return value
+
+	cmp rsi,rdx ; compare rdx (end pointer used by the loop) and rsi (address of done input)
+	je .LNO_ZERO_OUT
+
+	; NOTE(review): the zero-out store below is commented out, so r15 and xmm0
+	; are computed here without any visible effect
+	mov r15,rax ; number of elements to process
+
+	shl r15,1 ; number of output bytes
+
+	add r15,r11 ; pointer to next byte after full valid output buffer
+
+
+	PXOR xmm0,xmm0 ; all zero
+;ZERO	MOVDQU [r15],xmm0 ; zero out one register width after last output
+
+.LNO_ZERO_OUT:
+
+%ifdef __WIN__
+
+	; restore xmm6-xmm15 saved in the prologue
+	MOVDQA xmm6 ,[rsp ]
+	MOVDQA xmm7 ,[rsp+1*16]
+	MOVDQA xmm8 ,[rsp+2*16]
+	MOVDQA xmm9 ,[rsp+3*16]
+	MOVDQA xmm10,[rsp+4*16]
+	MOVDQA xmm11,[rsp+5*16]
+	MOVDQA xmm12,[rsp+6*16]
+	MOVDQA xmm13,[rsp+7*16]
+	MOVDQA xmm14,[rsp+8*16]
+	MOVDQA xmm15,[rsp+9*16]
+
+%endif
+
+	; restore the general purpose registers saved in the prologue
+	mov rdi,[rsp+STACK_FOR_XMM+0*8]
+	mov rsi,[rsp+STACK_FOR_XMM+1*8]
+	mov r12,[rsp+STACK_FOR_XMM+2*8]
+	mov r14,[rsp+STACK_FOR_XMM+3*8]
+	mov r15,[rsp+STACK_FOR_XMM+4*8]
+
+	add rsp,STACK_ADJ
+
+	ret
+
+
+
+;----------------------------------------------------------------------------------------------
+
+; k7 ; CONST BITMASK_ONE_ZERO 1010101010101010 selecting upper half
+; k6 ;
+; k5 ; CONST BITMASK_ZERO_ONE 0101010101010101 selecting lower half
+; k4 ; digit flags QL3
+; k3 ; digit flags QL2
+; k2 ; digit flags QL1
+; k1 ; digit flags QL0
+; k0 ;
+
+; zmm31 ; CONST ENCODE_SHUFFLE_TO_HEX
+; zmm30 ; CONST BITMASK_NIBBLE_3_IN_WORD
+; zmm29 ; CONST VPERM_ENCODE_OFFSETS
+; zmm28 ; CONST ALL_BYTES_39 ; CONST 48 = 39+9, calculated in the middle
+; zmm27 ; Unpack Upper RL5 RL7
+; zmm26 ; Unpack Lower RL4 RL6
+; zmm25 ; Unpack Upper RL1 RL3
+; zmm24 ; Unpack Lower RL0 RL2
+; zmm23 ; Source Load QLF
+; zmm22 ; Source Load QLE
+; zmm21 ; Source Load QLD
+; zmm20 ; Source Load QLC
+; zmm19 ; Source Load QLB
+; zmm18 ; Source Load QLA
+; zmm17 ; Source Load QL9
+; zmm16 ; Source Load QL8
+
+; zmm15 ; Source Load QL7
+; zmm14 ; Source Load QL6
+; zmm13 ; Source Load QL5
+; zmm12 ; Source Load QL4
+; zmm11 ; Source Load QL3
+; zmm10 ; Source Load QL2
+; zmm9 ; Source Load QL1
+; zmm8 ; Source Load QL0
+; zmm7 ; RL3
+; zmm6 ; RL3
+; zmm5 ; RL2
+; zmm4 ; RL2
+; zmm3 ;
+; zmm2 ; RL1
+; zmm1 ; CONST ALL bytes 48
+; zmm0 ; RL0
+
+%define NHALF_INP_BYTES_PER_ROUND 8*64
+%define NINP_BYTES_PER_ROUND 2*NHALF_INP_BYTES_PER_ROUND
+%define NINP_BITSHIFT 10
+
+hex_encode_avx512bw:
+
+	sub rsp,STACK_ADJ
+
+	mov [rsp+STACK_FOR_XMM+0*8],rdi
+	mov [rsp+STACK_FOR_XMM+1*8],rsi
+	mov [rsp+STACK_FOR_XMM+2*8],r12
+	mov [rsp+STACK_FOR_XMM+3*8],r14
+	mov [rsp+STACK_FOR_XMM+4*8],r15
+
+%ifdef __WIN__
+
+	VMOVDQA [rsp ],xmm6
+	VMOVDQA [rsp+1*16],xmm7
+	VMOVDQA [rsp+2*16],xmm8
+	VMOVDQA [rsp+3*16],xmm9
+	VMOVDQA
[rsp+4*16],xmm10 + VMOVDQA [rsp+5*16],xmm11 + VMOVDQA [rsp+6*16],xmm12 + VMOVDQA [rsp+7*16],xmm13 + VMOVDQA [rsp+8*16],xmm14 + VMOVDQA [rsp+9*16],xmm15 + + mov rdi,rcx ; parameter 1 output buffer + + mov rsi,rdx ; parameter 2 input buffer + + mov rdx,r8 ; parameter 3 number of elements + +%endif + + VMOVDQA64 zmm29,[VPERM_ENCODE_OFFSETS] ; p_23__ l3 + +;; initializer for QQ0 and QQ1 + + VPERMQ zmm8 ,zmm29,[rsi+0*64] ; AAA p____5 l3+ QL0 RL00,RL01 + VPERMQ zmm9 ,zmm29,[rsi+1*64] ; AAA p____5 l3+ QL1 RL02,RL03 + VPERMQ zmm10,zmm29,[rsi+2*64] ; AAA p____5 l3+ QL2 RL04,RL05 + VPERMQ zmm11,zmm29,[rsi+3*64] ; AAA p____5 l3+ QL3 RL06,RL07 + + VPERMQ zmm12,zmm29,[rsi+4*64] ; BBB p____5 l3+ QL4 RL08,RL09 + VPERMQ zmm13,zmm29,[rsi+5*64] ; BBB p____5 l3+ QL5 RL10,RL11 + VPERMQ zmm14,zmm29,[rsi+6*64] ; BBB p____5 l3+ QL6 RL12,RL13 + VPERMQ zmm15,zmm29,[rsi+7*64] ; BBB p____5 l3+ QL7 RL14,RL15 + + add rsi,rax ; add half the number of processed array elements + + VPERMQ zmm16,zmm29,[rsi+0*64] ; CCC p____5 l3+ QL8 RL16,RL17 + VPERMQ zmm17,zmm29,[rsi+1*64] ; CCC p____5 l3+ QL9 RL18,RL19 + VPERMQ zmm18,zmm29,[rsi+2*64] ; CCC p____5 l3+ QLA RL20,RL21 + VPERMQ zmm19,zmm29,[rsi+3*64] ; CCC p____5 l3+ QLB RL22,RL23 + + VPERMQ zmm20,zmm29,[rsi+4*64] ; DDD p____5 l3+ QLC RL24,RL25 + VPERMQ zmm21,zmm29,[rsi+5*64] ; DDD p____5 l3+ QLD RL26,RL27 + VPERMQ zmm22,zmm29,[rsi+6*64] ; DDD p____5 l3+ QLE RL28,RL29 + VPERMQ zmm23,zmm29,[rsi+7*64] ; DDD p____5 l3+ QLF RL30,RL31 + +;; initialize constants + + KMOVQ k7,[BITMASK_ONE_ZERO] + + VMOVDQA64 zmm31,[ENCODE_SHUFFLE_TO_HEX] ; p_23__ l3 + VMOVDQA64 zmm1,zmm31 + + VPBROADCASTQ zmm30,[BITMASK_LOWER_HALF] ; p_23__ l3 + + + VMOVDQA64 zmm28,[ENCODE_SHUFFLE_TO_HIGH_LOW] ; p_23__ l3 + +;; do page overshoot checks + + mov rax,NHALF_INP_BYTES_PER_ROUND + + + mov r9,rdx ; exact requested number of elements to process + add r9,rsi ; r9 last valid pointer +1 of requested input buffer + + mov r10,rsi ; r10 saved start of input buffer + mov r12,r9 ; r12 save 
of end of input buffer+1 + + lea rcx,[rsi+rdx-1] ; rcx address of last byte requested to read + + lea r8,[rdx+NINP_BYTES_PER_ROUND-1] + shr r8,NINP_BITSHIFT ; number of loops + shl r8,NINP_BITSHIFT + add r8,rsi ; r8 address of last byte+1 read in complete loops + add r8,NINP_BYTES_PER_ROUND-1 ; r8 address of last byte read in normal loop with overshoot + + mov r11,r8 + +; DISABLED for NO OVERSHOOT +; add r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!) + sub r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!) + sub r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!) + sub r11,rax ; r11 address of last byte of prefetched data (ONLY HALF A ROUND!) + + shr rcx,NSHIFT_ADDRESS_TO_PAGE ; rcx page number of last byte after normal round + shr r8,NSHIFT_ADDRESS_TO_PAGE ; r8 page number of last byte after prefetch + + cmp rcx,r8 ; stay on same page + je .LSAME_PAGE_IN_ROUND + sub rdx,rax ; don't overshoot in reading: do one round less + sub rdx,rax ; don't overshoot in reading: do one round less + +.LSAME_PAGE_IN_ROUND: + shr r11,NSHIFT_ADDRESS_TO_PAGE ; r11 page number of byte after prefetched data + cmp rcx,r11 + je .LSAME_PAGE_IN_PREFETCH + sub rdx,rax ; don't overshoot in prefetch reading: do one round less + sub rdx,rax ; don't overshoot in prefetch reading: do one round less + +.LSAME_PAGE_IN_PREFETCH: + add rdx,rsi ; rdx last valid pointer+1 for normal loop + + ; due to prefetch add one round to end checks + add rdx,rax + add r9,rax + + mov r11,rdi ; r11 saved start of output buffer + + mov rcx,NHALF_INP_BYTES_PER_ROUND<<1 ; increment of output buffer for each round + + +;; start preprocessing before loop + + VPSRLQ zmm2,zmm8,4 ; AAA RL00+RL01 QL0 shift Hx to lower nibble in byte + VPSRLQ zmm3,zmm9,4 ; AAA RL02+RL03 QL1 shift Hx to lower nibble in byte + + VPUNPCKLBW zmm0,zmm2,zmm8 ; AAA RL00 p____5 l1 QQ0 [Lin0_RghtH] [... 
R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPUNPCKHBW zmm1,zmm2,zmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSRLQ zmm6,zmm10,4 ; AAA RL04+RL05 QL2 shift Hx to lower nibble in byte + VPSRLQ zmm7,zmm11,4 ; AAA RL06+RL07 QL3 shift Hx to lower nibble in byte + + VPUNPCKLBW zmm2,zmm3,zmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPUNPCKHBW zmm3,zmm3,zmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + add rsi,rax ; add half the number of processed array elements + + align 32 + +; ;IACA START_MARKER +; mov ebx, 111 +; db 0x64, 0x67, 0x90 + +; Vector Port info AVX512 +; ---------------------------------------- +; VPShift p0 l1 +; VPMax/Min p0 l1 +; VPMUL p0 l5 ; with 2FMA-Units p05 (SKX,CLX etc.) +; VPMOVB2M p0 l3 +; VPSUBUSB /SSB p0 l1 + +; VPALIGNR p5 l1 ;Shift of n*8 bits! +; VPERM p5 l3 +; VPERMI2x 1*p05+2*p5 l7 ; (l9 with flags) +; VPCompare p5 l3-l4 +; VP Pack/Unpack p5 l1(SKX) l3(TGL) +; VPSHUF p5 l1 + + +.LHEXENCODE_LOOP: + +;; AAA+BBB +; process unpacked AAA (QL0-QL4=RL00-RL07) in zmm0-zmm7 and process BBB (QL4-QL7=RL08-RL15) in zmm8-zmm15 and zmm2+zmm3 + + VPUNPCKLBW zmm4,zmm6,zmm10 ; AAA RL04 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm0,zmm0,zmm30 ; AAA RL00 mask lower nibble + + VPUNPCKHBW zmm5,zmm6,zmm10 ; AAA RL05 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPSRLQ zmm10,zmm12,4 ; BBB RL08+RL09 QL4 shift Hx to lower nibble in byte + VPUNPCKLBW zmm8,zmm10,zmm12 ; BBB RL08 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm1,zmm1,zmm30 ; AAA RL01 mask lower nibble + + VPUNPCKLBW zmm6,zmm7,zmm11 ; AAA RL06 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm2,zmm2,zmm30 ; AAA RL02 mask lower nibble + VPUNPCKHBW zmm7,zmm7,zmm11 ; AAA RL07 p____5 l1 QQ0 [Lin0_LeftH] [... 
R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPSRLQ zmm11,zmm13,4 ; BBB RL10+RL11 QL5 shift Hx to lower nibble in byte + VPANDQ zmm8,zmm8,zmm30 ; BBB RL08 mask lower nibble + + VPUNPCKHBW zmm9,zmm10,zmm12 ; BBB RL09 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPANDQ zmm3,zmm3,zmm30 ; AAA RL03 mask lower nibble + VPUNPCKLBW zmm10,zmm11,zmm13 ; BBB RL10 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm9,zmm9,zmm30 ; BBB RL09 mask lower nibble + + VPSHUFB zmm0,zmm31,zmm0 ; AAA RL00 shuffle_to_hex_digits + VPANDQ zmm4,zmm4,zmm30 ; AAA RL04 mask lower nibble + VPUNPCKHBW zmm11,zmm11,zmm13 ; BBB RL11 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPANDQ zmm10,zmm10,zmm30 ; BBB RL10 mask lower nibble + + VPSHUFB zmm1,zmm31,zmm1 ; AAA RL01 shuffle_to_hex_digits + VPANDQ zmm5,zmm5,zmm30 ; AAA RL05 mask lower nibble + VPSHUFB zmm8,zmm31,zmm8 ; BBB RL08 shuffle_to_hex_digits + VPANDQ zmm11,zmm11,zmm30 ; BBB RL11 mask lower nibble + + VPSHUFB zmm2,zmm31,zmm2 ; AAA RL02 shuffle_to_hex_digits + VMOVDQA64 [rdi+0*64],zmm0 ; AAA RL00 Store Hexdump + VMOVDQA64 [rdi+1*64],zmm1 ; AAA RL01 Store Hexdump + VPANDQ zmm6,zmm6,zmm30 ; AAA RL06 mask lower nibble + VPSHUFB zmm9,zmm31,zmm9 ; BBB RL09 shuffle_to_hex_digits + + VPSHUFB zmm3,zmm31,zmm3 ; AAA RL03 shuffle_to_hex_digits + VPANDQ zmm7,zmm7,zmm30 ; AAA RL07 mask lower nibble + VMOVDQA64 [rdi+2*64],zmm2 ; AAA RL02 Store Hexdump + VPSRLQ zmm2,zmm14,4 ; BBB RL12+RL13 QL6 shift Hx to lower nibble in byte + VPSHUFB zmm10,zmm31,zmm10 ; BBB RL10 shuffle_to_hex_digits + VMOVDQA64 [rdi+3*64],zmm3 ; AAA RL03 Store Hexdump + + VPSRLQ zmm3,zmm15,4 ; BBB RL14+RL15 QL7 shift Hx to lower nibble in byte + VPUNPCKLBW zmm12,zmm2,zmm14 ; BBB RL12 p____5 l1 QQ0 [Lin0_RghtH] [... 
R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPSHUFB zmm4,zmm31,zmm4 ; AAA RL04 shuffle_to_hex_digits + VMOVDQA64 [rdi+4*64],zmm4 ; AAA RL04 Store Hexdump + VPSHUFB zmm11,zmm31,zmm11 ; BBB RL11 shuffle_to_hex_digits + VPUNPCKHBW zmm13,zmm2,zmm14 ; BBB RL13 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSHUFB zmm5,zmm31,zmm5 ; AAA RL05 shuffle_to_hex_digits + VPANDQ zmm12,zmm12,zmm30 ; BBB RL12 mask lower nibble + VPUNPCKLBW zmm14,zmm3,zmm15 ; BBB RL14 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VMOVDQA64 [rdi+5*64],zmm5 ; AAA RL05 Store Hexdump + + VPSHUFB zmm6,zmm31,zmm6 ; AAA RL06 shuffle_to_hex_digits + VPANDQ zmm13,zmm13,zmm30 ; BBB RL13 mask lower nibble + VPUNPCKHBW zmm15,zmm3,zmm15 ; BBB RL15 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSHUFB zmm7,zmm31,zmm7 ; AAA RL07 shuffle_to_hex_digits + VPANDQ zmm14,zmm14,zmm30 ; BBB RL14 mask lower nibble + VMOVDQA64 [rdi+6*64],zmm6 ; AAA RL06 Store Hexdump + VMOVDQA64 [rdi+7*64],zmm7 ; AAA RL07 Store Hexdump + VPSHUFB zmm12,zmm31,zmm12 ; BBB RL12 shuffle_to_hex_digits + VPANDQ zmm15,zmm15,zmm30 ; BBB RL15 mask lower nibble +; + VMOVDQA64 [rdi+8*64],zmm8 ; BBB RL08 Store Hexdump + VPERMQ zmm8 ,zmm29,[rsi+0*64] ; AAA p____5 l3+ QL0 RL00,RL01 + VMOVDQA64 [rdi+9*64],zmm9 ; BBB RL09 Store Hexdump + VPERMQ zmm9 ,zmm29,[rsi+1*64] ; AAA p____5 l3+ QL1 RL02,RL03 + VPSHUFB zmm13,zmm31,zmm13 ; BBB RL13 shuffle_to_hex_digits + + VMOVDQA64 [rdi+10*64],zmm10 ; BBB RL10 Store Hexdump + VPERMQ zmm10,zmm29,[rsi+2*64] ; AAA p____5 l3+ QL2 RL04,RL05 + VPSRLQ zmm2,zmm16,4 ; CCC RL16+RL17 QL8 shift Hx to lower nibble in byte + VMOVDQA64 [rdi+11*64],zmm11 ; BBB RL11 Store Hexdump + VPERMQ zmm11,zmm29,[rsi+3*64] ; AAA p____5 l3+ QL3 RL06,RL07 + + VPSHUFB zmm14,zmm31,zmm14 ; BBB RL14 shuffle_to_hex_digits + VPSRLQ zmm3,zmm17,4 ; CCC RL18+RL19 QL9 shift Hx to lower nibble in byte + VPSHUFB zmm15,zmm31,zmm15 ; BBB RL15 shuffle_to_hex_digits + + VPUNPCKLBW zmm0,zmm2,zmm16 ; CCC 
RL16 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPSRLQ zmm6,zmm18,4 ; CCC RL20+RL21 QLA shift Hx to lower nibble in byte + VMOVDQA64 [rdi+12*64],zmm12 ; BBB RL12 Store Hexdump + VPERMQ zmm12,zmm29,[rsi+4*64] ; BBB p____5 l3+ QL4 RL08,RL09 + + VPUNPCKHBW zmm1,zmm2,zmm16 ; CCC RL17 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPSRLQ zmm7,zmm19,4 ; CCC RL22+RL23 QLB shift Hx to lower nibble in byte + VMOVDQA64 [rdi+13*64],zmm13 ; BBB RL13 Store Hexdump + VPERMQ zmm13,zmm29,[rsi+5*64] ; BBB p____5 l3+ QL5 RL10,RL11 + + VMOVDQA64 [rdi+14*64],zmm14 ; BBB RL14 Store Hexdump + VPERMQ zmm14,zmm29,[rsi+6*64] ; BBB p____5 l3+ QL6 RL12,RL13 + VPUNPCKLBW zmm2,zmm3,zmm17 ; CCC RL18 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VMOVDQA64 [rdi+15*64],zmm15 ; BBB RL15 Store Hexdump + + add rdi,rcx ; add half the number of processed output bytes + + VPERMQ zmm15,zmm29,[rsi+7*64] ; BBB p____5 l3+ QL7 RL14,RL15 + VPUNPCKHBW zmm3,zmm3,zmm17 ; CCC RL19 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + +;; CCC+DDD +; process unpacked CCC (QL8-QLC=RL16-RL23) in zmm0-zmm7 and process DDD (QLC-QLF=RL24-RL31) in zmm16-zmm23 and zmm2+zmm3 + add rsi,rax ; add half the number of processed array elements + + VPUNPCKLBW zmm4,zmm6,zmm18 ; CCC RL20 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm0,zmm0,zmm30 ; CCC RL16 mask lower nibble + + VPUNPCKHBW zmm5,zmm6,zmm18 ; CCC RL21 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPSRLQ zmm18,zmm20,4 ; DDD RL24+RL25 QLC shift Hx to lower nibble in byte + VPUNPCKLBW zmm16,zmm18,zmm20 ; DDD RL24 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm1,zmm1,zmm30 ; CCC RL17 mask lower nibble + + VPUNPCKLBW zmm6,zmm7,zmm19 ; CCC RL22 p____5 l1 QQ0 [Lin0_RghtH] [... 
R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm2,zmm2,zmm30 ; CCC RL18 mask lower nibble + VPUNPCKHBW zmm7,zmm7,zmm19 ; CCC RL23 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPSRLQ zmm19,zmm21,4 ; DDD RL26+RL27 QLD shift Hx to lower nibble in byte + VPANDQ zmm16,zmm16,zmm30 ; DDD RL24 mask lower nibble + + VPUNPCKHBW zmm17,zmm18,zmm20 ; DDD RL25 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPANDQ zmm3,zmm3,zmm30 ; CCC RL19 mask lower nibble + VPUNPCKLBW zmm18,zmm19,zmm21 ; DDD RL26 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPANDQ zmm17,zmm17,zmm30 ; DDD RL25 mask lower nibble + + VPSHUFB zmm0,zmm31,zmm0 ; CCC RL16 shuffle_to_hex_digits + VPANDQ zmm4,zmm4,zmm30 ; CCC RL20 mask lower nibble + VPUNPCKHBW zmm19,zmm19,zmm21 ; DDD RL27 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPANDQ zmm18,zmm18,zmm30 ; DDD RL26 mask lower nibble + + VPSHUFB zmm1,zmm31,zmm1 ; CCC RL17 shuffle_to_hex_digits + VPANDQ zmm5,zmm5,zmm30 ; CCC RL21 mask lower nibble + VPSHUFB zmm16,zmm31,zmm16 ; DDD RL24 shuffle_to_hex_digits + VPANDQ zmm19,zmm19,zmm30 ; DDD RL27 mask lower nibble + + VPSHUFB zmm2,zmm31,zmm2 ; CCC RL18 shuffle_to_hex_digits + VMOVDQA64 [rdi+0*64],zmm0 ; CCC RL16 Store Hexdump + VMOVDQA64 [rdi+1*64],zmm1 ; CCC RL17 Store Hexdump + VPANDQ zmm6,zmm6,zmm30 ; CCC RL22 mask lower nibble + VPSHUFB zmm17,zmm31,zmm17 ; DDD RL25 shuffle_to_hex_digits + + VPSHUFB zmm3,zmm31,zmm3 ; CCC RL19 shuffle_to_hex_digits + VPANDQ zmm7,zmm7,zmm30 ; CCC RL23 mask lower nibble + VMOVDQA64 [rdi+2*64],zmm2 ; CCC RL18 Store Hexdump + VPSRLQ zmm2,zmm22,4 ; DDD RL28+RL29 QLE shift Hx to lower nibble in byte + VPSHUFB zmm18,zmm31,zmm18 ; DDD RL26 shuffle_to_hex_digits + VMOVDQA64 [rdi+3*64],zmm3 ; CCC RL19 Store Hexdump + + VPSRLQ zmm3,zmm23,4 ; DDD RL30+RL31 QLF shift Hx to lower nibble in byte + VPUNPCKLBW zmm20,zmm2,zmm22 ; DDD RL28 p____5 l1 QQ0 [Lin0_RghtH] [... 
R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPSHUFB zmm4,zmm31,zmm4 ; CCC RL20 shuffle_to_hex_digits + VMOVDQA64 [rdi+4*64],zmm4 ; CCC RL20 Store Hexdump + VPSHUFB zmm19,zmm31,zmm19 ; DDD RL27 shuffle_to_hex_digits + VPUNPCKHBW zmm21,zmm2,zmm22 ; DDD RL29 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSHUFB zmm5,zmm31,zmm5 ; CCC RL21 shuffle_to_hex_digits + VPANDQ zmm20,zmm20,zmm30 ; DDD RL28 mask lower nibble + VPUNPCKLBW zmm22,zmm3,zmm23 ; DDD RL30 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VMOVDQA64 [rdi+5*64],zmm5 ; CCC RL21 Store Hexdump + + VPSHUFB zmm6,zmm31,zmm6 ; CCC RL22 shuffle_to_hex_digits + VPANDQ zmm21,zmm21,zmm30 ; DDD RL29 mask lower nibble + VPUNPCKHBW zmm23,zmm3,zmm23 ; DDD RL31 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + VPSHUFB zmm7,zmm31,zmm7 ; CCC RL23 shuffle_to_hex_digits + VPANDQ zmm22,zmm22,zmm30 ; DDD RL30 mask lower nibble + VMOVDQA64 [rdi+6*64],zmm6 ; CCC RL22 Store Hexdump + VMOVDQA64 [rdi+7*64],zmm7 ; CCC RL23 Store Hexdump + VPSHUFB zmm20,zmm31,zmm20 ; DDD RL28 shuffle_to_hex_digits + VPANDQ zmm23,zmm23,zmm30 ; DDD RL31 mask lower nibble +; + VMOVDQA64 [rdi+8*64],zmm16 ; DDD RL24 Store Hexdump + VPERMQ zmm16,zmm29,[rsi+0*64] ; CCC p____5 l3+ QL8 RL16,RL17 + VMOVDQA64 [rdi+9*64],zmm17 ; DDD RL25 Store Hexdump + VPERMQ zmm17,zmm29,[rsi+1*64] ; CCC p____5 l3+ QL9 RL18,RL19 + VPSHUFB zmm21,zmm31,zmm21 ; DDD RL29 shuffle_to_hex_digits + + VMOVDQA64 [rdi+10*64],zmm18 ; DDD RL26 Store Hexdump + VPERMQ zmm18,zmm29,[rsi+2*64] ; CCC p____5 l3+ QLA RL20,RL21 + VPSRLQ zmm2,zmm8,4 ; AAA RL00+RL01 QL0 shift Hx to lower nibble in byte + VMOVDQA64 [rdi+11*64],zmm19 ; DDD RL27 Store Hexdump + VPERMQ zmm19,zmm29,[rsi+3*64] ; CCC p____5 l3+ QLB RL22,RL23 + + VPSHUFB zmm22,zmm31,zmm22 ; DDD RL30 shuffle_to_hex_digits + VPSRLQ zmm3,zmm9,4 ; AAA RL02+RL03 QL1 shift Hx to lower nibble in byte + VPSHUFB zmm23,zmm31,zmm23 ; DDD RL31 shuffle_to_hex_digits + + VPUNPCKLBW zmm0,zmm2,zmm8 ; AAA RL00 
p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VPSRLQ zmm6,zmm10,4 ; AAA RL04+RL05 QL2 shift Hx to lower nibble in byte + VMOVDQA64 [rdi+12*64],zmm20 ; DDD RL28 Store Hexdump + VPERMQ zmm20,zmm29,[rsi+4*64] ; DDD p____5 l3+ QLC RL24,RL25 + + VPUNPCKHBW zmm1,zmm2,zmm8 ; AAA RL01 p____5 l1 QQ0 [Lin0_LeftH] [... R03_HL1 R01_HL1 R03_HL0 R01_HL0] + VPSRLQ zmm7,zmm11,4 ; AAA RL06+RL07 QL3 shift Hx to lower nibble in byte + VMOVDQA64 [rdi+13*64],zmm21 ; DDD RL29 Store Hexdump + VPERMQ zmm21,zmm29,[rsi+5*64] ; DDD p____5 l3+ QLD RL26,RL27 + + VMOVDQA64 [rdi+14*64],zmm22 ; DDD RL30 Store Hexdump + VPERMQ zmm22,zmm29,[rsi+6*64] ; DDD p____5 l3+ QLE RL28,RL29 + VPUNPCKLBW zmm2,zmm3,zmm9 ; AAA RL02 p____5 l1 QQ0 [Lin0_RghtH] [... R02_HL1 R00_HL1 R02_HL0 R00_HL0] + VMOVDQA64 [rdi+15*64],zmm23 ; DDD RL31 Store Hexdump + VPERMQ zmm23,zmm29,[rsi+7*64] ; DDD p____5 l3+ QLF RL30,RL31 + + add rsi,rax ; add half the number of processed array elements + + VPUNPCKHBW zmm3,zmm3,zmm9 ; AAA RL03 p____5 l1 QQ0 [Lin0_LeftH] [... 
R03_HL1 R01_HL1 R03_HL0 R01_HL0] + + add rdi,rcx ; add half the number of processed output bytes + + cmp rsi,rdx ; compare incremented rsi (= pointer to next chunk) to last valid pointer +1 + jl .LHEXENCODE_LOOP + + ; end of normal loop reached + ; we can do one more round when original count has been reduced by one round + cmp rax,0 + je .LFINISH_EXTRA + + cmp rdx,r9 ; input buffer length was not reduced when equal + je .LFINISH_NORMAL + + add rax,rax ; rax is only half the bytes of input round, so double it + sub rsi,rax ; for prefetching the last round, load the last round again + sub rdx,rax ; adopt and condition for last round also + mov rax,0 + jmp .LHEXENCODE_LOOP + + +.LFINISH_EXTRA: + add rsi,NINP_BYTES_PER_ROUND ; add the extra round to get processed bytes + jmp .LFINISH + +.LFINISH_NORMAL: + sub rsi,NINP_BYTES_PER_ROUND ; sub the added prefetch round to get processed bytes + +.LFINISH: + ; r9 = address of requested input bytes+1 + ; rsi = address of processed input bytes+1 + ; now get the minimum of rdx,rsi to rax +;; sub rsi,rax ; for last round do nothing (rax=0), else sub increment for one round +;; sub r9,rax + + mov rax,r12 + cmp rsi,r12 ; get min from rdx (address of requested input) and rsi (address of done input) + + jge .LCALC_PROCESSED_BYTES + mov rax,rsi ; rax=address of last valid input byte+1 + +.LCALC_PROCESSED_BYTES: + sub rax,r10 ; sub the input buffer start address + ; rax = number of valid processed input bytes = return value + + cmp rsi,rdx ; compare rdx (address of requested input) and rsi (address of done input) + je .LNO_ZERO_OUT + + mov r15,rax ; number of elements to process + + shl r15,1 ; number of output bytes + + add r15,r11 ; pointer to next byte after full valid output buffer + + + VPXORQ zmm0,zmm0,zmm0 ; all zero +;ZERO VMOVDQU64 [r15],zmm0 ; zero out one register width after last output + +.LNO_ZERO_OUT: + +%ifdef __WIN__ + + VMOVDQA xmm6 ,[rsp ] + VMOVDQA xmm7 ,[rsp+1*16] + VMOVDQA xmm8 ,[rsp+2*16] + VMOVDQA xmm9 
,[rsp+3*16] + VMOVDQA xmm10,[rsp+4*16] + VMOVDQA xmm11,[rsp+5*16] + VMOVDQA xmm12,[rsp+6*16] + VMOVDQA xmm13,[rsp+7*16] + VMOVDQA xmm14,[rsp+8*16] + VMOVDQA xmm15,[rsp+9*16] + +%endif + + mov rdi,[rsp+STACK_FOR_XMM+0*8] + mov rsi,[rsp+STACK_FOR_XMM+1*8] + mov r12,[rsp+STACK_FOR_XMM+2*8] + mov r14,[rsp+STACK_FOR_XMM+3*8] + mov r15,[rsp+STACK_FOR_XMM+4*8] + + add rsp,STACK_ADJ + + ret + +;---------------------------------------------------------------------------------------------- + +%endif diff --git a/postgresql-15devel_orig/src/backend/utils/adt/varlena.c b/postgresql-15devel/src/backend/utils/adt/varlena.c index bd3091b..183f67f 100644 --- a/postgresql-15devel_orig/src/backend/utils/adt/varlena.c +++ b/postgresql-15devel/src/backend/utils/adt/varlena.c @@ -397,7 +397,7 @@ byteaout(PG_FUNCTION_ARGS) if (bytea_output == BYTEA_OUTPUT_HEX) { /* Print hex format */ - rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); + rp = result = palloc(hex_enc_len(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena)) + 2 + 1); *rp++ = '\\'; *rp++ = 'x'; rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); diff --git a/postgresql-15devel_orig/src/include/utils/builtins.h b/postgresql-15devel/src/include/utils/builtins.h index b07eefa..e6efb73 100644 --- a/postgresql-15devel_orig/src/include/utils/builtins.h +++ b/postgresql-15devel/src/include/utils/builtins.h @@ -35,6 +35,9 @@ extern int errdomainconstraint(Oid datatypeOid, const char *conname); extern uint64 hex_encode(const char *src, size_t len, char *dst); extern uint64 hex_decode(const char *src, size_t len, char *dst); +extern uint64 hex_enc_len(const char *src, size_t srclen); +extern uint64 hex_dec_len(const char *src, size_t srclen); + /* int.c */ extern int2vector *buildint2vector(const int16 *int2s, int n);