fix: breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s
This commit is contained in:
147
third_party/zlib-ng/arch/x86/Makefile.in
vendored
Normal file
147
third_party/zlib-ng/arch/x86/Makefile.in
vendored
Normal file
@@ -0,0 +1,147 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
|
||||
AVX512VNNIFLAG=-mavx512vnni -mbmi2
|
||||
AVX2FLAG=-mavx2 -mbmi2
|
||||
SSE2FLAG=-msse2
|
||||
SSSE3FLAG=-mssse3
|
||||
SSE42FLAG=-msse4.2
|
||||
PCLMULFLAG=-mpclmul
|
||||
VPCLMULFLAG=-mvpclmulqdq
|
||||
XSAVEFLAG=-mxsave
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: \
|
||||
x86_features.o x86_features.lo \
|
||||
adler32_avx2.o adler32_avx2.lo \
|
||||
adler32_avx512.o adler32_avx512.lo \
|
||||
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
|
||||
adler32_sse42.o adler32_sse42.lo \
|
||||
adler32_ssse3.o adler32_ssse3.lo \
|
||||
chunkset_avx2.o chunkset_avx2.lo \
|
||||
chunkset_avx512.o chunkset_avx512.lo \
|
||||
chunkset_sse2.o chunkset_sse2.lo \
|
||||
chunkset_ssse3.o chunkset_ssse3.lo \
|
||||
compare256_avx2.o compare256_avx2.lo \
|
||||
compare256_sse2.o compare256_sse2.lo \
|
||||
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
|
||||
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
|
||||
slide_hash_avx2.o slide_hash_avx2.lo \
|
||||
slide_hash_sse2.o slide_hash_sse2.lo
|
||||
|
||||
x86_features.o:
|
||||
$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
|
||||
|
||||
x86_features.lo:
|
||||
$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
|
||||
|
||||
chunkset_avx2.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
|
||||
|
||||
chunkset_avx2.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
|
||||
|
||||
chunkset_avx512.o:
|
||||
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
|
||||
|
||||
chunkset_avx512.lo:
|
||||
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
|
||||
|
||||
chunkset_sse2.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
|
||||
|
||||
chunkset_sse2.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
|
||||
|
||||
chunkset_ssse3.o:
|
||||
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
|
||||
|
||||
chunkset_ssse3.lo:
|
||||
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
|
||||
|
||||
compare256_avx2.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
|
||||
|
||||
compare256_avx2.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
|
||||
|
||||
compare256_sse2.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
|
||||
|
||||
compare256_sse2.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
|
||||
|
||||
crc32_pclmulqdq.o:
|
||||
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
|
||||
|
||||
crc32_pclmulqdq.lo:
|
||||
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
|
||||
|
||||
crc32_vpclmulqdq.o:
|
||||
$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
|
||||
|
||||
crc32_vpclmulqdq.lo:
|
||||
$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
|
||||
|
||||
slide_hash_avx2.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
|
||||
|
||||
slide_hash_avx2.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
|
||||
|
||||
slide_hash_sse2.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
|
||||
|
||||
slide_hash_sse2.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
|
||||
|
||||
adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
|
||||
|
||||
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
|
||||
|
||||
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
|
||||
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
|
||||
|
||||
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
|
||||
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
|
||||
|
||||
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
|
||||
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
|
||||
|
||||
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
|
||||
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
|
||||
|
||||
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
|
||||
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
|
||||
|
||||
adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
|
||||
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
||||
145
third_party/zlib-ng/arch/x86/adler32_avx2.c
vendored
Normal file
145
third_party/zlib-ng/arch/x86/adler32_avx2.c
vendored
Normal file
@@ -0,0 +1,145 @@
|
||||
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX2
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#include "adler32_p.h"
|
||||
#include "adler32_avx2_p.h"
|
||||
#include "x86_intrins.h"
|
||||
|
||||
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
|
||||
|
||||
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 16) {
|
||||
if (COPY) {
|
||||
return adler32_copy_len_16(adler0, src, dst, len, adler1);
|
||||
} else {
|
||||
return adler32_len_16(adler0, src, len, adler1);
|
||||
}
|
||||
} else if (len < 32) {
|
||||
if (COPY) {
|
||||
return adler32_fold_copy_sse42(adler, dst, src, len);
|
||||
} else {
|
||||
return adler32_ssse3(adler, src, len);
|
||||
}
|
||||
}
|
||||
|
||||
__m256i vs1, vs2;
|
||||
|
||||
const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
|
||||
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m256i dot3v = _mm256_set1_epi16(1);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
|
||||
while (len >= 32) {
|
||||
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
|
||||
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
|
||||
__m256i vs1_0 = vs1;
|
||||
__m256i vs3 = _mm256_setzero_si256();
|
||||
|
||||
size_t k = MIN(len, NMAX);
|
||||
k -= k % 32;
|
||||
len -= k;
|
||||
|
||||
while (k >= 32) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
|
||||
*/
|
||||
__m256i vbuf = _mm256_loadu_si256((__m256i*)src);
|
||||
src += 32;
|
||||
k -= 32;
|
||||
|
||||
__m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
|
||||
|
||||
if (COPY) {
|
||||
_mm256_storeu_si256((__m256i*)dst, vbuf);
|
||||
dst += 32;
|
||||
}
|
||||
|
||||
vs1 = _mm256_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm256_add_epi32(vs3, vs1_0);
|
||||
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
|
||||
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
|
||||
vs2 = _mm256_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
/* Defer the multiplication with 32 to outside of the loop */
|
||||
vs3 = _mm256_slli_epi32(vs3, 5);
|
||||
vs2 = _mm256_add_epi32(vs2, vs3);
|
||||
|
||||
/* The compiler is generating the following sequence for this integer modulus
|
||||
* when done the scalar way, in GPRs:
|
||||
|
||||
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
|
||||
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
|
||||
|
||||
mov $0x80078071,%edi // move magic constant into 32 bit register %edi
|
||||
...
|
||||
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
|
||||
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
|
||||
imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
|
||||
shr $0x2f,%rsi // shift right by 47
|
||||
imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
|
||||
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
|
||||
...
|
||||
// repeats for each element with vpextract instructions
|
||||
|
||||
This is tricky with AVX2 for a number of reasons:
|
||||
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
|
||||
2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
|
||||
back down to 32 bit precision later (there is in AVX512)
|
||||
3.) Full width integer multiplications aren't cheap
|
||||
|
||||
We can, however, do a relatively cheap sequence for horizontal sums.
|
||||
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
|
||||
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
|
||||
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
|
||||
performed on the maximum possible inputs before overflow
|
||||
*/
|
||||
|
||||
|
||||
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
|
||||
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
|
||||
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
|
||||
* what the compiler is doing to avoid integer divisions. */
|
||||
adler0 = partial_hsum256(vs1) % BASE;
|
||||
adler1 = hsum256(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
|
||||
return adler32_fold_copy_impl(adler, NULL, src, len, 0);
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
return adler32_fold_copy_impl(adler, dst, src, len, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
32
third_party/zlib-ng/arch/x86/adler32_avx2_p.h
vendored
Normal file
32
third_party/zlib-ng/arch/x86/adler32_avx2_p.h
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
/* adler32_avx2_p.h -- adler32 avx2 utility functions
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_AVX2_P_H_
|
||||
#define ADLER32_AVX2_P_H_
|
||||
|
||||
#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
|
||||
|
||||
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
|
||||
static inline uint32_t hsum256(__m256i x) {
|
||||
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
|
||||
_mm256_castsi256_si128(x));
|
||||
__m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
|
||||
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
|
||||
static inline uint32_t partial_hsum256(__m256i x) {
|
||||
/* We need a permutation vector to extract every other integer. The
|
||||
* rest are going to be zeros */
|
||||
const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
|
||||
__m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
|
||||
__m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
|
||||
__m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
|
||||
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
108
third_party/zlib-ng/arch/x86/adler32_avx512.c
vendored
Normal file
108
third_party/zlib-ng/arch/x86/adler32_avx512.c
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX512
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
#include "arch_functions.h"
|
||||
#include <immintrin.h>
|
||||
#include "x86_intrins.h"
|
||||
#include "adler32_avx512_p.h"
|
||||
|
||||
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 64) {
|
||||
/* This handles the remaining copies, just call normal adler checksum after this */
|
||||
if (COPY) {
|
||||
__mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
|
||||
__m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
|
||||
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
|
||||
}
|
||||
|
||||
return adler32_avx2(adler, src, len);
|
||||
}
|
||||
|
||||
__m512i vbuf, vs1_0, vs3;
|
||||
|
||||
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
||||
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63, 64);
|
||||
const __m512i dot3v = _mm512_set1_epi16(1);
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
size_t k;
|
||||
|
||||
while (len >= 64) {
|
||||
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
|
||||
__m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
|
||||
vs1_0 = vs1;
|
||||
vs3 = _mm512_setzero_si512();
|
||||
|
||||
k = MIN(len, NMAX);
|
||||
k -= k % 64;
|
||||
len -= k;
|
||||
|
||||
while (k >= 64) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm512_loadu_si512(src);
|
||||
|
||||
if (COPY) {
|
||||
_mm512_storeu_si512(dst, vbuf);
|
||||
dst += 64;
|
||||
}
|
||||
|
||||
src += 64;
|
||||
k -= 64;
|
||||
|
||||
__m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
|
||||
__m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
|
||||
vs1 = _mm512_add_epi32(vs1_sad, vs1);
|
||||
vs3 = _mm512_add_epi32(vs3, vs1_0);
|
||||
__m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
|
||||
vs2 = _mm512_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs3 = _mm512_slli_epi32(vs3, 6);
|
||||
vs2 = _mm512_add_epi32(vs2, vs3);
|
||||
|
||||
adler0 = partial_hsum(vs1) % BASE;
|
||||
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
/* Process tail (len < 64). */
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
return adler32_fold_copy_impl(adler, dst, src, len, 1);
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
|
||||
return adler32_fold_copy_impl(adler, NULL, src, len, 0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
57
third_party/zlib-ng/arch/x86/adler32_avx512_p.h
vendored
Normal file
57
third_party/zlib-ng/arch/x86/adler32_avx512_p.h
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
#ifndef AVX512_FUNCS_H
|
||||
#define AVX512_FUNCS_H
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */
|
||||
#if defined(_MSC_VER) && !defined(_MM_K0_REG8)
|
||||
# undef _mm512_extracti64x4_epi64
|
||||
# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1)
|
||||
# undef _mm512_set1_epi16
|
||||
# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1)
|
||||
# undef _mm512_maddubs_epi16
|
||||
# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2)
|
||||
#endif
|
||||
|
||||
/* Written because *_add_epi32(a) sets off ubsan */
|
||||
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
|
||||
__m256i a = _mm512_extracti64x4_epi64(x, 1);
|
||||
__m256i b = _mm512_extracti64x4_epi64(x, 0);
|
||||
|
||||
__m256i a_plus_b = _mm256_add_epi32(a, b);
|
||||
__m128i c = _mm256_extracti128_si256(a_plus_b, 1);
|
||||
__m128i d = _mm256_extracti128_si256(a_plus_b, 0);
|
||||
__m128i c_plus_d = _mm_add_epi32(c, d);
|
||||
|
||||
__m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
|
||||
__m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
|
||||
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
|
||||
__m128i sum4 = _mm_add_epi32(sum2, sum3);
|
||||
|
||||
return _mm_cvtsi128_si32(sum4);
|
||||
}
|
||||
|
||||
static inline uint32_t partial_hsum(__m512i x) {
|
||||
/* We need a permutation vector to extract every other integer. The
|
||||
* rest are going to be zeros. Marking this const so the compiler stands
|
||||
* a better chance of keeping this resident in a register through entire
|
||||
* loop execution. We certainly have enough zmm registers (32) */
|
||||
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
|
||||
1, 1, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
__m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
|
||||
|
||||
/* From here, it's a simple 256 bit wide reduction sum */
|
||||
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
|
||||
|
||||
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
|
||||
* pretty slow, much slower than the longer instruction sequence below */
|
||||
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
|
||||
_mm256_castsi256_si128(non_zero_avx));
|
||||
__m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
|
||||
__m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
|
||||
#endif
|
||||
210
third_party/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
Normal file
210
third_party/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
Normal file
@@ -0,0 +1,210 @@
|
||||
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
|
||||
* Based on Brian Bockelman's AVX2 version
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX512VNNI
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
#include "arch_functions.h"
|
||||
#include <immintrin.h>
|
||||
#include "x86_intrins.h"
|
||||
#include "adler32_avx512_p.h"
|
||||
#include "adler32_avx2_p.h"
|
||||
|
||||
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 32)
|
||||
return adler32_ssse3(adler, src, len);
|
||||
|
||||
if (len < 64)
|
||||
return adler32_avx2(adler, src, len);
|
||||
|
||||
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
||||
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63, 64);
|
||||
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
__m512i vs1, vs2;
|
||||
|
||||
while (len >= 64) {
|
||||
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
|
||||
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
|
||||
size_t k = MIN(len, NMAX);
|
||||
k -= k % 64;
|
||||
len -= k;
|
||||
__m512i vs1_0 = vs1;
|
||||
__m512i vs3 = _mm512_setzero_si512();
|
||||
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
|
||||
__m512i vs2_1 = _mm512_setzero_si512();
|
||||
__m512i vbuf0, vbuf1;
|
||||
|
||||
/* Remainder peeling */
|
||||
if (k % 128) {
|
||||
vbuf1 = _mm512_loadu_si512((__m512i*)src);
|
||||
|
||||
src += 64;
|
||||
k -= 64;
|
||||
|
||||
__m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
|
||||
vs1 = _mm512_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm512_add_epi32(vs3, vs1_0);
|
||||
vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
/* Manually unrolled this loop by 2 for an decent amount of ILP */
|
||||
while (k >= 128) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
|
||||
*/
|
||||
vbuf0 = _mm512_loadu_si512((__m512i*)src);
|
||||
vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
|
||||
src += 128;
|
||||
k -= 128;
|
||||
|
||||
__m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
|
||||
vs1 = _mm512_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm512_add_epi32(vs3, vs1_0);
|
||||
/* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
|
||||
* instructions to eliminate them */
|
||||
vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
|
||||
|
||||
vs3 = _mm512_add_epi32(vs3, vs1);
|
||||
vs1_sad = _mm512_sad_epu8(vbuf1, zero);
|
||||
vs1 = _mm512_add_epi32(vs1, vs1_sad);
|
||||
vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs3 = _mm512_slli_epi32(vs3, 6);
|
||||
vs2 = _mm512_add_epi32(vs2, vs3);
|
||||
vs2 = _mm512_add_epi32(vs2, vs2_1);
|
||||
|
||||
adler0 = partial_hsum(vs1) % BASE;
|
||||
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
/* Process tail (len < 64). */
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel_copy:
|
||||
if (len < 32) {
|
||||
/* This handles the remaining copies, just call normal adler checksum after this */
|
||||
__mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
|
||||
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
|
||||
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
|
||||
|
||||
return adler32_ssse3(adler, src, len);
|
||||
}
|
||||
|
||||
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
|
||||
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
__m256i vs1, vs2;
|
||||
|
||||
while (len >= 32) {
|
||||
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
|
||||
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
|
||||
size_t k = MIN(len, NMAX);
|
||||
k -= k % 32;
|
||||
len -= k;
|
||||
__m256i vs1_0 = vs1;
|
||||
__m256i vs3 = _mm256_setzero_si256();
|
||||
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
|
||||
__m256i vs2_1 = _mm256_setzero_si256();
|
||||
__m256i vbuf0, vbuf1;
|
||||
|
||||
/* Remainder peeling */
|
||||
if (k % 64) {
|
||||
vbuf1 = _mm256_loadu_si256((__m256i*)src);
|
||||
_mm256_storeu_si256((__m256i*)dst, vbuf1);
|
||||
dst += 32;
|
||||
|
||||
src += 32;
|
||||
k -= 32;
|
||||
|
||||
__m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
|
||||
vs1 = _mm256_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm256_add_epi32(vs3, vs1_0);
|
||||
vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
/* Manually unrolled this loop by 2 for an decent amount of ILP */
|
||||
while (k >= 64) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
|
||||
*/
|
||||
vbuf0 = _mm256_loadu_si256((__m256i*)src);
|
||||
vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
|
||||
_mm256_storeu_si256((__m256i*)dst, vbuf0);
|
||||
_mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
|
||||
dst += 64;
|
||||
src += 64;
|
||||
k -= 64;
|
||||
|
||||
__m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
|
||||
vs1 = _mm256_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm256_add_epi32(vs3, vs1_0);
|
||||
/* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
|
||||
* instructions to eliminate them */
|
||||
vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
|
||||
|
||||
vs3 = _mm256_add_epi32(vs3, vs1);
|
||||
vs1_sad = _mm256_sad_epu8(vbuf1, zero);
|
||||
vs1 = _mm256_add_epi32(vs1, vs1_sad);
|
||||
vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs3 = _mm256_slli_epi32(vs3, 5);
|
||||
vs2 = _mm256_add_epi32(vs2, vs3);
|
||||
vs2 = _mm256_add_epi32(vs2, vs2_1);
|
||||
|
||||
adler0 = partial_hsum256(vs1) % BASE;
|
||||
adler1 = hsum256(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
/* Process tail (len < 64). */
|
||||
if (len) {
|
||||
goto rem_peel_copy;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
#endif
|
||||
120
third_party/zlib-ng/arch/x86/adler32_sse42.c
vendored
Normal file
120
third_party/zlib-ng/arch/x86/adler32_sse42.c
vendored
Normal file
@@ -0,0 +1,120 @@
|
||||
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
#include "adler32_ssse3_p.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef X86_SSE42
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 16) {
|
||||
return adler32_copy_len_16(adler0, src, dst, len, adler1);
|
||||
}
|
||||
|
||||
__m128i vbuf, vbuf_0;
|
||||
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
|
||||
v_sad_sum2, vsum2, vsum2_0;
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
|
||||
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m128i dot3v = _mm_set1_epi16(1);
|
||||
size_t k;
|
||||
|
||||
while (len >= 16) {
|
||||
|
||||
k = MIN(len, NMAX);
|
||||
k -= k % 16;
|
||||
len -= k;
|
||||
|
||||
vs1 = _mm_cvtsi32_si128(adler0);
|
||||
vs2 = _mm_cvtsi32_si128(adler1);
|
||||
|
||||
vs3 = _mm_setzero_si128();
|
||||
vs2_0 = _mm_setzero_si128();
|
||||
vs1_0 = vs1;
|
||||
|
||||
while (k >= 32) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm_loadu_si128((__m128i*)src);
|
||||
vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
|
||||
src += 32;
|
||||
k -= 32;
|
||||
|
||||
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
|
||||
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
|
||||
_mm_storeu_si128((__m128i*)dst, vbuf);
|
||||
_mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
|
||||
dst += 32;
|
||||
|
||||
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
|
||||
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
|
||||
|
||||
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
|
||||
vs3 = _mm_add_epi32(vs1_0, vs3);
|
||||
|
||||
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
|
||||
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
|
||||
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
|
||||
vs2 = _mm_add_epi32(vsum2, vs2);
|
||||
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs2 = _mm_add_epi32(vs2_0, vs2);
|
||||
vs3 = _mm_slli_epi32(vs3, 5);
|
||||
vs2 = _mm_add_epi32(vs3, vs2);
|
||||
vs3 = _mm_setzero_si128();
|
||||
|
||||
while (k >= 16) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm_loadu_si128((__m128i*)src);
|
||||
src += 16;
|
||||
k -= 16;
|
||||
|
||||
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
|
||||
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
|
||||
|
||||
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
|
||||
vs3 = _mm_add_epi32(vs1_0, vs3);
|
||||
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
|
||||
vs2 = _mm_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
|
||||
_mm_storeu_si128((__m128i*)dst, vbuf);
|
||||
dst += 16;
|
||||
}
|
||||
|
||||
vs3 = _mm_slli_epi32(vs3, 4);
|
||||
vs2 = _mm_add_epi32(vs2, vs3);
|
||||
|
||||
adler0 = partial_hsum(vs1) % BASE;
|
||||
adler1 = hsum(vs2) % BASE;
|
||||
}
|
||||
|
||||
/* If this is true, there's fewer than 16 elements remaining */
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler0 | (adler1 << 16);
|
||||
}
|
||||
|
||||
#endif
|
||||
156
third_party/zlib-ng/arch/x86/adler32_ssse3.c
vendored
Normal file
156
third_party/zlib-ng/arch/x86/adler32_ssse3.c
vendored
Normal file
@@ -0,0 +1,156 @@
|
||||
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
#include "adler32_ssse3_p.h"
|
||||
|
||||
#ifdef X86_SSSE3
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
|
||||
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m128i dot3v = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
__m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
|
||||
vbuf_0, v_sad_sum2, vsum2, vsum2_0;
|
||||
|
||||
/* If our buffer is unaligned (likely), make the determination whether
|
||||
* or not there's enough of a buffer to consume to make the scalar, aligning
|
||||
* additions worthwhile or if it's worth it to just eat the cost of an unaligned
|
||||
* load. This is a pretty simple test, just test if 16 - the remainder + len is
|
||||
* < 16 */
|
||||
size_t max_iters = NMAX;
|
||||
size_t rem = (uintptr_t)buf & 15;
|
||||
size_t align_offset = 16 - rem;
|
||||
size_t k = 0;
|
||||
if (rem) {
|
||||
if (len < 16 + align_offset) {
|
||||
/* Let's eat the cost of this one unaligned load so that
|
||||
* we don't completely skip over the vectorization. Doing
|
||||
* 16 bytes at a time unaligned is better than 16 + <= 15
|
||||
* sums */
|
||||
vbuf = _mm_loadu_si128((__m128i*)buf);
|
||||
len -= 16;
|
||||
buf += 16;
|
||||
vs1 = _mm_cvtsi32_si128(adler);
|
||||
vs2 = _mm_cvtsi32_si128(sum2);
|
||||
vs3 = _mm_setzero_si128();
|
||||
vs1_0 = vs1;
|
||||
goto unaligned_jmp;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < align_offset; ++i) {
|
||||
adler += *(buf++);
|
||||
sum2 += adler;
|
||||
}
|
||||
|
||||
/* lop off the max number of sums based on the scalar sums done
|
||||
* above */
|
||||
len -= align_offset;
|
||||
max_iters -= align_offset;
|
||||
}
|
||||
|
||||
|
||||
while (len >= 16) {
|
||||
vs1 = _mm_cvtsi32_si128(adler);
|
||||
vs2 = _mm_cvtsi32_si128(sum2);
|
||||
vs3 = _mm_setzero_si128();
|
||||
vs2_0 = _mm_setzero_si128();
|
||||
vs1_0 = vs1;
|
||||
|
||||
k = (len < max_iters ? len : max_iters);
|
||||
k -= k % 16;
|
||||
len -= k;
|
||||
|
||||
while (k >= 32) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm_load_si128((__m128i*)buf);
|
||||
vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
|
||||
buf += 32;
|
||||
k -= 32;
|
||||
|
||||
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
|
||||
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
|
||||
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
|
||||
vs3 = _mm_add_epi32(vs1_0, vs3);
|
||||
|
||||
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
|
||||
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
|
||||
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
|
||||
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
|
||||
vs2 = _mm_add_epi32(vsum2, vs2);
|
||||
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
|
||||
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs2 = _mm_add_epi32(vs2_0, vs2);
|
||||
vs3 = _mm_slli_epi32(vs3, 5);
|
||||
vs2 = _mm_add_epi32(vs3, vs2);
|
||||
vs3 = _mm_setzero_si128();
|
||||
|
||||
while (k >= 16) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm_load_si128((__m128i*)buf);
|
||||
buf += 16;
|
||||
k -= 16;
|
||||
|
||||
unaligned_jmp:
|
||||
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
|
||||
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
|
||||
vs3 = _mm_add_epi32(vs1_0, vs3);
|
||||
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
|
||||
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
|
||||
vs2 = _mm_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs3 = _mm_slli_epi32(vs3, 4);
|
||||
vs2 = _mm_add_epi32(vs2, vs3);
|
||||
|
||||
/* We don't actually need to do a full horizontal sum, since psadbw is actually doing
|
||||
* a partial reduction sum implicitly and only summing to integers in vector positions
|
||||
* 0 and 2. This saves us some contention on the shuffle port(s) */
|
||||
adler = partial_hsum(vs1) % BASE;
|
||||
sum2 = hsum(vs2) % BASE;
|
||||
max_iters = NMAX;
|
||||
}
|
||||
|
||||
/* Process tail (len < 16). */
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
}
|
||||
|
||||
#endif
|
||||
29
third_party/zlib-ng/arch/x86/adler32_ssse3_p.h
vendored
Normal file
29
third_party/zlib-ng/arch/x86/adler32_ssse3_p.h
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_SSSE3_P_H_
|
||||
#define ADLER32_SSSE3_P_H_
|
||||
|
||||
#ifdef X86_SSSE3
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static inline uint32_t partial_hsum(__m128i x) {
|
||||
__m128i second_int = _mm_srli_si128(x, 8);
|
||||
__m128i sum = _mm_add_epi32(x, second_int);
|
||||
return _mm_cvtsi128_si32(sum);
|
||||
}
|
||||
|
||||
static inline uint32_t hsum(__m128i x) {
|
||||
__m128i sum1 = _mm_unpackhi_epi64(x, x);
|
||||
__m128i sum2 = _mm_add_epi32(x, sum1);
|
||||
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
|
||||
__m128i sum4 = _mm_add_epi32(sum2, sum3);
|
||||
return _mm_cvtsi128_si32(sum4);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
44
third_party/zlib-ng/arch/x86/avx2_tables.h
vendored
Normal file
44
third_party/zlib-ng/arch/x86/avx2_tables.h
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
#ifndef _AVX2_TABLES_H
|
||||
#define _AVX2_TABLES_H
|
||||
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
|
||||
* never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
|
||||
static const lut_rem_pair perm_idx_lut[29] = {
|
||||
{ 0, 2}, /* 3 */
|
||||
{ 0, 0}, /* don't care */
|
||||
{ 1 * 32, 2}, /* 5 */
|
||||
{ 2 * 32, 2}, /* 6 */
|
||||
{ 3 * 32, 4}, /* 7 */
|
||||
{ 0 * 32, 0}, /* don't care */
|
||||
{ 4 * 32, 5}, /* 9 */
|
||||
{ 5 * 32, 22}, /* 10 */
|
||||
{ 6 * 32, 21}, /* 11 */
|
||||
{ 7 * 32, 20}, /* 12 */
|
||||
{ 8 * 32, 6}, /* 13 */
|
||||
{ 9 * 32, 4}, /* 14 */
|
||||
{10 * 32, 2}, /* 15 */
|
||||
{ 0 * 32, 0}, /* don't care */
|
||||
{11 * 32, 15}, /* 17 */
|
||||
{11 * 32 + 16, 14}, /* 18 */
|
||||
{11 * 32 + 16 * 2, 13}, /* 19 */
|
||||
{11 * 32 + 16 * 3, 12}, /* 20 */
|
||||
{11 * 32 + 16 * 4, 11}, /* 21 */
|
||||
{11 * 32 + 16 * 5, 10}, /* 22 */
|
||||
{11 * 32 + 16 * 6, 9}, /* 23 */
|
||||
{11 * 32 + 16 * 7, 8}, /* 24 */
|
||||
{11 * 32 + 16 * 8, 7}, /* 25 */
|
||||
{11 * 32 + 16 * 9, 6}, /* 26 */
|
||||
{11 * 32 + 16 * 10, 5}, /* 27 */
|
||||
{11 * 32 + 16 * 11, 4}, /* 28 */
|
||||
{11 * 32 + 16 * 12, 3}, /* 29 */
|
||||
{11 * 32 + 16 * 13, 2}, /* 30 */
|
||||
{11 * 32 + 16 * 14, 1} /* 31 */
|
||||
};
|
||||
|
||||
static const uint16_t half_rem_vals[13] = {
|
||||
1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
|
||||
};
|
||||
|
||||
#endif
|
||||
130
third_party/zlib-ng/arch/x86/chunkset_avx2.c
vendored
Normal file
130
third_party/zlib-ng/arch/x86/chunkset_avx2.c
vendored
Normal file
@@ -0,0 +1,130 @@
|
||||
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "zmemory.h"
|
||||
|
||||
#ifdef X86_AVX2
|
||||
#include "avx2_tables.h"
|
||||
#include <immintrin.h>
|
||||
#include "x86_intrins.h"
|
||||
|
||||
typedef __m256i chunk_t;
|
||||
typedef __m128i halfchunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNKMEMSET_16
|
||||
#define HAVE_CHUNK_MAG
|
||||
#define HAVE_HALF_CHUNK
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi16(zng_memread_2(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi32(zng_memread_4(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi64x(zng_memread_8(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
|
||||
/* See explanation in chunkset_avx512.c */
|
||||
#if defined(_MSC_VER) && _MSC_VER <= 1900
|
||||
halfchunk_t half = _mm_loadu_si128((__m128i*)from);
|
||||
*chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
|
||||
#else
|
||||
*chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = _mm256_loadu_si256((__m256i *)s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
_mm256_storeu_si256((__m256i *)out, *chunk);
|
||||
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m256i ret_vec;
|
||||
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
|
||||
* compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
|
||||
* GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
/* See note in chunkset_ssse3.c for why this is ok */
|
||||
__msan_unpoison(buf + dist, 32 - dist);
|
||||
|
||||
if (dist < 16) {
|
||||
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
|
||||
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
|
||||
* shuffles and combining the halves later */
|
||||
const __m256i permute_xform =
|
||||
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
|
||||
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
|
||||
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
|
||||
} else {
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
|
||||
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
|
||||
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
|
||||
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
|
||||
/* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
|
||||
* shuffle those values */
|
||||
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
|
||||
}
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
|
||||
*chunk = _mm_loadu_si128((__m128i *)s);
|
||||
}
|
||||
|
||||
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
|
||||
_mm_storeu_si128((__m128i *)out, *chunk);
|
||||
}
|
||||
|
||||
static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
|
||||
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
|
||||
* unlikely to be actually written or read from */
|
||||
return _mm256_zextsi128_si256(*chunk);
|
||||
}
|
||||
|
||||
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m128i perm_vec, ret_vec;
|
||||
__msan_unpoison(buf + dist, 16 - dist);
|
||||
ret_vec = _mm_loadu_si128((__m128i*)buf);
|
||||
*chunk_rem = half_rem_vals[dist - 3];
|
||||
|
||||
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_avx2
|
||||
#define CHUNKCOPY chunkcopy_avx2
|
||||
#define CHUNKUNROLL chunkunroll_avx2
|
||||
#define CHUNKMEMSET chunkmemset_avx2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_avx2
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
182
third_party/zlib-ng/arch/x86/chunkset_avx512.c
vendored
Normal file
182
third_party/zlib-ng/arch/x86/chunkset_avx512.c
vendored
Normal file
@@ -0,0 +1,182 @@
|
||||
/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "zmemory.h"
|
||||
|
||||
#ifdef X86_AVX512
|
||||
|
||||
#include "avx2_tables.h"
|
||||
#include <immintrin.h>
|
||||
#include "x86_intrins.h"
|
||||
|
||||
typedef __m256i chunk_t;
|
||||
typedef __m128i halfchunk_t;
|
||||
typedef __mmask32 mask_t;
|
||||
typedef __mmask16 halfmask_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNKMEMSET_16
|
||||
#define HAVE_CHUNK_MAG
|
||||
#define HAVE_HALF_CHUNK
|
||||
#define HAVE_MASKED_READWRITE
|
||||
#define HAVE_CHUNKCOPY
|
||||
#define HAVE_HALFCHUNKCOPY
|
||||
|
||||
static inline halfmask_t gen_half_mask(unsigned len) {
|
||||
return (halfmask_t)_bzhi_u32(0xFFFF, len);
|
||||
}
|
||||
|
||||
static inline mask_t gen_mask(unsigned len) {
|
||||
return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi16(zng_memread_2(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi32(zng_memread_4(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi64x(zng_memread_8(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
|
||||
/* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where
|
||||
* the load is dumped to the stack with an aligned move for this memory-register
|
||||
* broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to
|
||||
* stack doesn't exist if compiled with optimizations. For the sake of working
|
||||
* properly in a debugger, let's take the 2 cycle penalty */
|
||||
#if defined(_MSC_VER) && _MSC_VER <= 1900
|
||||
halfchunk_t half = _mm_loadu_si128((__m128i*)from);
|
||||
*chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
|
||||
#else
|
||||
*chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = _mm256_loadu_si256((__m256i *)s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
_mm256_storeu_si256((__m256i *)out, *chunk);
|
||||
}
|
||||
|
||||
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
Assert(len > 0, "chunkcopy should never have a length 0");
|
||||
|
||||
chunk_t chunk;
|
||||
uint32_t rem = len % sizeof(chunk_t);
|
||||
|
||||
if (len < sizeof(chunk_t)) {
|
||||
mask_t rem_mask = gen_mask(rem);
|
||||
chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
|
||||
_mm256_mask_storeu_epi8(out, rem_mask, chunk);
|
||||
return out + rem;
|
||||
}
|
||||
|
||||
loadchunk(from, &chunk);
|
||||
rem = (rem == 0) ? sizeof(chunk_t) : rem;
|
||||
storechunk(out, &chunk);
|
||||
out += rem;
|
||||
from += rem;
|
||||
len -= rem;
|
||||
|
||||
while (len > 0) {
|
||||
loadchunk(from, &chunk);
|
||||
storechunk(out, &chunk);
|
||||
out += sizeof(chunk_t);
|
||||
from += sizeof(chunk_t);
|
||||
len -= sizeof(chunk_t);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m256i ret_vec;
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
/* See the AVX2 implementation for more detailed comments. This is that + some masked
|
||||
* loads to avoid an out of bounds read on the heap */
|
||||
|
||||
if (dist < 16) {
|
||||
const __m256i permute_xform =
|
||||
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
|
||||
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
|
||||
halfmask_t load_mask = gen_half_mask(dist);
|
||||
__m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
|
||||
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
|
||||
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
|
||||
} else {
|
||||
halfmask_t load_mask = gen_half_mask(dist - 16);
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
__m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
|
||||
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
|
||||
__m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
|
||||
}
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
|
||||
_mm_storeu_si128((__m128i *)out, *chunk);
|
||||
}
|
||||
|
||||
static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
|
||||
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
|
||||
* unlikely to be actually written or read from */
|
||||
return _mm256_zextsi128_si256(*chunk);
|
||||
}
|
||||
|
||||
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m128i perm_vec, ret_vec;
|
||||
halfmask_t load_mask = gen_half_mask(dist);
|
||||
ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
|
||||
*chunk_rem = half_rem_vals[dist - 3];
|
||||
|
||||
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
Assert(len > 0, "chunkcopy should never have a length 0");
|
||||
halfchunk_t chunk;
|
||||
|
||||
uint32_t rem = len % sizeof(halfchunk_t);
|
||||
if (rem == 0) {
|
||||
rem = sizeof(halfchunk_t);
|
||||
}
|
||||
|
||||
halfmask_t rem_mask = gen_half_mask(rem);
|
||||
chunk = _mm_maskz_loadu_epi8(rem_mask, from);
|
||||
_mm_mask_storeu_epi8(out, rem_mask, chunk);
|
||||
|
||||
return out + rem;
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_avx512
|
||||
#define CHUNKUNROLL chunkunroll_avx512
|
||||
#define CHUNKMEMSET chunkmemset_avx512
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_avx512
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
49
third_party/zlib-ng/arch/x86/chunkset_sse2.c
vendored
Normal file
49
third_party/zlib-ng/arch/x86/chunkset_sse2.c
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zmemory.h"
|
||||
|
||||
#ifdef X86_SSE2
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi16(zng_memread_2(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi32(zng_memread_4(from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi64x(zng_memread_8(from));
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = _mm_loadu_si128((__m128i *)s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
_mm_storeu_si128((__m128i *)out, *chunk);
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_sse2
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
#define CHUNKMEMSET chunkmemset_sse2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_sse2
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
||||
86
third_party/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
Normal file
86
third_party/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zmemory.h"
|
||||
|
||||
#if defined(X86_SSSE3)
|
||||
#include <immintrin.h>
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
|
||||
static const lut_rem_pair perm_idx_lut[13] = {
|
||||
{0, 1}, /* 3 */
|
||||
{0, 0}, /* don't care */
|
||||
{1 * 32, 1}, /* 5 */
|
||||
{2 * 32, 4}, /* 6 */
|
||||
{3 * 32, 2}, /* 7 */
|
||||
{0 * 32, 0}, /* don't care */
|
||||
{4 * 32, 7}, /* 9 */
|
||||
{5 * 32, 6}, /* 10 */
|
||||
{6 * 32, 5}, /* 11 */
|
||||
{7 * 32, 4}, /* 12 */
|
||||
{8 * 32, 3}, /* 13 */
|
||||
{9 * 32, 2}, /* 14 */
|
||||
{10 * 32, 1},/* 15 */
|
||||
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm_set1_epi16(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm_set1_epi32(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm_set1_epi64x(zng_memread_8(from));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
    /* Important to note:
     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
     * bytes we willingly and purposefully load uninitialized that we swizzle over
     * in a vector register, anyway. If what we assume is wrong about what is used,
     * the memory sanitizer will still usefully flag it */
    __msan_unpoison(buf + dist, 16 - dist);
    ret_vec = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = lut_rem.remval;

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}

#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_ssse3
#define CHUNKUNROLL chunkunroll_ssse3

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_ssse3

#include "inffast_tpl.h"

#endif
64
third_party/zlib-ng/arch/x86/compare256_avx2.c
vendored
Normal file
@@ -0,0 +1,64 @@
/* compare256_avx2.c -- AVX2 version of compare256
 * Copyright Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

#include <immintrin.h>
#ifdef _MSC_VER
#  include <nmmintrin.h>
#endif

static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        __m256i ymm_src0, ymm_src1, ymm_cmp;
        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
            return len + match_byte;
        }

        src0 += 32, src1 += 32, len += 32;

        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        src0 += 32, src1 += 32, len += 32;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_avx2_static(src0, src1);
}

#define LONGEST_MATCH longest_match_avx2
#define COMPARE256 compare256_avx2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_avx2
#define COMPARE256 compare256_avx2_static

#include "match_tpl.h"

#endif
97
third_party/zlib-ng/arch/x86/compare256_sse2.c
vendored
Normal file
@@ -0,0 +1,97 @@
/* compare256_sse2.c -- SSE2 version of compare256
 * Copyright Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

#include <emmintrin.h>

static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    int align_offset = ((uintptr_t)src0) & 15;
    const uint8_t *end0 = src0 + 256;
    const uint8_t *end1 = src1 + 256;
    __m128i xmm_src0, xmm_src1, xmm_cmp;

    /* Do the first load unaligned, then for all subsequent ones we have at least
     * one aligned load. Sadly aligning both loads is probably unrealistic */
    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (mask != 0xFFFF) {
        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
        return len + match_byte;
    }

    int align_adv = 16 - align_offset;
    len += align_adv;
    src0 += align_adv;
    src1 += align_adv;

    /* Do a flooring division (should just be a shift right) */
    int num_iter = (256 - len) / 16;

    for (int i = 0; i < num_iter; ++i) {
        xmm_src0 = _mm_load_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        len += 16, src0 += 16, src1 += 16;
    }
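
    /* When src0 was not 16-byte aligned, the aligned loop above stops short of
     * byte 256, so the tail below re-checks the last 16 bytes with unaligned
     * loads; a few bytes may be compared twice, which is harmless. */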
    if (align_offset) {
        src0 = end0 - 16;
        src1 = end1 - 16;
        len = 256 - 16;

        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }
    }

    return 256;
}

Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_sse2_static(src0, src1);
}

#define LONGEST_MATCH longest_match_sse2
#define COMPARE256 compare256_sse2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_sse2
#define COMPARE256 compare256_sse2_static

#include "match_tpl.h"

#endif
199
third_party/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
vendored
Normal file
@@ -0,0 +1,199 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali <wajdi.k.feghali@intel.com>
 *     Jim Guilford <james.guilford@intel.com>
 *     Vinodh Gopal <vinodh.gopal@intel.com>
 *     Erdinc Ozturk <erdinc.ozturk@intel.com>
 *     Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i xmm_crc_part = _mm_setzero_si128();
    char ALIGNED_(16) partial_buf[16] = { 0 };
#ifndef COPY
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
    int32_t first = init_crc != 0;

    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
     * for the aligning load that occurs. If there's an initial CRC, to carry it forward through
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
     * up to 15 bytes + one full vector load. */
    assert(len >= 16 || first == 0);
#endif
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    if (len < 16) {
        if (len == 0)
            return;

        memcpy(partial_buf, src, len);
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
#ifdef COPY
        memcpy(dst, partial_buf, len);
#endif
        goto partial;
    }

    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
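    /* algn_diff is the number of bytes needed to reach the next 16-byte
     * boundary (0 when src is already aligned). */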
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
        dst += algn_diff;
#else
        XOR_INITIAL128(xmm_crc_part);

        if (algn_diff < 4 && init_crc != 0) {
            xmm_t0 = xmm_crc_part;
            if (len >= 32) {
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
            } else {
                memcpy(partial_buf, src + 16, len - 16);
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
                src += 16;
                len -= 16;
#ifdef COPY
                dst -= algn_diff;
#endif
                goto partial;
            }

            src += 16;
            len -= 16;
        }
#endif

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);

        src += algn_diff;
        len -= algn_diff;
    }

#ifdef X86_VPCLMULQDQ
    if (len >= 256) {
#ifdef COPY
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
        dst += n;
#else
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
            xmm_initial, first);
        first = 0;
#endif
        len -= n;
        src += n;
    }
#endif

    while (len >= 64) {
        len -= 64;
        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
        src += 64;

        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
        dst += 64;
#else
        XOR_INITIAL128(xmm_t0);
#endif

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
    }

    /*
     * len = num bytes left - 64
     */
    if (len >= 48) {
        len -= 48;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        src += 48;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        dst += 48;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
    } else if (len >= 32) {
        len -= 32;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        src += 32;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        dst += 32;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
    } else if (len >= 16) {
        len -= 16;
        xmm_t0 = _mm_load_si128((__m128i *)src);
        src += 16;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        dst += 16;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
    }

partial:
    if (len) {
        memcpy(&xmm_crc_part, src, len);
#ifdef COPY
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
        memcpy(dst, partial_buf, len);
#endif
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
    }

    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
107
third_party/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
vendored
Normal file
@@ -0,0 +1,107 @@
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
    __m128i init_crc, int32_t first) {
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
    size_t len_tmp = len;
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
    XOR_INITIAL512(zmm_t0);
#endif
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* already have intermediate CRC in xmm registers
     * fold4 with 4 xmm_crc to get zmm_crc0
     */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);

#ifdef COPY
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
    dst += 256;
#endif
    len -= 256;
    src += 256;

    // fold-16 loops
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);

#ifdef COPY
        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
        dst += 256;
#endif
        len -= 256;
        src += 256;
    }
    // zmm_crc[0,1,2,3] -> zmm_crc0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);

    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len);  // return n bytes processed
}
30
third_party/zlib-ng/arch/x86/crc32_pclmulqdq.c
vendored
Normal file
@@ -0,0 +1,30 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali <wajdi.k.feghali@intel.com>
 *     Jim Guilford <james.guilford@intel.com>
 *     Vinodh Gopal <vinodh.gopal@intel.com>
 *     Erdinc Ozturk <erdinc.ozturk@intel.com>
 *     Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_PCLMULQDQ_CRC

#define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
#define CRC32_FOLD crc32_fold_pclmulqdq
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
#define CRC32 crc32_pclmulqdq

#include "crc32_pclmulqdq_tpl.h"

#endif
375
third_party/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
Normal file
@@ -0,0 +1,375 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali <wajdi.k.feghali@intel.com>
 *     Jim Guilford <james.guilford@intel.com>
 *     Vinodh Gopal <vinodh.gopal@intel.com>
 *     Erdinc Ozturk <erdinc.ozturk@intel.com>
 *     Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#ifdef X86_VPCLMULQDQ
#  include <immintrin.h>
#endif

#include "crc32.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "x86_intrins.h"
#include <assert.h>

#ifdef X86_VPCLMULQDQ
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
    int32_t first);
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif

static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
                                             0x00000001, 0xc6e41596);
    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc3, ps_res;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = x_tmp3;
    *xmm_crc3 = _mm_castps_si128(ps_res);
}

static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
                                             0x00000001, 0xc6e41596);
    __m128i x_tmp3, x_tmp2;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;

    x_tmp3 = *xmm_crc3;
    x_tmp2 = *xmm_crc2;

    *xmm_crc3 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);

    *xmm_crc2 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);

    *xmm_crc0 = x_tmp2;
    *xmm_crc1 = x_tmp3;
    *xmm_crc2 = _mm_castps_si128(ps_res20);
    *xmm_crc3 = _mm_castps_si128(ps_res31);
}

static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
                                             0x00000001, 0xc6e41596);
    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc2;
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);

    *xmm_crc2 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);

    *xmm_crc1 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);

    *xmm_crc0 = x_tmp3;
    *xmm_crc1 = _mm_castps_si128(ps_res10);
    *xmm_crc2 = _mm_castps_si128(ps_res21);
    *xmm_crc3 = _mm_castps_si128(ps_res32);
}

static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
                                             0x00000001, 0xc6e41596);
    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
    __m128 ps_res0, ps_res1, ps_res2, ps_res3;

    x_tmp0 = *xmm_crc0;
    x_tmp1 = *xmm_crc1;
    x_tmp2 = *xmm_crc2;
    x_tmp3 = *xmm_crc3;

    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_t0 = _mm_castsi128_ps(x_tmp0);
    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);

    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_t1 = _mm_castsi128_ps(x_tmp1);
    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);

    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_t2 = _mm_castsi128_ps(x_tmp2);
    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);

    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_t3 = _mm_castsi128_ps(x_tmp3);
    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);

    *xmm_crc0 = _mm_castps_si128(ps_res0);
    *xmm_crc1 = _mm_castps_si128(ps_res1);
    *xmm_crc2 = _mm_castps_si128(ps_res2);
    *xmm_crc3 = _mm_castps_si128(ps_res3);
}

static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl 1 (16 -15)/shr15*/
};
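/* Each 16-byte row is a pshufb mask that shifts a vector left by N bytes while
 * shifting in zeros (pshufb writes 0 for mask bytes with the high bit set);
 * XORing a row with 0x80808080 turns it into the matching right-shift mask,
 * which is how partial_fold() below derives xmm_shr from xmm_shl. */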

static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
    __m128i *xmm_crc3, __m128i *xmm_crc_part) {
    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
                                             0x00000001, 0xc6e41596);
    const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);

    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
    __m128i xmm_a0_0, xmm_a0_1;
    __m128 ps_crc3, psa0_0, psa0_1, ps_res;

    xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
    xmm_shr = xmm_shl;
    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
    psa0_1 = _mm_castsi128_ps(xmm_a0_1);

    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
    ps_res = _mm_xor_ps(ps_res, psa0_1);

    *xmm_crc3 = _mm_castps_si128(ps_res);
}

static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    *fold0 = _mm_load_si128(fold + 0);
    *fold1 = _mm_load_si128(fold + 1);
    *fold2 = _mm_load_si128(fold + 2);
    *fold3 = _mm_load_si128(fold + 3);
}

static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
    const __m128i *fold2, const __m128i *fold3) {
    _mm_storeu_si128(fold + 0, *fold0);
    _mm_storeu_si128(fold + 1, *fold1);
    _mm_storeu_si128(fold + 2, *fold2);
    _mm_storeu_si128(fold + 3, *fold3);
}

Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
    __m128i xmm_zero = _mm_setzero_si128();
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
    return 0;
}

#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
#ifdef X86_VPCLMULQDQ
#  define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
#endif

#ifdef X86_VPCLMULQDQ
#  include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
#ifdef X86_VPCLMULQDQ
#  include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"

static const unsigned ALIGNED_(16) crc_k[] = {
    0xccaa009e, 0x00000000, /* rk1 */
    0x751997d0, 0x00000001, /* rk2 */
    0xccaa009e, 0x00000000, /* rk5 */
    0x63cd6124, 0x00000001, /* rk6 */
    0xf7011640, 0x00000001, /* rk7 */
    0xdb710640, 0x00000001  /* rk8 */
};
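/* crc_k holds the precomputed folding/reduction constants (the rk1-rk8 values
 * of the Intel white paper referenced above) for the CRC-32 polynomial used by
 * zlib; they drive the final 4-to-1 fold and Barrett-style reduction below. */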

static const unsigned ALIGNED_(16) crc_mask[4] = {
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};

static const unsigned ALIGNED_(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};

Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
    const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    /*
     * k1
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k);

    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

    /*
     * k5
     */
    crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);

    /*
     * k7
     */
    xmm_crc1 = xmm_crc3;
    xmm_crc2 = xmm_crc3;
    crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));

    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);

    xmm_crc2 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

    crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));

    return crc->value;
}

static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c = (~crc) & 0xffffffff;

    while (len) {
        len--;
        DO1;
    }

    return c ^ 0xffffffff;
}

Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
    /* For lens smaller than ~12, crc32_small method is faster.
     * But there are also minimum requirements for the pclmul functions due to alignment */
    if (len < 16)
        return crc32_small(crc32, buf, len);

    crc32_fold ALIGNED_(16) crc_state;
    CRC32_FOLD_RESET(&crc_state);
    CRC32_FOLD(&crc_state, buf, len, crc32);
    return CRC32_FOLD_FINAL(&crc_state);
}
17
third_party/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
Normal file
@@ -0,0 +1,17 @@
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_VPCLMULQDQ_CRC

#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
#define CRC32_FOLD crc32_fold_vpclmulqdq
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
#define CRC32 crc32_vpclmulqdq

#include "crc32_pclmulqdq_tpl.h"

#endif
40
third_party/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
Normal file
@@ -0,0 +1,40 @@
/*
 * AVX2 optimized hash slide, based on Intel's slide_sse implementation
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven   <arjan@linux.intel.com>
 *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
 *   Mika T. Lindqvist  <postmaster@raasu.org>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>

static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
    table += entries;
    table -= 16;

    do {
        __m256i value, result;

        value = _mm256_loadu_si256((__m256i *)table);
        result = _mm256_subs_epu16(value, wsize);
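        /* Saturating unsigned subtract: entries whose position has slid out of
         * the window clamp to 0 (NIL) instead of wrapping around. */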
        _mm256_storeu_si256((__m256i *)table, result);

        table -= 16;
        entries -= 16;
    } while (entries > 0);
}

Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
    uint16_t wsize = (uint16_t)s->w_size;
    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);

    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
    slide_hash_chain(s->prev, wsize, ymm_wsize);
}
63
third_party/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
Normal file
@@ -0,0 +1,63 @@
/*
 * SSE optimized hash slide
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven  <arjan@linux.intel.com>
 *   Jim Kukunas       <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>
#include <assert.h>

static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
    uint32_t entries1, const __m128i wsize) {
    uint32_t entries;
    Pos *table;
    __m128i value0, value1, result0, result1;

    int on_chain = 0;

next_chain:
    table = (on_chain) ? table1 : table0;
    entries = (on_chain) ? entries1 : entries0;

    table += entries;
    table -= 16;

    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
     * Our alloc function is aligned to 64 byte boundaries */
    do {
        value0 = _mm_load_si128((__m128i *)table);
        value1 = _mm_load_si128((__m128i *)(table + 8));
        result0 = _mm_subs_epu16(value0, wsize);
        result1 = _mm_subs_epu16(value1, wsize);
        _mm_store_si128((__m128i *)table, result0);
        _mm_store_si128((__m128i *)(table + 8), result1);

        table -= 16;
        entries -= 16;
    } while (entries > 0);

    ++on_chain;
    if (on_chain > 1) {
        return;
    } else {
        goto next_chain;
    }
}

Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
    uint16_t wsize = (uint16_t)s->w_size;
    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);

    assert(((uintptr_t)s->head & 15) == 0);
    assert(((uintptr_t)s->prev & 15) == 0);

    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
}
117
third_party/zlib-ng/arch/x86/x86_features.c
vendored
Normal file
@@ -0,0 +1,117 @@
/* x86_features.c - x86 feature check
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Author:
 *  Jim Kukunas
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "x86_features.h"

#ifdef _MSC_VER
#  include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
#  include <cpuid.h>
#  ifdef X86_HAVE_XSAVE_INTRIN
#    if __GNUC__ == 8
#      include <xsaveintrin.h>
#    else
#      include <immintrin.h>
#    endif
#  endif
#endif

#include <string.h>

static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuid((int *)registers, info);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    *eax = *ebx = *ecx = *edx = 0;
    __cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}

static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuidex((int *)registers, info, subinfo);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    *eax = *ebx = *ecx = *edx = 0;
    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}

static inline uint64_t xgetbv(unsigned int xcr) {
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
    return _xgetbv(xcr);
#else
    uint32_t eax, edx;
    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
    return (uint64_t)(edx) << 32 | eax;
#endif
}

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;
    features->has_ssse3 = ecx & 0x200;
    features->has_sse42 = ecx & 0x100000;
    features->has_pclmulqdq = ecx & 0x2;
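    /* CPUID leaf 1 feature bits: EDX bit 26 = SSE2, ECX bit 9 = SSSE3,
     * ECX bit 20 = SSE4.2, ECX bit 1 = PCLMULQDQ; ECX bit 27, tested below,
     * is OSXSAVE (XGETBV available). */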

    if (ecx & 0x08000000) {
        uint64_t xfeature = xgetbv(0);
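        /* XCR0 bits 1-2 (0x06) cover the SSE/AVX (XMM + YMM) state; 0xe6 adds
         * bits 5-7, the AVX-512 opmask and ZMM state. */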

        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
    }

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check BMI1 bit
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        features->has_vpclmulqdq = ecx & 0x400;

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;
        }

        features->has_bmi2 = ebx & 0x8;

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512f = ebx & 0x00010000;
            if (features->has_avx512f) {
                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
                // AVX512(DQ,BW,VL).
                features->has_avx512dq = ebx & 0x00020000;
                features->has_avx512bw = ebx & 0x40000000;
                features->has_avx512vl = ebx & 0x80000000;
            }
            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
                                          && features->has_avx512vl && features->has_bmi2;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
}
29
third_party/zlib-ng/arch/x86/x86_features.h
vendored
Normal file
@@ -0,0 +1,29 @@
/* x86_features.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_

struct x86_cpu_features {
    int has_avx2;
    int has_avx512f;
    int has_avx512dq;
    int has_avx512bw;
    int has_avx512vl;
    int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
    int has_avx512vnni;
    int has_bmi2;
    int has_sse2;
    int has_ssse3;
    int has_sse42;
    int has_pclmulqdq;
    int has_vpclmulqdq;
    int has_os_save_ymm;
    int has_os_save_zmm;
};

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

#endif /* X86_FEATURES_H_ */
181
third_party/zlib-ng/arch/x86/x86_functions.h
vendored
Normal file
@@ -0,0 +1,181 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_

#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
void slide_hash_sse2(deflate_state *s);
#  endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#endif

#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif

#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
void slide_hash_avx2(deflate_state *s);
#  endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx512(void);
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif


#ifdef DISABLE_RUNTIME_CPU_DETECTION
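/* With runtime CPU detection disabled, the native_* entry points are bound at
 * compile time instead: each block below redefines them to the fastest
 * implementation whose ISA is enabled by the build flags, later (wider) ISA
 * blocks overriding the earlier ones. */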
// X86 - SSE2
# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
#  undef native_chunkmemset_safe
#  define native_chunkmemset_safe chunkmemset_safe_sse2
#  undef native_chunksize
#  define native_chunksize chunksize_sse2
#  undef native_inflate_fast
#  define native_inflate_fast inflate_fast_sse2
#  undef native_slide_hash
#  define native_slide_hash slide_hash_sse2
#  ifdef HAVE_BUILTIN_CTZ
#   undef native_compare256
#   define native_compare256 compare256_sse2
#   undef native_longest_match
#   define native_longest_match longest_match_sse2
#   undef native_longest_match_slow
#   define native_longest_match_slow longest_match_slow_sse2
#  endif
#endif
// X86 - SSSE3
# if defined(X86_SSSE3) && defined(__SSSE3__)
#  undef native_adler32
#  define native_adler32 adler32_ssse3
#  undef native_chunkmemset_safe
#  define native_chunkmemset_safe chunkmemset_safe_ssse3
#  undef native_inflate_fast
#  define native_inflate_fast inflate_fast_ssse3
# endif
// X86 - SSE4.2
# if defined(X86_SSE42) && defined(__SSE4_2__)
#  undef native_adler32_fold_copy
#  define native_adler32_fold_copy adler32_fold_copy_sse42
# endif

// X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
#  undef native_crc32
#  define native_crc32 crc32_pclmulqdq
#  undef native_crc32_fold
#  define native_crc32_fold crc32_fold_pclmulqdq
#  undef native_crc32_fold_copy
#  define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
#  undef native_crc32_fold_final
#  define native_crc32_fold_final crc32_fold_pclmulqdq_final
#  undef native_crc32_fold_reset
#  define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#endif
// X86 - AVX
# if defined(X86_AVX2) && defined(__AVX2__)
#  undef native_adler32
#  define native_adler32 adler32_avx2
#  undef native_adler32_fold_copy
#  define native_adler32_fold_copy adler32_fold_copy_avx2
#  undef native_chunkmemset_safe
#  define native_chunkmemset_safe chunkmemset_safe_avx2
#  undef native_chunksize
#  define native_chunksize chunksize_avx2
#  undef native_inflate_fast
#  define native_inflate_fast inflate_fast_avx2
#  undef native_slide_hash
#  define native_slide_hash slide_hash_avx2
#  ifdef HAVE_BUILTIN_CTZ
#   undef native_compare256
#   define native_compare256 compare256_avx2
#   undef native_longest_match
#   define native_longest_match longest_match_avx2
#   undef native_longest_match_slow
#   define native_longest_match_slow longest_match_slow_avx2
#  endif
# endif

// X86 - AVX512 (F,DQ,BW,Vl)
# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
#  undef native_adler32
#  define native_adler32 adler32_avx512
#  undef native_adler32_fold_copy
#  define native_adler32_fold_copy adler32_fold_copy_avx512
#  undef native_chunkmemset_safe
#  define native_chunkmemset_safe chunkmemset_safe_avx512
#  undef native_chunksize
#  define native_chunksize chunksize_avx512
#  undef native_inflate_fast
#  define native_inflate_fast inflate_fast_avx512
// X86 - AVX512 (VNNI)
#  if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
#   undef native_adler32
#   define native_adler32 adler32_avx512_vnni
#   undef native_adler32_fold_copy
#   define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
#  endif
// X86 - VPCLMULQDQ
#  if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
#   undef native_crc32
#   define native_crc32 crc32_vpclmulqdq
#   undef native_crc32_fold
#   define native_crc32_fold crc32_fold_vpclmulqdq
#   undef native_crc32_fold_copy
#   define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
#   undef native_crc32_fold_final
#   define native_crc32_fold_final crc32_fold_vpclmulqdq_final
#   undef native_crc32_fold_reset
#   define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
#  endif
# endif
#endif

#endif /* X86_FUNCTIONS_H_ */
92
third_party/zlib-ng/arch/x86/x86_intrins.h
vendored
Normal file
@@ -0,0 +1,92 @@
#ifndef X86_INTRINS_H
#define X86_INTRINS_H

/* Unfortunately GCC didn't support these things until version 10.
 * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
 */
#ifdef __AVX2__
#include <immintrin.h>

#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
    || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
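    /* A VEX-encoded vmovdqa into a fresh register zeroes the upper lanes, which
     * is exactly the zero-extension the missing intrinsic is meant to provide. */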
    return _mm256_castsi128_si256(r);
}

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test

#endif // __AVX2__

/* GCC <9 is missing some AVX512 intrinsics.
 */
#ifdef __AVX512F__
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>

#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
                              ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))

static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
                                      char __q59, char __q58, char __q57, char __q56,
                                      char __q55, char __q54, char __q53, char __q52,
                                      char __q51, char __q50, char __q49, char __q48,
                                      char __q47, char __q46, char __q45, char __q44,
                                      char __q43, char __q42, char __q41, char __q40,
                                      char __q39, char __q38, char __q37, char __q36,
                                      char __q35, char __q34, char __q33, char __q32,
                                      char __q31, char __q30, char __q29, char __q28,
                                      char __q27, char __q26, char __q25, char __q24,
                                      char __q23, char __q22, char __q21, char __q20,
                                      char __q19, char __q18, char __q17, char __q16,
                                      char __q15, char __q14, char __q13, char __q12,
                                      char __q11, char __q10, char __q09, char __q08,
                                      char __q07, char __q06, char __q05, char __q04,
                                      char __q03, char __q02, char __q01, char __q00) {
    return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
                            PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
                            PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
                            PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
                            PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
                            PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
                            PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
                            PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}

#undef PACK

#endif // gcc version test
#endif // __AVX512F__

/* Missing zero-extension AVX and AVX512 intrinsics.
 * Fixed in Microsoft Visual Studio 2017 version 15.7
 * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
 */
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914

/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */
#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8)
#  undef _mm512_extracti32x4_epi32
#  define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
#endif
#endif // include guard X86_INTRINS_H