fix: breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s

This commit is contained in:
tqcq
2025-08-25 15:24:22 +08:00
parent a58517497b
commit 68b2e7f763
728 changed files with 489652 additions and 1211 deletions

third_party/zlib-ng/arch/x86/Makefile.in vendored Normal file

@@ -0,0 +1,147 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
AVX512VNNIFLAG=-mavx512vnni -mbmi2
AVX2FLAG=-mavx2 -mbmi2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
XSAVEFLAG=-mxsave
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: \
x86_features.o x86_features.lo \
adler32_avx2.o adler32_avx2.lo \
adler32_avx512.o adler32_avx512.lo \
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
adler32_sse42.o adler32_sse42.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx2.o chunkset_avx2.lo \
chunkset_avx512.o chunkset_avx512.lo \
chunkset_sse2.o chunkset_sse2.lo \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
slide_hash_sse2.o slide_hash_sse2.lo
x86_features.o:
$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
x86_features.lo:
$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
chunkset_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
chunkset_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
chunkset_avx512.o:
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
chunkset_avx512.lo:
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
chunkset_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
chunkset_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
chunkset_ssse3.o:
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
chunkset_ssse3.lo:
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
compare256_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
compare256_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
compare256_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
crc32_vpclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
crc32_vpclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
slide_hash_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
slide_hash_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
slide_hash_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile


@@ -0,0 +1,145 @@
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Copyright (C) 2022 Adam Stylinski
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX2
#include "zbuild.h"
#include <immintrin.h>
#include "adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
if (COPY) {
return adler32_copy_len_16(adler0, src, dst, len, adler1);
} else {
return adler32_len_16(adler0, src, len, adler1);
}
} else if (len < 32) {
if (COPY) {
return adler32_fold_copy_sse42(adler, dst, src, len);
} else {
return adler32_ssse3(adler, src, len);
}
}
__m256i vs1, vs2;
const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m256i dot3v = _mm256_set1_epi16(1);
const __m256i zero = _mm256_setzero_si256();
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
__m256i vbuf = _mm256_loadu_si256((__m256i*)src);
src += 32;
k -= 32;
__m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
if (COPY) {
_mm256_storeu_si256((__m256i*)dst, vbuf);
dst += 32;
}
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
vs2 = _mm256_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
/* Defer the multiplication with 32 to outside of the loop */
vs3 = _mm256_slli_epi32(vs3, 5);
vs2 = _mm256_add_epi32(vs2, vs3);
/* The compiler is generating the following sequence for this integer modulus
* when done the scalar way, in GPRs:
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
mov $0x80078071,%edi // move magic constant into 32 bit register %edi
...
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
shr $0x2f,%rsi // shift right by 47
imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
...
// repeats for each element with vpextract instructions
This is tricky with AVX2 for a number of reasons:
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
back down to 32 bit precision later (there is in AVX512)
3.) Full width integer multiplications aren't cheap
We can, however, do a relatively cheap sequence for horizontal sums.
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
performed on the maximum possible inputs before overflow
*/
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
* what the compiler is doing to avoid integer divisions. */
adler0 = partial_hsum256(vs1) % BASE;
adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
if (len) {
goto rem_peel;
}
return adler;
}
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}
Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, dst, src, len, 1);
}
#endif
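
Editorial note (not part of the vendored sources): the recurrence the vectorized loop above computes is the plain Adler-32 update, with the modulo by BASE deferred for up to NMAX bytes. A minimal scalar sketch, with illustrative names only:

    #include <stddef.h>
    #include <stdint.h>

    #define ADLER_BASE 65521u  /* largest prime smaller than 65536 */
    #define ADLER_NMAX 5552u   /* max bytes before the deferred sums can overflow 32 bits */

    uint32_t adler32_scalar_ref(uint32_t adler, const uint8_t *src, size_t len) {
        uint32_t s1 = adler & 0xffff;          /* "adler0" above */
        uint32_t s2 = (adler >> 16) & 0xffff;  /* "adler1" above */
        while (len > 0) {
            size_t k = len < ADLER_NMAX ? len : ADLER_NMAX;
            len -= k;
            while (k--) {
                s1 += *src++;  /* vs1 = adler + sum(c[i]) */
                s2 += s1;      /* vs2 accumulates the running sum of vs1 */
            }
            s1 %= ADLER_BASE;  /* modulo deferred to once per NMAX block */
            s2 %= ADLER_BASE;
        }
        return s1 | (s2 << 16);
    }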


@@ -0,0 +1,32 @@
/* adler32_avx2_p.h -- adler32 avx2 utility functions
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_AVX2_P_H_
#define ADLER32_AVX2_P_H_
#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
static inline uint32_t hsum256(__m256i x) {
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
_mm256_castsi256_si128(x));
__m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
static inline uint32_t partial_hsum256(__m256i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros */
const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
__m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
__m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
__m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif
#endif
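
Editorial note: the two reductions above differ only in their assumption about the input. hsum256 sums all eight 32-bit lanes, while partial_hsum256 relies on vpsadbw leaving its four partial sums in the even 32-bit lanes (the odd lanes are zero), so only those need gathering. A scalar picture of that distinction, for illustration only:

    #include <stdint.h>

    uint32_t hsum256_scalar(const uint32_t lanes[8]) {
        uint32_t sum = 0;
        for (int i = 0; i < 8; ++i)
            sum += lanes[i];           /* full 8-lane reduction */
        return sum;
    }

    uint32_t partial_hsum256_scalar(const uint32_t lanes[8]) {
        /* vpsadbw writes each 64-bit sum into the low 32 bits of its lane,
         * so lanes 1, 3, 5 and 7 are known to be zero and can be skipped. */
        return lanes[0] + lanes[2] + lanes[4] + lanes[6];
    }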


@@ -0,0 +1,108 @@
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX512
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 64) {
/* This handles the remaining copies, just call normal adler checksum after this */
if (COPY) {
__mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
__m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}
return adler32_avx2(adler, src, len);
}
__m512i vbuf, vs1_0, vs3;
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i dot3v = _mm512_set1_epi16(1);
const __m512i zero = _mm512_setzero_si512();
size_t k;
while (len >= 64) {
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
__m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
vs1_0 = vs1;
vs3 = _mm512_setzero_si512();
k = MIN(len, NMAX);
k -= k % 64;
len -= k;
while (k >= 64) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf = _mm512_loadu_si512(src);
if (COPY) {
_mm512_storeu_si512(dst, vbuf);
dst += 64;
}
src += 64;
k -= 64;
__m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
__m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
vs1 = _mm512_add_epi32(vs1_sad, vs1);
vs3 = _mm512_add_epi32(vs3, vs1_0);
__m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm512_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
vs3 = _mm512_slli_epi32(vs3, 6);
vs2 = _mm512_add_epi32(vs2, vs3);
adler0 = partial_hsum(vs1) % BASE;
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 64). */
if (len) {
goto rem_peel;
}
return adler;
}
Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, dst, src, len, 1);
}
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}
#endif
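
Editorial note: the tail path above relies on 0xFFFFFFFFFFFFFFFFUL >> (64 - len) having exactly len low bits set, so the masked load and store touch only the live bytes. A scalar sketch of that masking idea (the helper name is hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    /* Assumes 1 <= len <= 63, mirroring the `len < 64` branch above. */
    void masked_copy_tail(uint8_t *dst, const uint8_t *src, size_t len) {
        uint64_t mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - len);  /* low `len` bits set */
        for (size_t i = 0; i < 64; ++i) {
            if (mask & (1ULL << i))
                dst[i] = src[i];  /* enabled lane: byte is read and written */
            /* disabled lane: byte is neither read nor written */
        }
    }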


@@ -0,0 +1,57 @@
#ifndef AVX512_FUNCS_H
#define AVX512_FUNCS_H
#include <immintrin.h>
#include <stdint.h>
/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */
#if defined(_MSC_VER) && !defined(_MM_K0_REG8)
# undef _mm512_extracti64x4_epi64
# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1)
# undef _mm512_set1_epi16
# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1)
# undef _mm512_maddubs_epi16
# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2)
#endif
/* Written because *_add_epi32(a) sets off ubsan */
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
__m256i a = _mm512_extracti64x4_epi64(x, 1);
__m256i b = _mm512_extracti64x4_epi64(x, 0);
__m256i a_plus_b = _mm256_add_epi32(a, b);
__m128i c = _mm256_extracti128_si256(a_plus_b, 1);
__m128i d = _mm256_extracti128_si256(a_plus_b, 0);
__m128i c_plus_d = _mm_add_epi32(c, d);
__m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
__m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
__m128i sum4 = _mm_add_epi32(sum2, sum3);
return _mm_cvtsi128_si32(sum4);
}
static inline uint32_t partial_hsum(__m512i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros. Marking this const so the compiler stands
* a better chance of keeping this resident in a register through entire
* loop execution. We certainly have enough zmm registers (32) */
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
1, 1, 1, 1, 1, 1, 1, 1);
__m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
/* From here, it's a simple 256 bit wide reduction sum */
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
* pretty slow, much slower than the longer instruction sequence below */
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
_mm256_castsi256_si128(non_zero_avx));
__m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
__m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif


@@ -0,0 +1,210 @@
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
* Based on Brian Bockelman's AVX2 version
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX512VNNI
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 32)
return adler32_ssse3(adler, src, len);
if (len < 64)
return adler32_avx2(adler, src, len);
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i zero = _mm512_setzero_si512();
__m512i vs1, vs2;
while (len >= 64) {
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
size_t k = MIN(len, NMAX);
k -= k % 64;
len -= k;
__m512i vs1_0 = vs1;
__m512i vs3 = _mm512_setzero_si512();
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
__m512i vs2_1 = _mm512_setzero_si512();
__m512i vbuf0, vbuf1;
/* Remainder peeling */
if (k % 128) {
vbuf1 = _mm512_loadu_si512((__m512i*)src);
src += 64;
k -= 64;
__m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
vs1 = _mm512_add_epi32(vs1, vs1_sad);
vs3 = _mm512_add_epi32(vs3, vs1_0);
vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
vs1_0 = vs1;
}
/* Manually unrolled this loop by 2 for a decent amount of ILP */
while (k >= 128) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf0 = _mm512_loadu_si512((__m512i*)src);
vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
src += 128;
k -= 128;
__m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
vs1 = _mm512_add_epi32(vs1, vs1_sad);
vs3 = _mm512_add_epi32(vs3, vs1_0);
/* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
* instructions to eliminate them */
vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
vs3 = _mm512_add_epi32(vs3, vs1);
vs1_sad = _mm512_sad_epu8(vbuf1, zero);
vs1 = _mm512_add_epi32(vs1, vs1_sad);
vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
vs1_0 = vs1;
}
vs3 = _mm512_slli_epi32(vs3, 6);
vs2 = _mm512_add_epi32(vs2, vs3);
vs2 = _mm512_add_epi32(vs2, vs2_1);
adler0 = partial_hsum(vs1) % BASE;
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 64). */
if (len) {
goto rem_peel;
}
return adler;
}
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel_copy:
if (len < 32) {
/* This handles the remaining copies, just call normal adler checksum after this */
__mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
return adler32_ssse3(adler, src, len);
}
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
const __m256i zero = _mm256_setzero_si256();
__m256i vs1, vs2;
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
__m256i vs2_1 = _mm256_setzero_si256();
__m256i vbuf0, vbuf1;
/* Remainder peeling */
if (k % 64) {
vbuf1 = _mm256_loadu_si256((__m256i*)src);
_mm256_storeu_si256((__m256i*)dst, vbuf1);
dst += 32;
src += 32;
k -= 32;
__m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
vs1_0 = vs1;
}
/* Manually unrolled this loop by 2 for a decent amount of ILP */
while (k >= 64) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf0 = _mm256_loadu_si256((__m256i*)src);
vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
_mm256_storeu_si256((__m256i*)dst, vbuf0);
_mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
dst += 64;
src += 64;
k -= 64;
__m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
/* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
* instructions to eliminate them */
vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
vs3 = _mm256_add_epi32(vs3, vs1);
vs1_sad = _mm256_sad_epu8(vbuf1, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
vs1_0 = vs1;
}
vs3 = _mm256_slli_epi32(vs3, 5);
vs2 = _mm256_add_epi32(vs2, vs3);
vs2 = _mm256_add_epi32(vs2, vs2_1);
adler0 = partial_hsum256(vs1) % BASE;
adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 32). */
if (len) {
goto rem_peel_copy;
}
return adler;
}
#endif
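
Editorial note: the VNNI path replaces the maddubs + madd pair used in the AVX2 code with a single dpbusd per block; both compute the same weighted byte sum that feeds vs2. A scalar sketch of one 32-byte step (weights follow dot2v in the copy variant; the name is illustrative):

    #include <stdint.h>

    /* Dot product of 32 bytes with descending weights 32..1, i.e. what one
     * dpbusd step over a 32-byte block accumulates into vs2. */
    uint32_t weighted_block_sum32(const uint8_t block[32]) {
        uint32_t acc = 0;
        for (int i = 0; i < 32; ++i)
            acc += (uint32_t)(32 - i) * block[i];  /* weight 32 for byte 0, ..., 1 for byte 31 */
        return acc;
    }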


@@ -0,0 +1,120 @@
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>
#ifdef X86_SSE42
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
return adler32_copy_len_16(adler0, src, dst, len, adler1);
}
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
v_sad_sum2, vsum2, vsum2_0;
__m128i zero = _mm_setzero_si128();
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i dot3v = _mm_set1_epi16(1);
size_t k;
while (len >= 16) {
k = MIN(len, NMAX);
k -= k % 16;
len -= k;
vs1 = _mm_cvtsi32_si128(adler0);
vs2 = _mm_cvtsi32_si128(adler1);
vs3 = _mm_setzero_si128();
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_loadu_si128((__m128i*)src);
vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
src += 32;
k -= 32;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
_mm_storeu_si128((__m128i*)dst, vbuf);
_mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
dst += 32;
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
vs2 = _mm_add_epi32(vsum2, vs2);
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
vs1_0 = vs1;
}
vs2 = _mm_add_epi32(vs2_0, vs2);
vs3 = _mm_slli_epi32(vs3, 5);
vs2 = _mm_add_epi32(vs3, vs2);
vs3 = _mm_setzero_si128();
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_loadu_si128((__m128i*)src);
src += 16;
k -= 16;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm_add_epi32(vsum2, vs2);
vs1_0 = vs1;
_mm_storeu_si128((__m128i*)dst, vbuf);
dst += 16;
}
vs3 = _mm_slli_epi32(vs3, 4);
vs2 = _mm_add_epi32(vs2, vs3);
adler0 = partial_hsum(vs1) % BASE;
adler1 = hsum(vs2) % BASE;
}
/* If this is true, there are fewer than 16 bytes remaining */
if (len) {
goto rem_peel;
}
return adler0 | (adler1 << 16);
}
#endif


@@ -0,0 +1,156 @@
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3
#include <immintrin.h>
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i dot3v = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
vbuf_0, v_sad_sum2, vsum2, vsum2_0;
/* If our buffer is unaligned (likely), decide whether there's enough input to make the
* scalar aligning additions worthwhile, or whether it's better to just eat the cost of a
* single unaligned load. The test is simple: take the aligning path only when
* len >= 16 + align_offset, i.e. at least one full aligned vector load remains after
* the scalar peel */
size_t max_iters = NMAX;
size_t rem = (uintptr_t)buf & 15;
size_t align_offset = 16 - rem;
size_t k = 0;
if (rem) {
if (len < 16 + align_offset) {
/* Let's eat the cost of this one unaligned load so that
* we don't completely skip over the vectorization. Doing
* 16 bytes at a time unaligned is better than 16 + <= 15
* sums */
vbuf = _mm_loadu_si128((__m128i*)buf);
len -= 16;
buf += 16;
vs1 = _mm_cvtsi32_si128(adler);
vs2 = _mm_cvtsi32_si128(sum2);
vs3 = _mm_setzero_si128();
vs1_0 = vs1;
goto unaligned_jmp;
}
for (size_t i = 0; i < align_offset; ++i) {
adler += *(buf++);
sum2 += adler;
}
/* lop off the max number of sums based on the scalar sums done
* above */
len -= align_offset;
max_iters -= align_offset;
}
while (len >= 16) {
vs1 = _mm_cvtsi32_si128(adler);
vs2 = _mm_cvtsi32_si128(sum2);
vs3 = _mm_setzero_si128();
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
k = (len < max_iters ? len : max_iters);
k -= k % 16;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_load_si128((__m128i*)buf);
vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
buf += 32;
k -= 32;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
vs2 = _mm_add_epi32(vsum2, vs2);
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
vs1_0 = vs1;
}
vs2 = _mm_add_epi32(vs2_0, vs2);
vs3 = _mm_slli_epi32(vs3, 5);
vs2 = _mm_add_epi32(vs3, vs2);
vs3 = _mm_setzero_si128();
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_load_si128((__m128i*)buf);
buf += 16;
k -= 16;
unaligned_jmp:
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
vs3 = _mm_slli_epi32(vs3, 4);
vs2 = _mm_add_epi32(vs2, vs3);
/* We don't actually need to do a full horizontal sum, since psadbw is actually doing
* a partial reduction sum implicitly and only summing to integers in vector positions
* 0 and 2. This saves us some contention on the shuffle port(s) */
adler = partial_hsum(vs1) % BASE;
sum2 = hsum(vs2) % BASE;
max_iters = NMAX;
}
/* Process tail (len < 16). */
return adler32_len_16(adler, buf, len, sum2);
}
#endif
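
Editorial note: the prologue above trades a handful of scalar sums for aligned vector loads: it consumes 16 - ((uintptr_t)buf & 15) bytes in scalar code so the main loop can use _mm_load_si128, and shrinks the NMAX budget by the same amount so the deferred modulo still cannot overflow. A simplified scalar sketch of that bookkeeping (hypothetical helper; the small-length unaligned shortcut is omitted):

    #include <stddef.h>
    #include <stdint.h>

    /* Peel bytes until `buf` is 16-byte aligned, keeping the Adler-32 sums up
     * to date; returns how many bytes were peeled so the caller can subtract
     * them from its NMAX budget. */
    size_t peel_to_alignment(const uint8_t **buf, size_t *len,
                             uint32_t *adler, uint32_t *sum2) {
        size_t rem = (uintptr_t)*buf & 15;
        if (rem == 0)
            return 0;                    /* already aligned, nothing to do */
        size_t align_offset = 16 - rem;  /* bytes to consume in scalar code */
        size_t peeled = 0;
        while (peeled < align_offset && *len > 0) {
            *adler += *(*buf)++;
            *sum2  += *adler;
            ++peeled;
            --*len;
        }
        return peeled;
    }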


@@ -0,0 +1,29 @@
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_SSSE3_P_H_
#define ADLER32_SSSE3_P_H_
#ifdef X86_SSSE3
#include <immintrin.h>
#include <stdint.h>
static inline uint32_t partial_hsum(__m128i x) {
__m128i second_int = _mm_srli_si128(x, 8);
__m128i sum = _mm_add_epi32(x, second_int);
return _mm_cvtsi128_si32(sum);
}
static inline uint32_t hsum(__m128i x) {
__m128i sum1 = _mm_unpackhi_epi64(x, x);
__m128i sum2 = _mm_add_epi32(x, sum1);
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
__m128i sum4 = _mm_add_epi32(sum2, sum3);
return _mm_cvtsi128_si32(sum4);
}
#endif
#endif


@@ -0,0 +1,44 @@
#ifndef _AVX2_TABLES_H
#define _AVX2_TABLES_H
#include "../generic/chunk_permute_table.h"
/* Populate the don't-care entries so that this is a direct lookup (with some indirection into the permute table). Because dist can
 * never be 0 - 2, we start at an offset, subtracting 3 from the input */
static const lut_rem_pair perm_idx_lut[29] = {
{ 0, 2}, /* 3 */
{ 0, 0}, /* don't care */
{ 1 * 32, 2}, /* 5 */
{ 2 * 32, 2}, /* 6 */
{ 3 * 32, 4}, /* 7 */
{ 0 * 32, 0}, /* don't care */
{ 4 * 32, 5}, /* 9 */
{ 5 * 32, 22}, /* 10 */
{ 6 * 32, 21}, /* 11 */
{ 7 * 32, 20}, /* 12 */
{ 8 * 32, 6}, /* 13 */
{ 9 * 32, 4}, /* 14 */
{10 * 32, 2}, /* 15 */
{ 0 * 32, 0}, /* don't care */
{11 * 32, 15}, /* 17 */
{11 * 32 + 16, 14}, /* 18 */
{11 * 32 + 16 * 2, 13}, /* 19 */
{11 * 32 + 16 * 3, 12}, /* 20 */
{11 * 32 + 16 * 4, 11}, /* 21 */
{11 * 32 + 16 * 5, 10}, /* 22 */
{11 * 32 + 16 * 6, 9}, /* 23 */
{11 * 32 + 16 * 7, 8}, /* 24 */
{11 * 32 + 16 * 8, 7}, /* 25 */
{11 * 32 + 16 * 9, 6}, /* 26 */
{11 * 32 + 16 * 10, 5}, /* 27 */
{11 * 32 + 16 * 11, 4}, /* 28 */
{11 * 32 + 16 * 12, 3}, /* 29 */
{11 * 32 + 16 * 13, 2}, /* 30 */
{11 * 32 + 16 * 14, 1} /* 31 */
};
static const uint16_t half_rem_vals[13] = {
1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
};
#endif
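
Editorial note: these tables drive GET_CHUNK_MAG in the chunkset files that follow. For a match distance dist smaller than a chunk, the first dist output bytes are broadcast into a full chunk as a repeating pattern, and remval precomputes the leftover state the chunk template needs to keep the pattern phase correct across chunks. A scalar sketch of the filling step only (hypothetical helper; no claim is made about how remval is consumed):

    #include <stdint.h>

    /* Fill a 32-byte chunk by repeating a `dist`-byte pattern taken from the
     * already-written output; the vector code does this in one shuffle using
     * the permute entry at perm_idx_lut[dist - 3].idx. */
    void fill_chunk_from_pattern(uint8_t chunk[32], const uint8_t *pattern,
                                 uint32_t dist /* 3..31 */) {
        for (uint32_t i = 0; i < 32; ++i)
            chunk[i] = pattern[i % dist];
    }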


@@ -0,0 +1,130 @@
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"
#ifdef X86_AVX2
#include "avx2_tables.h"
#include <immintrin.h>
#include "x86_intrins.h"
typedef __m256i chunk_t;
typedef __m128i halfchunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi16(zng_memread_2(from));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi32(zng_memread_4(from));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi64x(zng_memread_8(from));
}
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
/* See explanation in chunkset_avx512.c */
#if defined(_MSC_VER) && _MSC_VER <= 1900
halfchunk_t half = _mm_loadu_si128((__m128i*)from);
*chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
#else
*chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
#endif
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm256_loadu_si256((__m256i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
* compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
* GPRs to begin with, the 256 bit load is _probably_ just as inexpensive */
*chunk_rem = lut_rem.remval;
/* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 32 - dist);
if (dist < 16) {
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
* shuffles and combining the halves later */
const __m256i permute_xform =
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
} else {
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
/* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
* shuffle those values */
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
}
return ret_vec;
}
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
* unlikely to be actually written or read from */
return _mm256_zextsi128_si256(*chunk);
}
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
__msan_unpoison(buf + dist, 16 - dist);
ret_vec = _mm_loadu_si128((__m128i*)buf);
*chunk_rem = half_rem_vals[dist - 3];
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
return ret_vec;
}
#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2
#define CHUNKMEMSET chunkmemset_avx2
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_avx2
#include "inffast_tpl.h"
#endif


@@ -0,0 +1,182 @@
/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"
#ifdef X86_AVX512
#include "avx2_tables.h"
#include <immintrin.h>
#include "x86_intrins.h"
typedef __m256i chunk_t;
typedef __m128i halfchunk_t;
typedef __mmask32 mask_t;
typedef __mmask16 halfmask_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
#define HAVE_MASKED_READWRITE
#define HAVE_CHUNKCOPY
#define HAVE_HALFCHUNKCOPY
static inline halfmask_t gen_half_mask(unsigned len) {
return (halfmask_t)_bzhi_u32(0xFFFF, len);
}
static inline mask_t gen_mask(unsigned len) {
return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi16(zng_memread_2(from));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi32(zng_memread_4(from));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi64x(zng_memread_8(from));
}
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
/* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where
* the load is dumped to the stack with an aligned move for this memory-register
* broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to
* stack doesn't exist if compiled with optimizations. For the sake of working
* properly in a debugger, let's take the 2 cycle penalty */
#if defined(_MSC_VER) && _MSC_VER <= 1900
halfchunk_t half = _mm_loadu_si128((__m128i*)from);
*chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
#else
*chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
#endif
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm256_loadu_si256((__m256i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
uint32_t rem = len % sizeof(chunk_t);
if (len < sizeof(chunk_t)) {
mask_t rem_mask = gen_mask(rem);
chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
_mm256_mask_storeu_epi8(out, rem_mask, chunk);
return out + rem;
}
loadchunk(from, &chunk);
rem = (rem == 0) ? sizeof(chunk_t) : rem;
storechunk(out, &chunk);
out += rem;
from += rem;
len -= rem;
while (len > 0) {
loadchunk(from, &chunk);
storechunk(out, &chunk);
out += sizeof(chunk_t);
from += sizeof(chunk_t);
len -= sizeof(chunk_t);
}
return out;
}
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
*chunk_rem = lut_rem.remval;
/* See the AVX2 implementation for more detailed comments. This is that + some masked
* loads to avoid an out of bounds read on the heap */
if (dist < 16) {
const __m256i permute_xform =
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
halfmask_t load_mask = gen_half_mask(dist);
__m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
} else {
halfmask_t load_mask = gen_half_mask(dist - 16);
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
__m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
__m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
}
return ret_vec;
}
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
* unlikely to be actually written or read from */
return _mm256_zextsi128_si256(*chunk);
}
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
halfmask_t load_mask = gen_half_mask(dist);
ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
*chunk_rem = half_rem_vals[dist - 3];
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
return ret_vec;
}
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
halfchunk_t chunk;
uint32_t rem = len % sizeof(halfchunk_t);
if (rem == 0) {
rem = sizeof(halfchunk_t);
}
halfmask_t rem_mask = gen_half_mask(rem);
chunk = _mm_maskz_loadu_epi8(rem_mask, from);
_mm_mask_storeu_epi8(out, rem_mask, chunk);
return out + rem;
}
#define CHUNKSIZE chunksize_avx512
#define CHUNKUNROLL chunkunroll_avx512
#define CHUNKMEMSET chunkmemset_avx512
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_avx512
#include "inffast_tpl.h"
#endif


@@ -0,0 +1,49 @@
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"
#ifdef X86_SSE2
#include <immintrin.h>
typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi16(zng_memread_2(from));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi32(zng_memread_4(from));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi64x(zng_memread_8(from));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
#define CHUNKSIZE chunksize_sse2
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKMEMSET chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_sse2
#include "inffast_tpl.h"
#endif


@@ -0,0 +1,86 @@
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"
#if defined(X86_SSSE3)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
{0, 0}, /* don't care */
{1 * 32, 1}, /* 5 */
{2 * 32, 4}, /* 6 */
{3 * 32, 2}, /* 7 */
{0 * 32, 0}, /* don't care */
{4 * 32, 7}, /* 9 */
{5 * 32, 6}, /* 10 */
{6 * 32, 5}, /* 11 */
{7 * 32, 4}, /* 12 */
{8 * 32, 3}, /* 13 */
{9 * 32, 2}, /* 14 */
{10 * 32, 1},/* 15 */
};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi16(zng_memread_2(from));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi32(zng_memread_4(from));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi64x(zng_memread_8(from));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
/* Important to note:
* This is _not_ to subvert the memory sanitizer but to instead unpoison some
* bytes we willingly and purposefully load uninitialized that we swizzle over
* in a vector register, anyway. If what we assume is wrong about what is used,
* the memory sanitizer will still usefully flag it */
__msan_unpoison(buf + dist, 16 - dist);
ret_vec = _mm_loadu_si128((__m128i*)buf);
*chunk_rem = lut_rem.remval;
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
return ret_vec;
}
#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_ssse3
#define CHUNKUNROLL chunkunroll_ssse3
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_ssse3
#include "inffast_tpl.h"
#endif


@@ -0,0 +1,64 @@
/* compare256_avx2.c -- AVX2 version of compare256
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
__m256i ymm_src0, ymm_src1, ymm_cmp;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
return compare256_avx2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_avx2
#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_avx2
#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
#endif
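
Editorial note: every compare256 variant implements the same contract: return the length (0..256) of the common prefix of two 256-byte windows. The vector version gets there 32 bytes at a time, using movemask plus ctz on the inverted mask to locate the first mismatching byte. A scalar reference of the contract:

    #include <stdint.h>

    uint32_t compare256_scalar_ref(const uint8_t *src0, const uint8_t *src1) {
        uint32_t len = 0;
        while (len < 256 && src0[len] == src1[len])
            ++len;                      /* stop at the first mismatching byte */
        return len;                     /* 256 if the windows match entirely */
    }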


@@ -0,0 +1,97 @@
/* compare256_sse2.c -- SSE2 version of compare256
* Copyright Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
#include <emmintrin.h>
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
int align_offset = ((uintptr_t)src0) & 15;
const uint8_t *end0 = src0 + 256;
const uint8_t *end1 = src1 + 256;
__m128i xmm_src0, xmm_src1, xmm_cmp;
/* Do the first load unaligned, then for all subsequent ones we have at least
 * one aligned load. Sadly, aligning both loads is probably unrealistic */
xmm_src0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1 = _mm_loadu_si128((__m128i*)src1);
xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
if (mask != 0xFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
int align_adv = 16 - align_offset;
len += align_adv;
src0 += align_adv;
src1 += align_adv;
/* Do a flooring division (should just be a shift right) */
int num_iter = (256 - len) / 16;
for (int i = 0; i < num_iter; ++i) {
xmm_src0 = _mm_load_si128((__m128i*)src0);
xmm_src1 = _mm_loadu_si128((__m128i*)src1);
xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
if (mask != 0xFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
len += 16, src0 += 16, src1 += 16;
}
if (align_offset) {
src0 = end0 - 16;
src1 = end1 - 16;
len = 256 - 16;
xmm_src0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1 = _mm_loadu_si128((__m128i*)src1);
xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
if (mask != 0xFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
}
return 256;
}
Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
return compare256_sse2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_sse2
#define COMPARE256 compare256_sse2_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_sse2
#define COMPARE256 compare256_sse2_static
#include "match_tpl.h"
#endif


@@ -0,0 +1,199 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
char ALIGNED_(16) partial_buf[16] = { 0 };
#ifndef COPY
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;
/* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
* for the aligning load that occurs. If there's an initial CRC, to carry it forward through
* the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
* up to 15 bytes + one full vector load. */
assert(len >= 16 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
if (len == 0)
return;
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
#ifdef COPY
memcpy(dst, partial_buf, len);
#endif
goto partial;
}
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
#else
XOR_INITIAL128(xmm_crc_part);
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
if (len >= 32) {
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
} else {
memcpy(partial_buf, src + 16, len - 16);
xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
#ifdef COPY
dst -= algn_diff;
#endif
goto partial;
}
src += 16;
len -= 16;
}
#endif
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
src += algn_diff;
len -= algn_diff;
}
#ifdef X86_VPCLMULQDQ
if (len >= 256) {
#ifdef COPY
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
dst += n;
#else
size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
xmm_initial, first);
first = 0;
#endif
len -= n;
src += n;
}
#endif
while (len >= 64) {
len -= 64;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
src += 64;
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
dst += 64;
#else
XOR_INITIAL128(xmm_t0);
#endif
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
}
/*
* len = num bytes left - 64
*/
if (len >= 48) {
len -= 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
src += 48;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
dst += 48;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
} else if (len >= 32) {
len -= 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
src += 32;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
dst += 32;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
} else if (len >= 16) {
len -= 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
src += 16;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
dst += 16;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
}
partial:
if (len) {
memcpy(&xmm_crc_part, src, len);
#ifdef COPY
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
memcpy(dst, partial_buf, len);
#endif
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
}
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
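
Editorial note: the folded result must agree with the bit-reflected CRC-32 used by zlib (polynomial 0xEDB88320). A minimal bitwise scalar reference is shown below purely to pin down the checksum being computed; how the pre/post-conditioning is split between crc32_fold_reset/final and the folding routines above is not shown here.

    #include <stddef.h>
    #include <stdint.h>

    /* Bitwise CRC-32 over `buf`, reflected form, polynomial 0xEDB88320. */
    uint32_t crc32_bitwise_ref(uint32_t crc, const uint8_t *buf, size_t len) {
        crc = ~crc;                                      /* pre-conditioning */
        for (size_t i = 0; i < len; ++i) {
            crc ^= buf[i];
            for (int bit = 0; bit < 8; ++bit)            /* one message bit per step */
                crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : (crc >> 1);
        }
        return ~crc;                                     /* post-conditioning */
    }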


@@ -0,0 +1,107 @@
/* crc32_fold_vpclmulqdq_tpl.h -- VPCMULQDQ-based CRC32 folding template.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
__m128i init_crc, int32_t first) {
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
__m512i z0, z1, z2, z3;
size_t len_tmp = len;
const __m512i zmm_fold4 = _mm512_set4_epi32(
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
const __m512i zmm_fold16 = _mm512_set4_epi32(
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
// zmm register init
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
XOR_INITIAL512(zmm_t0);
#endif
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
/* already have intermediate CRC in xmm registers
* fold4 with 4 xmm_crc to get zmm_crc0
*/
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
dst += 256;
#endif
len -= 256;
src += 256;
// fold-16 loops
while (len >= 256) {
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
dst += 256;
#endif
len -= 256;
src += 256;
}
// zmm_crc[0,1,2,3] -> zmm_crc0
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
return (len_tmp - len); // return n bytes processed
}
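This template is likewise included twice by crc32_pclmulqdq_tpl.h, and its return value tells the caller how many bytes were consumed in 256-byte strides; the remaining tail is finished on the 128-bit PCLMUL path. A paraphrased sketch of that caller pattern (the real loop lives in crc32_pclmulqdq_tpl.h; this helper is illustrative only):

/* Illustrative: drive the 512-bit fast path, then hand the tail back. */
static size_t run_vpclmul_fast_path(__m128i crc[4], const uint8_t **src, size_t *len,
                                    __m128i init_crc, int32_t first) {
    size_t n = 0;
    if (*len >= 256) {
        n = fold_16_vpclmulqdq(&crc[0], &crc[1], &crc[2], &crc[3], *src, *len, init_crc, first);
        *src += n;   /* consumed in 256-byte strides */
        *len -= n;   /* tail (< 256 bytes) continues on the 128-bit path */
    }
    return n;
}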

View File

@@ -0,0 +1,30 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_PCLMULQDQ_CRC
#define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
#define CRC32_FOLD crc32_fold_pclmulqdq
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
#define CRC32 crc32_pclmulqdq
#include "crc32_pclmulqdq_tpl.h"
#endif

View File

@@ -0,0 +1,375 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#ifdef X86_VPCLMULQDQ
# include <immintrin.h>
#endif
#include "crc32.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "x86_intrins.h"
#include <assert.h>
#ifdef X86_VPCLMULQDQ
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
int32_t first);
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
*fold0 = _mm_load_si128(fold + 0);
*fold1 = _mm_load_si128(fold + 1);
*fold2 = _mm_load_si128(fold + 2);
*fold3 = _mm_load_si128(fold + 3);
}
static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
const __m128i *fold2, const __m128i *fold3) {
_mm_storeu_si128(fold + 0, *fold0);
_mm_storeu_si128(fold + 1, *fold1);
_mm_storeu_si128(fold + 2, *fold2);
_mm_storeu_si128(fold + 3, *fold3);
}
Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
__m128i xmm_zero = _mm_setzero_si128();
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
return 0;
}
#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
#ifdef X86_VPCLMULQDQ
# define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
#endif
#ifdef X86_VPCLMULQDQ
# include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
#ifdef X86_VPCLMULQDQ
# include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
static const unsigned ALIGNED_(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
static const unsigned ALIGNED_(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/*
* k1
*/
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
/*
* k5
*/
crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
/*
* k7
*/
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
return crc->value;
}
static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c = (~crc) & 0xffffffff;
while (len) {
len--;
DO1;
}
return c ^ 0xffffffff;
}
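/* Illustrative sketch (not part of this file): DO1 is defined in crc32_braid_p.h
 * and performs one byte-at-a-time step over a reflected CRC-32 table. A rough
 * scalar equivalent, with a hypothetical flat 256-entry table: */
static inline uint32_t crc32_byte_step(uint32_t c, uint8_t b, const uint32_t tab[256]) {
    /* classic reflected-table update: fold one byte into the running CRC */
    return tab[(c ^ b) & 0xff] ^ (c >> 8);
}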
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lengths smaller than roughly 12 bytes the byte-at-a-time crc32_small path is faster,
 * and the PCLMUL path additionally requires at least 16 bytes for its initial loads. */
if (len < 16)
return crc32_small(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
CRC32_FOLD_RESET(&crc_state);
CRC32_FOLD(&crc_state, buf, len, crc32);
return CRC32_FOLD_FINAL(&crc_state);
}
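Whichever instantiation the CRC32 macro names, the result must match zlib's standard reflected CRC-32 (polynomial 0xEDB88320). A quick self-check against the well-known check value of the ASCII string "123456789"; the test function itself is illustrative, not from the source:

#include <assert.h>
#include <string.h>

static void crc32_pclmulqdq_selftest(void) {
    const uint8_t msg[] = "123456789";
    uint32_t crc = crc32_pclmulqdq(0, msg, strlen((const char *)msg));
    assert(crc == 0xCBF43926);  /* CRC-32/IEEE check value */
}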

View File

@@ -0,0 +1,17 @@
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_VPCLMULQDQ_CRC
#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
#define CRC32_FOLD crc32_fold_vpclmulqdq
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
#define CRC32 crc32_vpclmulqdq
#include "crc32_pclmulqdq_tpl.h"
#endif

View File

@@ -0,0 +1,40 @@
/*
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
* Mika T. Lindqvist <postmaster@raasu.org>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "deflate.h"
#include <immintrin.h>
static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
table += entries;
table -= 16;
do {
__m256i value, result;
value = _mm256_loadu_si256((__m256i *)table);
result = _mm256_subs_epu16(value, wsize);
_mm256_storeu_si256((__m256i *)table, result);
table -= 16;
entries -= 16;
} while (entries > 0);
}
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
uint16_t wsize = (uint16_t)s->w_size;
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
slide_hash_chain(s->prev, wsize, ymm_wsize);
}

View File

@@ -0,0 +1,63 @@
/*
* SSE optimized hash slide
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "deflate.h"
#include <immintrin.h>
#include <assert.h>
static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
uint32_t entries1, const __m128i wsize) {
uint32_t entries;
Pos *table;
__m128i value0, value1, result0, result1;
int on_chain = 0;
next_chain:
table = (on_chain) ? table1 : table0;
entries = (on_chain) ? entries1 : entries0;
table += entries;
table -= 16;
/* ZALLOC allocates this table unless the user supplied a custom allocator.
 * Our allocator returns 64-byte-aligned memory, so the aligned loads below are safe. */
do {
value0 = _mm_load_si128((__m128i *)table);
value1 = _mm_load_si128((__m128i *)(table + 8));
result0 = _mm_subs_epu16(value0, wsize);
result1 = _mm_subs_epu16(value1, wsize);
_mm_store_si128((__m128i *)table, result0);
_mm_store_si128((__m128i *)(table + 8), result1);
table -= 16;
entries -= 16;
} while (entries > 0);
++on_chain;
if (on_chain > 1) {
return;
} else {
goto next_chain;
}
}
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
uint16_t wsize = (uint16_t)s->w_size;
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
assert(((uintptr_t)s->head & 15) == 0);
assert(((uintptr_t)s->prev & 15) == 0);
slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
}
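Both SIMD variants above implement the same semantics as the generic slide_hash: after the window slides by w_size, every hash-chain entry is reduced by w_size with saturation at zero, so positions that fell out of the window collapse to NIL. A scalar reference sketch of that operation (illustrative; not the code zlib-ng compiles for the generic path):

#include <stdint.h>

typedef uint16_t Pos;  /* assumption: hash-chain entries are 16-bit positions */

static void slide_hash_scalar(Pos *table, uint32_t entries, uint16_t wsize) {
    for (uint32_t i = 0; i < entries; i++) {
        /* saturating subtract, matching _mm_subs_epu16 / _mm256_subs_epu16 above */
        table[i] = (table[i] >= wsize) ? (Pos)(table[i] - wsize) : 0;
    }
}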

View File

@@ -0,0 +1,117 @@
/* x86_features.c - x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
* Jim Kukunas
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "x86_features.h"
#ifdef _MSC_VER
# include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
# include <cpuid.h>
# ifdef X86_HAVE_XSAVE_INTRIN
# if __GNUC__ == 8
# include <xsaveintrin.h>
# else
# include <immintrin.h>
# endif
# endif
#endif
#include <string.h>
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuid((int *)registers, info);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuidex((int *)registers, info, subinfo);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
static inline uint64_t xgetbv(unsigned int xcr) {
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
return _xgetbv(xcr);
#else
uint32_t eax, edx;
__asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
return (uint64_t)(edx) << 32 | eax;
#endif
}
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
features->has_sse2 = edx & 0x4000000;
features->has_ssse3 = ecx & 0x200;
features->has_sse42 = ecx & 0x100000;
features->has_pclmulqdq = ecx & 0x2;
if (ecx & 0x08000000) {
uint64_t xfeature = xgetbv(0);
features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
}
if (maxbasic >= 7) {
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
// check the VPCLMULQDQ bit (CPUID.(EAX=7,ECX=0):ECX bit 10)
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
features->has_vpclmulqdq = ecx & 0x400;
// check AVX2 bit if the OS supports saving YMM registers
if (features->has_os_save_ymm) {
features->has_avx2 = ebx & 0x20;
}
features->has_bmi2 = ebx & 0x8;
// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512f = ebx & 0x00010000;
if (features->has_avx512f) {
// According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
// AVX512(DQ,BW,VL).
features->has_avx512dq = ebx & 0x00020000;
features->has_avx512bw = ebx & 0x40000000;
features->has_avx512vl = ebx & 0x80000000;
}
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
&& features->has_avx512vl && features->has_bmi2;
features->has_avx512vnni = ecx & 0x800;
}
}
}
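The hexadecimal masks above test fixed CPUID feature bits. For reference, the same checks written with named constants (these #defines are illustrative and not part of the source):

/* CPUID.(EAX=1):ECX/EDX */
#define CPUID1_EDX_SSE2        (1u << 26)  /* 0x4000000  */
#define CPUID1_ECX_PCLMULQDQ   (1u << 1)   /* 0x2        */
#define CPUID1_ECX_SSSE3       (1u << 9)   /* 0x200      */
#define CPUID1_ECX_SSE42       (1u << 20)  /* 0x100000   */
#define CPUID1_ECX_OSXSAVE     (1u << 27)  /* 0x08000000 */
/* CPUID.(EAX=7,ECX=0):EBX/ECX */
#define CPUID7_EBX_AVX2        (1u << 5)   /* 0x20       */
#define CPUID7_EBX_AVX512F     (1u << 16)  /* 0x00010000 */
#define CPUID7_EBX_AVX512DQ    (1u << 17)  /* 0x00020000 */
#define CPUID7_EBX_AVX512BW    (1u << 30)  /* 0x40000000 */
#define CPUID7_EBX_AVX512VL    (1u << 31)  /* 0x80000000 */
#define CPUID7_ECX_VPCLMULQDQ  (1u << 10)  /* 0x400      */
#define CPUID7_ECX_AVX512VNNI  (1u << 11)  /* 0x800      */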

View File

@@ -0,0 +1,29 @@
/* x86_features.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_
struct x86_cpu_features {
int has_avx2;
int has_avx512f;
int has_avx512dq;
int has_avx512bw;
int has_avx512vl;
int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
int has_avx512vnni;
int has_bmi2;
int has_sse2;
int has_ssse3;
int has_sse42;
int has_pclmulqdq;
int has_vpclmulqdq;
int has_os_save_ymm;
int has_os_save_zmm;
};
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
#endif /* X86_FEATURES_H_ */
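A caller-side sketch of this API: populate the struct once, then pick an implementation. The dispatcher below is illustrative and is not zlib-ng's actual functable logic; crc32_pclmulqdq is declared in x86_functions.h:

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "x86_features.h"

uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);  /* from x86_functions.h */

typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);

/* Hypothetical dispatcher: prefer the PCLMULQDQ CRC when CPU and build allow it. */
static crc32_func select_crc32(void) {
    struct x86_cpu_features cf;
    memset(&cf, 0, sizeof(cf));
    x86_check_features(&cf);
#ifdef X86_PCLMULQDQ_CRC
    if (cf.has_pclmulqdq && cf.has_sse42)
        return crc32_pclmulqdq;
#endif
    return NULL;  /* caller falls back to the generic braid CRC */
}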

View File

@@ -0,0 +1,181 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_
#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
void slide_hash_sse2(deflate_state *s);
# endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
void slide_hash_avx2(deflate_state *s);
# endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx512(void);
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_sse2
# undef native_chunksize
# define native_chunksize chunksize_sse2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_sse2
# undef native_slide_hash
# define native_slide_hash slide_hash_sse2
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_sse2
# undef native_longest_match
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
# endif
#endif
// X86 - SSSE3
# if defined(X86_SSSE3) && defined(__SSSE3__)
# undef native_adler32
# define native_adler32 adler32_ssse3
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_ssse3
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_ssse3
# endif
// X86 - SSE4.2
# if defined(X86_SSE42) && defined(__SSE4_2__)
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_sse42
# endif
// X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
# undef native_crc32
# define native_crc32 crc32_pclmulqdq
# undef native_crc32_fold
# define native_crc32_fold crc32_fold_pclmulqdq
# undef native_crc32_fold_copy
# define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
# undef native_crc32_fold_final
# define native_crc32_fold_final crc32_fold_pclmulqdq_final
# undef native_crc32_fold_reset
# define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#endif
// X86 - AVX
# if defined(X86_AVX2) && defined(__AVX2__)
# undef native_adler32
# define native_adler32 adler32_avx2
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx2
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx2
# undef native_chunksize
# define native_chunksize chunksize_avx2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx2
# undef native_slide_hash
# define native_slide_hash slide_hash_avx2
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_avx2
# undef native_longest_match
# define native_longest_match longest_match_avx2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_avx2
# endif
# endif
// X86 - AVX512 (F,DQ,BW,VL)
# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
# undef native_adler32
# define native_adler32 adler32_avx512
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx512
# undef native_chunksize
# define native_chunksize chunksize_avx512
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx512
// X86 - AVX512 (VNNI)
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
# undef native_adler32
# define native_adler32 adler32_avx512_vnni
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
# endif
// X86 - VPCLMULQDQ
# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
# undef native_crc32
# define native_crc32 crc32_vpclmulqdq
# undef native_crc32_fold
# define native_crc32_fold crc32_fold_vpclmulqdq
# undef native_crc32_fold_copy
# define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
# undef native_crc32_fold_final
# define native_crc32_fold_final crc32_fold_vpclmulqdq_final
# undef native_crc32_fold_reset
# define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
# endif
# endif
#endif
#endif /* X86_FUNCTIONS_H_ */
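When DISABLE_RUNTIME_CPU_DETECTION is defined, the native_* names above are bound at compile time from the target flags, so generic call sites compile straight to the selected implementation. An illustrative translation unit built with, say, -msse4.2 -mpclmul; the umbrella header name is an assumption, and native_crc32's baseline definition lives in the generic arch headers:

#include <stddef.h>
#include <stdint.h>
#include "arch_functions.h"  /* assumption: umbrella header that pulls in x86_functions.h */

/* With __PCLMUL__ defined, native_crc32 expands to crc32_pclmulqdq here. */
static uint32_t checksum_block(const uint8_t *buf, size_t len) {
    return native_crc32(0, buf, len);
}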

View File

@@ -0,0 +1,92 @@
#ifndef X86_INTRINS_H
#define X86_INTRINS_H
/* Unfortunately GCC did not provide the zero-extension cast intrinsics
 * (_mm256_zextsi128_si256, _mm512_zextsi128_si512) until version 10.
 * Similarly, AppleClang did not support them in Xcode 9.2 but did in 9.3.
 */
#ifdef __AVX2__
#include <immintrin.h>
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
|| (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
return _mm256_castsi128_si256(r);
}
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
__m128i r;
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test
#endif // __AVX2__
/* GCC <9 is missing some AVX512 intrinsics.
*/
#ifdef __AVX512F__
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
char __q59, char __q58, char __q57, char __q56,
char __q55, char __q54, char __q53, char __q52,
char __q51, char __q50, char __q49, char __q48,
char __q47, char __q46, char __q45, char __q44,
char __q43, char __q42, char __q41, char __q40,
char __q39, char __q38, char __q37, char __q36,
char __q35, char __q34, char __q33, char __q32,
char __q31, char __q30, char __q29, char __q28,
char __q27, char __q26, char __q25, char __q24,
char __q23, char __q22, char __q21, char __q20,
char __q19, char __q18, char __q17, char __q16,
char __q15, char __q14, char __q13, char __q12,
char __q11, char __q10, char __q09, char __q08,
char __q07, char __q06, char __q05, char __q04,
char __q03, char __q02, char __q01, char __q00) {
return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}
#undef PACK
#endif // gcc version test
#endif // __AVX512F__
/* Missing zero-extension AVX and AVX512 intrinsics.
* Fixed in Microsoft Visual Studio 2017 version 15.7
* https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
*/
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914
/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */
#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8)
# undef _mm512_extracti32x4_epi32
# define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
#endif
#endif // include guard X86_INTRINS_H
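The zero-extension shims above exist because _mm256_castsi128_si256 / _mm512_castsi128_si512 leave the upper bits undefined, while the _zext variants guarantee they are zero, which the folding code relies on when seeding a wide register from a 128-bit initial CRC. A stand-alone illustration (requires AVX2; not part of the source):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i lo = _mm_set1_epi32(0x11111111);
    __m256i z  = _mm256_zextsi128_si256(lo);      /* upper 128 bits defined to be zero */
    __m128i hi = _mm256_extracti128_si256(z, 1);
    printf("upper half zero: %d\n", _mm_testz_si128(hi, hi));  /* prints 1 */
    return 0;
}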