fix: make breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s
third_party/zlib-ng/arch/power/Makefile.in (vendored, new file, 93 lines added)
@@ -0,0 +1,93 @@
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h

CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=

P8FLAGS=-mcpu=power8
P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

all: power_features.o \
	power_features.lo \
	adler32_power8.o \
	adler32_power8.lo \
	adler32_vmx.o \
	adler32_vmx.lo \
	chunkset_power8.o \
	chunkset_power8.lo \
	compare256_power9.o \
	compare256_power9.lo \
	crc32_power8.o \
	crc32_power8.lo \
	slide_hash_power8.o \
	slide_hash_power8.lo \
	slide_hash_vmx.o \
	slide_hash_vmx.lo

power_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c

power_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c

adler32_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_vmx.o:
	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c

adler32_vmx.lo:
	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c

chunkset_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

chunkset_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

compare256_power9.o:
	$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

compare256_power9.lo:
	$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

crc32_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

crc32_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

slide_hash_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_vmx.o:
	$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c

slide_hash_vmx.lo:
	$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile
third_party/zlib-ng/arch/power/adler32_power8.c (vendored, new file, 153 lines added)
@@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
 * Copyright (C) 2020 IBM Corporation
 * Author: Rogerio Alves <rcardoso@linux.ibm.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
 * instructions.
 *
 * If adler32 processes 1 byte at a time, then on the first iteration s1 is
 * s1_0 (_n means iteration n), the initial value of adler - at start _0 is 1
 * unless the initial adler value is different from 1. So s1_1 = s1_0 + c[0]
 * after the first calculation. For the next iteration s1_2 = s1_1 + c[1], and
 * so on. Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1
 * after iteration N.
 *
 * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
 * (N-1)*c[1] + ... + c[N]
 *
 * In a more general way:
 *
 * s1_N = s1_0 + sum(i=1 to N)c[i]
 * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
 *
 * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
 * can process N bytes at a time we can do this at once.
 *
 * Since VSX supports 16-byte vector operations, we can process 16 bytes at a
 * time; using N = 16 we have:
 *
 * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
 * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
 *
 * After the first iteration we calculate the adler32 checksum for 16 bytes.
 *
 * For more background about adler32 please check the RFC:
 * https://www.ietf.org/rfc/rfc1950.txt
 */

#ifdef POWER8_VSX

#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"

/* Vector across sum unsigned int (saturate). */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
    __b = vec_sld(__a, __a, 8);
    __b = vec_add(__b, __a);
    __a = vec_sld(__b, __b, 4);
    __a = vec_add(__a, __b);

    return __a;
}

Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(s1, buf, s2);

    /* If buffer is empty or len=0 we need to return adler initial value. */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* This is faster than VSX code for len < 64. */
    if (len < 64)
        return adler32_len_64(s1, buf, len, s2);

    /* Use POWER VSX instructions for len >= 64. */
    const vector unsigned int v_zeros = { 0 };
    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                                        6, 5, 4, 3, 2, 1};
    const vector unsigned char vsh = vec_splat_u8(4);
    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int vs1 = { 0 };
    vector unsigned int vs2 = { 0 };
    vector unsigned int vs1_save = { 0 };
    vector unsigned int vsum1, vsum2;
    vector unsigned char vbuf;
    int n;

    vs1[0] = s1;
    vs2[0] = s2;

    /* Do length bigger than NMAX in blocks of NMAX size. */
    while (len >= NMAX) {
        len -= NMAX;
        n = NMAX / 16;
        do {
            vbuf = vec_xl(0, (unsigned char *) buf);
            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
            /* Save vs1. */
            vs1_save = vec_add(vs1_save, vs1);
            /* Accumulate the sums. */
            vs1 = vec_add(vsum1, vs1);
            vs2 = vec_add(vsum2, vs2);

            buf += 16;
        } while (--n);
        /* Once per block of NMAX size. */
        vs1 = vec_sumsu(vs1, vsum1);
        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
        vs2 = vec_add(vs1_save, vs2);
        vs2 = vec_sumsu(vs2, vsum2);

        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
        vs1[0] = vs1[0] % BASE;
        /* vs2[0] = s2_i + 16*s1_save +
           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
        vs2[0] = vs2[0] % BASE;

        vs1 = vec_and(vs1, vmask);
        vs2 = vec_and(vs2, vmask);
        vs1_save = v_zeros;
    }

    /* len is less than NMAX, so only one modulo is needed. */
    if (len >= 16) {
        while (len >= 16) {
            len -= 16;

            vbuf = vec_xl(0, (unsigned char *) buf);

            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
            /* Save vs1. */
            vs1_save = vec_add(vs1_save, vs1);
            /* Accumulate the sums. */
            vs1 = vec_add(vsum1, vs1);
            vs2 = vec_add(vsum2, vs2);

            buf += 16;
        }
        /* Since the size will be always less than NMAX we do this once. */
        vs1 = vec_sumsu(vs1, vsum1);
        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
        vs2 = vec_add(vs1_save, vs2);
        vs2 = vec_sumsu(vs2, vsum2);
    }
    /* Copy result back to s1, s2 (mod 65521). */
    s1 = vs1[0] % BASE;
    s2 = vs2[0] % BASE;

    /* Process tail (len < 16). */
    return adler32_len_16(s1, buf, len, s2);
}

#endif /* POWER8_VSX */
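Editor's note: the scalar recurrence described in the header comment above can be seen in a few lines of plain C. The sketch below is illustration only (not part of the diff); it keeps the modulo on every byte for simplicity, whereas the vendored code defers it to once per NMAX-byte block and evaluates 16 bytes per step with VSX.

/* Minimal scalar Adler-32 sketch, assuming only the public recurrence
 * s1_N = s1_(N-1) + c[N], s2_N = s2_(N-1) + s1_N with BASE = 65521. */
#include <stddef.h>
#include <stdint.h>

static uint32_t adler32_scalar_sketch(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;          /* s1_0 */
    uint32_t s2 = (adler >> 16) & 0xffff;  /* s2_0 */
    for (size_t i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % 65521;        /* accumulate byte sum */
        s2 = (s2 + s1) % 65521;            /* accumulate running sum of s1 */
    }
    return (s2 << 16) | s1;                /* D = B * 65536 + A */
}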
third_party/zlib-ng/arch/power/adler32_vmx.c (vendored, new file, 186 lines added)
@@ -0,0 +1,186 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
 * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "zendian.h"
#include "adler32_p.h"

#define vmx_zero() (vec_splat_u32(0))

static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    unsigned int i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
    /* Different taps for the separable components of sums */
    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
     * a 2 element vector from a single load + a subsequent shift is just barely faster
     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
    vector unsigned int adacc, s2acc;
    vector unsigned int pair_vec = vec_ld(0, s);
    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
#if BYTE_ORDER == LITTLE_ENDIAN
    s2acc = vec_sro(pair_vec, shift_vec);
#else
    s2acc = vec_slo(pair_vec, shift_vec);
#endif

    vector unsigned int zero = vmx_zero();
    vector unsigned int s3acc = zero;
    vector unsigned int s3acc_0 = zero;
    vector unsigned int adacc_prev = adacc;
    vector unsigned int adacc_prev_0 = zero;

    vector unsigned int s2acc_0 = zero;
    vector unsigned int s2acc_1 = zero;
    vector unsigned int s2acc_2 = zero;

    /* Maintain a running sum of a second half, this might help us break yet another
     * data dependency bubble in the sum */
    vector unsigned int adacc_0 = zero;

    int num_iter = len / 4;
    int rem = len & 3;

    for (int i = 0; i < num_iter; ++i) {
        vector unsigned char d0 = vec_ld(0, buf);
        vector unsigned char d1 = vec_ld(16, buf);
        vector unsigned char d2 = vec_ld(32, buf);
        vector unsigned char d3 = vec_ld(48, buf);

        /* The core operation of the loop, basically
         * what is being unrolled below */
        adacc = vec_sum4s(d0, adacc);
        s3acc = vec_add(s3acc, adacc_prev);
        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
        s2acc = vec_msum(t0, d0, s2acc);

        /* interleave dependent sums in here */
        adacc_0 = vec_sum4s(d1, adacc_0);
        s2acc_0 = vec_msum(t1, d1, s2acc_0);
        adacc = vec_sum4s(d2, adacc);
        s2acc_1 = vec_msum(t2, d2, s2acc_1);
        s2acc_2 = vec_msum(t3, d3, s2acc_2);
        adacc_0 = vec_sum4s(d3, adacc_0);

        adacc_prev = adacc;
        adacc_prev_0 = adacc_0;
        buf += 64;
    }

    adacc = vec_add(adacc, adacc_0);
    s3acc = vec_add(s3acc, s3acc_0);
    s3acc = vec_sl(s3acc, vec_splat_u32(6));

    if (rem) {
        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
        while (rem--) {
            vector unsigned char d0 = vec_ld(0, buf);
            adacc = vec_sum4s(d0, adacc);
            s3acc = vec_add(s3acc, adacc_prev);
            s2acc = vec_msum(t3, d0, s2acc);
            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
            buf += 16;
        }
    }


    /* Sum up independent second sums */
    s2acc = vec_add(s2acc, s2acc_0);
    s2acc_2 = vec_add(s2acc_1, s2acc_2);
    s2acc = vec_add(s2acc, s2acc_2);

    s2acc = vec_add(s2acc, s3acc);

    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));

    vec_ste(adacc, 0, s);
    vec_ste(s2acc, 0, s+1);
}

Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    memset(&pair[2], 0, 14);
    int n = NMAX;
    unsigned int done = 0, i;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    // Align buffer
    unsigned int al = 0;
    if ((uintptr_t)buf & 0xf) {
        al = 16-((uintptr_t)buf & 0xf);
        if (al > len) {
            al = len;
        }
        vmx_handle_head_or_tail(pair, buf, al);

        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        n -= al;
    }
    for (i = al; i < len; i += n) {
        int remaining = (int)(len-i);
        n = MIN(remaining, (i == al) ? n : NMAX);

        if (n < 16)
            break;

        vmx_accum32(pair, buf + i, n / 16);
        pair[0] %= BASE;
        pair[1] %= BASE;

        done += (n / 16) * 16;
    }

    /* Handle the tail elements. */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
#endif
third_party/zlib-ng/arch/power/chunkset_power8.c (vendored, new file, 50 lines added)
@@ -0,0 +1,50 @@
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"

typedef vector unsigned char chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    *chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    *chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    *chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = vec_xl(0, s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    vec_xst(*chunk, 0, out);
}

#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_power8

#include "inffast_tpl.h"

#endif
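Editor's note: the chunkmemset_* helpers above broadcast a short repeating pattern across one 16-byte chunk, which the chunkset template then stores repeatedly to expand short-distance LZ77 matches during inflate. The sketch below is illustration only (not part of the diff) and shows the 2-byte case in scalar form; the helper name is hypothetical.

/* Scalar sketch of what chunkmemset_2() computes with vec_splats(). */
#include <stdint.h>
#include <string.h>

static void chunkmemset_2_scalar_sketch(const uint8_t *from, uint8_t chunk[16]) {
    uint16_t pattern;
    memcpy(&pattern, from, sizeof(pattern));           /* like zng_memread_2() */
    for (int i = 0; i < 16; i += 2)
        memcpy(&chunk[i], &pattern, sizeof(pattern));  /* replicate across the chunk */
}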
third_party/zlib-ng/arch/power/compare256_power9.c (vendored, new file, 66 lines added)
@@ -0,0 +1,66 @@
/* compare256_power9.c - Power9 version of compare256
 * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef POWER9
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "zendian.h"

/* Older versions of GCC misimplemented semantics for these bit counting builtins.
 * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
#if BYTE_ORDER == LITTLE_ENDIAN
#  define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
#else
#  define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
#endif
#else
#  define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
#endif

static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0, cmplen;

    do {
        vector unsigned char vsrc0, vsrc1, vc;

        vsrc0 = *((vector unsigned char *)src0);
        vsrc1 = *((vector unsigned char *)src1);

        /* Compare 16 bytes at a time. Each byte of vc will be either
         * all ones or all zeroes, depending on the result of the comparison. */
        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);

        /* Since the index of matching bytes will contain only zeroes
         * on vc (since we used cmpne), counting the number of consecutive
         * bytes where LSB == 0 is the same as counting the length of the match. */
        zng_vec_vctzlsbb(vc, cmplen);
        if (cmplen != 16)
            return len + cmplen;

        src0 += 16, src1 += 16, len += 16;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
    return compare256_power9_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_power9
#define COMPARE256          compare256_power9_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_power9
#define COMPARE256          compare256_power9_static

#include "match_tpl.h"

#endif
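Editor's note: for readers unfamiliar with deflate's match search, the function above answers one question: how many of the first 256 bytes at src0 and src1 are equal? The sketch below is illustration only (not part of the diff), a scalar equivalent of compare256_power9_static() with a hypothetical name.

/* Scalar reference for compare256: length of the common prefix, capped at 256. */
#include <stdint.h>

static uint32_t compare256_scalar_sketch(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        len++;                 /* the vector code counts 16 such bytes per step */
    return len;
}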
third_party/zlib-ng/arch/power/crc32_constants.h (vendored, new file, 1123 lines added)
File diff suppressed because it is too large.
third_party/zlib-ng/arch/power/crc32_power8.c (vendored, new file, 587 lines added)
@@ -0,0 +1,587 @@
/* crc32 for POWER8 using VSX instructions
 * Copyright (C) 2021 IBM Corporation
 *
 * Author: Rogerio Alves <rogealve@br.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * This code uses gcc vector builtins instead of using assembly directly.
 */

#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"

#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#include "power_intrins.h"

#define MAX_SIZE 32768
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)

static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    while (len--)
        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
    return crc;
}

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned int prealign;
    unsigned int tail;

    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, len);
        goto out;
    }

    if ((unsigned long)p & VMX_ALIGN_MASK) {
        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
        crc = crc32_align(crc, p, prealign);
        len -= prealign;
        p += prealign;
    }

    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

    tail = len & VMX_ALIGN_MASK;
    if (tail) {
        p += len & ~VMX_ALIGN_MASK;
        crc = crc32_align(crc, p, tail);
    }

out:
    crc ^= 0xffffffff;

    return crc;
}

/* When we have a load-store in a single-dispatch group and address overlap
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
 * A group ending NOP prevents the flush.
 */
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")

#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif

#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {

    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    __vector unsigned long long vcrc;

    __vector unsigned long long vconst1, vconst2;

    /* vdata0-vdata7 will contain our data (p). */
    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    /* v0-v7 will contain our checksums */
    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};


    /* Vector auxiliary variables. */
    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    unsigned int offset; /* Constant table offset. */

    unsigned long i; /* Counter. */
    unsigned long chunks;

    unsigned long block_size;
    int next_block = 0;

    /* Align by 128 bits. The last 128 bit block will be processed at end. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);

    /* Short version. */
    if (len < 256) {
        /* Calculate where in the constant table we need to start. */
        offset = 256 - len;

        vconst1 = vec_ld(offset, vcrc_short_const);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

        /* xor initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                     (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);

        for (i = 16; i < len; i += 16) {
            vconst1 = vec_ld(offset + i, vcrc_short_const);
            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                         (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
            v0 = vec_xor(v0, vdata0);
        }
    } else {

        /* Load initial values. */
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);

        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);

        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);

        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);

        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

        /* xor in initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        p = (char *)p + 128;

        do {
            /* Checksum in blocks of MAX_SIZE. */
            block_size = length;
            if (block_size > MAX_SIZE) {
                block_size = MAX_SIZE;
            }

            length = length - block_size;

            /*
             * Work out the offset into the constants table to start at. Each
             * constant is 16 bytes, and it is used against 128 bytes of input
             * data - 128 / 16 = 8
             */
            offset = (MAX_SIZE/8) - (block_size/8);
            /* We reduce our final 128 bytes in a separate step */
            chunks = (block_size/128)-1;

            vconst1 = vec_ld(offset, vcrc_const);

            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                           (__vector unsigned long long)vconst1);
            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                           (__vector unsigned long long)vconst1);
            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                           (__vector unsigned long long)vconst1);
            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                           (__vector unsigned long long)vconst1);
            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                           (__vector unsigned long long)vconst1);
            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                           (__vector unsigned long long)vconst1);
            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                           (__vector unsigned long long)vconst1);
            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                           (__vector unsigned long long)vconst1);

            if (chunks > 1) {
                offset += 16;
                vconst2 = vec_ld(offset, vcrc_const);
                GROUP_ENDING_NOP;

                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                p = (char *)p + 128;

                /*
                 * main loop. Each iteration calculates the CRC for a 128-byte
                 * block.
                 */
                for (i = 0; i < chunks-2; i++) {
                    vconst1 = vec_ld(offset, vcrc_const);
                    offset += 16;
                    GROUP_ENDING_NOP;

                    v0 = vec_xor(v0, va0);
                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                                   (__vector unsigned long long)vconst2);
                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
                    GROUP_ENDING_NOP;

                    v1 = vec_xor(v1, va1);
                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                                   (__vector unsigned long long)vconst2);
                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
                    GROUP_ENDING_NOP;

                    v2 = vec_xor(v2, va2);
                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                                   (__vector unsigned long long)vconst2);
                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
                    GROUP_ENDING_NOP;

                    v3 = vec_xor(v3, va3);
                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                                   (__vector unsigned long long)vconst2);
                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                    vconst2 = vec_ld(offset, vcrc_const);
                    GROUP_ENDING_NOP;

                    v4 = vec_xor(v4, va4);
                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                                   (__vector unsigned long long)vconst1);
                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
                    GROUP_ENDING_NOP;

                    v5 = vec_xor(v5, va5);
                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                                   (__vector unsigned long long)vconst1);
                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
                    GROUP_ENDING_NOP;

                    v6 = vec_xor(v6, va6);
                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                                   (__vector unsigned long long)vconst1);
                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
                    GROUP_ENDING_NOP;

                    v7 = vec_xor(v7, va7);
                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                                   (__vector unsigned long long)vconst1);
                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                    p = (char *)p + 128;
                }

                /* First cool down */
                vconst1 = vec_ld(offset, vcrc_const);
                offset += 16;

                v0 = vec_xor(v0, va0);
                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v1 = vec_xor(v1, va1);
                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v2 = vec_xor(v2, va2);
                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v3 = vec_xor(v3, va3);
                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v4 = vec_xor(v4, va4);
                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v5 = vec_xor(v5, va5);
                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v6 = vec_xor(v6, va6);
                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v7 = vec_xor(v7, va7);
                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                               (__vector unsigned long long)vconst1);
            }/* else */

            /* Second cool down. */
            v0 = vec_xor(v0, va0);
            v1 = vec_xor(v1, va1);
            v2 = vec_xor(v2, va2);
            v3 = vec_xor(v3, va3);
            v4 = vec_xor(v4, va4);
            v5 = vec_xor(v5, va5);
            v6 = vec_xor(v6, va6);
            v7 = vec_xor(v7, va7);

            /*
             * vpmsumd produces a 96 bit result in the least significant bits
             * of the register. Since we are bit reflected we have to shift it
             * left 32 bits so it occupies the least significant bits in the
             * bit reflected domain.
             */
            v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                                      (__vector unsigned char)vzero, 4);
            v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
                                                      (__vector unsigned char)vzero, 4);
            v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
                                                      (__vector unsigned char)vzero, 4);
            v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
                                                      (__vector unsigned char)vzero, 4);
            v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
                                                      (__vector unsigned char)vzero, 4);
            v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
                                                      (__vector unsigned char)vzero, 4);
            v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
                                                      (__vector unsigned char)vzero, 4);
            v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
                                                      (__vector unsigned char)vzero, 4);

            /* xor with the last 1024 bits. */
            va0 = vec_ld(0, (__vector unsigned long long*) p);
            VEC_PERM(va0, va0, va0, vperm_const);

            va1 = vec_ld(16, (__vector unsigned long long*) p);
            VEC_PERM(va1, va1, va1, vperm_const);

            va2 = vec_ld(32, (__vector unsigned long long*) p);
            VEC_PERM(va2, va2, va2, vperm_const);

            va3 = vec_ld(48, (__vector unsigned long long*) p);
            VEC_PERM(va3, va3, va3, vperm_const);

            va4 = vec_ld(64, (__vector unsigned long long*) p);
            VEC_PERM(va4, va4, va4, vperm_const);

            va5 = vec_ld(80, (__vector unsigned long long*) p);
            VEC_PERM(va5, va5, va5, vperm_const);

            va6 = vec_ld(96, (__vector unsigned long long*) p);
            VEC_PERM(va6, va6, va6, vperm_const);

            va7 = vec_ld(112, (__vector unsigned long long*) p);
            VEC_PERM(va7, va7, va7, vperm_const);

            p = (char *)p + 128;

            vdata0 = vec_xor(v0, va0);
            vdata1 = vec_xor(v1, va1);
            vdata2 = vec_xor(v2, va2);
            vdata3 = vec_xor(v3, va3);
            vdata4 = vec_xor(v4, va4);
            vdata5 = vec_xor(v5, va5);
            vdata6 = vec_xor(v6, va6);
            vdata7 = vec_xor(v7, va7);

            /* Check if we have more blocks to process */
            next_block = 0;
            if (length != 0) {
                next_block = 1;

                /* zero v0-v7 */
                v0 = vec_xor(v0, v0);
                v1 = vec_xor(v1, v1);
                v2 = vec_xor(v2, v2);
                v3 = vec_xor(v3, v3);
                v4 = vec_xor(v4, v4);
                v5 = vec_xor(v5, v5);
                v6 = vec_xor(v6, v6);
                v7 = vec_xor(v7, v7);
            }
            length = length + 128;

        } while (next_block);

        /* Calculate how many bytes we have left. */
        length = (len & 127);

        /* Calculate where in (short) constant table we need to start. */
        offset = 128 - length;

        v0 = vec_ld(offset, vcrc_short_const);
        v1 = vec_ld(offset + 16, vcrc_short_const);
        v2 = vec_ld(offset + 32, vcrc_short_const);
        v3 = vec_ld(offset + 48, vcrc_short_const);
        v4 = vec_ld(offset + 64, vcrc_short_const);
        v5 = vec_ld(offset + 80, vcrc_short_const);
        v6 = vec_ld(offset + 96, vcrc_short_const);
        v7 = vec_ld(offset + 112, vcrc_short_const);

        offset += 128;

        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata0, (__vector unsigned int)v0);
        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata1, (__vector unsigned int)v1);
        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata2, (__vector unsigned int)v2);
        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata3, (__vector unsigned int)v3);
        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata4, (__vector unsigned int)v4);
        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata5, (__vector unsigned int)v5);
        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata6, (__vector unsigned int)v6);
        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                 (__vector unsigned int)vdata7, (__vector unsigned int)v7);

        /* Now reduce the tail (0-112 bytes). */
        for (i = 0; i < length; i += 16) {
            vdata0 = vec_ld(i, (__vector unsigned long long*)p);
            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
            va0 = vec_ld(offset + i, vcrc_short_const);
            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                      (__vector unsigned int)vdata0, (__vector unsigned int)va0);
            v0 = vec_xor(v0, va0);
        }

        /* xor all parallel chunks together. */
        v0 = vec_xor(v0, v1);
        v2 = vec_xor(v2, v3);
        v4 = vec_xor(v4, v5);
        v6 = vec_xor(v6, v7);

        v0 = vec_xor(v0, v2);
        v4 = vec_xor(v4, v6);

        v0 = vec_xor(v0, v4);
    }

    /* Barrett Reduction */
    vconst1 = vec_ld(0, v_Barrett_const);
    vconst2 = vec_ld(16, v_Barrett_const);

    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                              (__vector unsigned char)v0, 8);
    v0 = vec_xor(v1, v0);

    /* shift left one bit */
    __vector unsigned char vsht_splat = vec_splat_u8(1);
    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);

    v0 = vec_and(v0, vmask_64bit);

    /*
     * The reflected version of Barrett reduction. Instead of bit
     * reflecting our data (which is expensive to do), we bit reflect our
     * constants and our algorithm, which means the intermediate data in
     * our vector registers goes from 0-63 instead of 63-0. We can reflect
     * the algorithm because we don't carry in mod 2 arithmetic.
     */

    /* bottom 32 bits of a */
    v1 = vec_and(v0, vmask_32bit);

    /* ma */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
                                  (__vector unsigned long long)vconst1);

    /* bottom 32bits of ma */
    v1 = vec_and(v1, vmask_32bit);
    /* qn */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
                                  (__vector unsigned long long)vconst2);
    /* a - qn, subtraction is xor in GF(2) */
    v0 = vec_xor(v0, v1);

    /*
     * Since we are bit reflected, the result (ie the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */

    /* shift result into top 64 bits */
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                              (__vector unsigned char)vzero, 4);

#if BYTE_ORDER == BIG_ENDIAN
    return v0[0];
#else
    return v0[1];
#endif
}
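Editor's note: crc32_power8() above applies the 0xffffffff pre- and post-inversion itself, so a running CRC can simply be fed back in for the next buffer. The caller below is illustration only (not part of the diff); it declares the prototype as given in power_functions.h, omitting the Z_INTERNAL qualifier, and the wrapper name is hypothetical.

/* Hypothetical caller: checksum two buffers as one stream. */
#include <stddef.h>
#include <stdint.h>

uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);

static uint32_t crc32_of_two_buffers(const uint8_t *a, size_t alen,
                                     const uint8_t *b, size_t blen) {
    uint32_t crc = 0;                  /* zlib-style initial value */
    crc = crc32_power8(crc, a, alen);  /* first chunk */
    crc = crc32_power8(crc, b, blen);  /* second chunk continues the same CRC */
    return crc;
}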
third_party/zlib-ng/arch/power/power_features.c (vendored, new file, 49 lines added)
@@ -0,0 +1,49 @@
/* power_features.c - POWER feature check
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef HAVE_SYS_AUXV_H
#  include <sys/auxv.h>
#endif
#ifdef POWER_NEED_AUXVEC_H
#  include <linux/auxvec.h>
#endif
#ifdef __FreeBSD__
#  include <machine/cpu.h>
#endif
#include "zbuild.h"
#include "power_features.h"

void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
    unsigned long hwcap;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
    hwcap = getauxval(AT_HWCAP);
#endif

    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
        features->has_altivec = 1;
#endif

#ifdef POWER_FEATURES
    unsigned long hwcap2;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
    hwcap2 = getauxval(AT_HWCAP2);
#endif

#ifdef POWER8_VSX
    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
        features->has_arch_3_00 = 1;
#endif
#endif
}
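Editor's note: the sketch below is illustration only (not part of the diff). It shows how the flags filled in by power_check_features() can be inspected; it assumes it is built inside the zlib-ng tree so that zbuild.h and power_features.h are on the include path and the POWER feature macros are set by the build system.

/* Minimal check of the detected POWER CPU features. */
#include <stdio.h>
#include "zbuild.h"
#include "power_features.h"

int main(void) {
    struct power_cpu_features cf = {0};
    power_check_features(&cf);   /* reads AT_HWCAP/AT_HWCAP2 as shown above */
    printf("altivec:%d arch_2.07:%d arch_3.00:%d\n",
           cf.has_altivec, cf.has_arch_2_07, cf.has_arch_3_00);
    return 0;
}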
third_party/zlib-ng/arch/power/power_features.h (vendored, new file, 18 lines added)
@@ -0,0 +1,18 @@
/* power_features.h -- check for POWER CPU features
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_FEATURES_H_
#define POWER_FEATURES_H_

struct power_cpu_features {
    int has_altivec;
    int has_arch_2_07;
    int has_arch_3_00;
};

void Z_INTERNAL power_check_features(struct power_cpu_features *features);

#endif /* POWER_FEATURES_H_ */
third_party/zlib-ng/arch/power/power_functions.h (vendored, new file, 67 lines added)
@@ -0,0 +1,67 @@
/* power_functions.h -- POWER implementations for arch-specific functions.
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_FUNCTIONS_H_
#define POWER_FUNCTIONS_H_

#ifdef PPC_VMX
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
void slide_hash_vmx(deflate_state *s);
#endif

#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void);
uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif

#ifdef POWER9
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif


#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Power - VMX
#  if defined(PPC_VMX) && defined(__ALTIVEC__)
#    undef native_adler32
#    define native_adler32 adler32_vmx
#    undef native_slide_hash
#    define native_slide_hash slide_hash_vmx
#  endif
// Power8 - VSX
#  if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
#    undef native_adler32
#    define native_adler32 adler32_power8
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_power8
#    undef native_chunksize
#    define native_chunksize chunksize_power8
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_power8
#    undef native_slide_hash
#    define native_slide_hash slide_hash_power8
#  endif
#  if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
#    undef native_crc32
#    define native_crc32 crc32_power8
#  endif
// Power9
#  if defined(POWER9) && defined(_ARCH_PWR9)
#    undef native_compare256
#    define native_compare256 compare256_power9
#    undef native_longest_match
#    define native_longest_match longest_match_power9
#    undef native_longest_match_slow
#    define native_longest_match_slow longest_match_slow_power9
#  endif
#endif

#endif /* POWER_FUNCTIONS_H_ */
third_party/zlib-ng/arch/power/power_intrins.h (vendored, new file, 34 lines added)
@@ -0,0 +1,34 @@
/* Helper functions to work around issues with clang builtins
 * Copyright (C) 2021 IBM Corporation
 *
 * Authors:
 *   Daniel Black <daniel@linux.vnet.ibm.com>
 *   Rogerio Alves <rogealve@br.ibm.com>
 *   Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_INTRINS_H
#define POWER_INTRINS_H

#if defined (__clang__)
/*
 * These stubs fix clang incompatibilities with GCC builtins.
 */

#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif

static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
    return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}

#endif

#endif
third_party/zlib-ng/arch/power/slide_hash_power8.c (vendored, new file, 12 lines added)
@@ -0,0 +1,12 @@
/* Optimized slide_hash for POWER processors
 * Copyright (C) 2019-2020 IBM Corporation
 * Author: Matheus Castanho <msc@linux.ibm.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef POWER8_VSX

#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"

#endif /* POWER8_VSX */
third_party/zlib-ng/arch/power/slide_hash_vmx.c (vendored, new file, 10 lines added)
@@ -0,0 +1,10 @@
/* Optimized slide_hash for PowerPC processors with VMX instructions
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifdef PPC_VMX

#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"

#endif /* PPC_VMX */
third_party/zlib-ng/arch/power/slide_ppc_tpl.h (vendored, new file, 32 lines added)
@@ -0,0 +1,32 @@
/* Optimized slide_hash for PowerPC processors
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"

static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    const vector unsigned short vmx_wsize = vec_splats(wsize);
    Pos *p = table;

    do {
        vector unsigned short value, result;

        value = vec_ld(0, p);
        result = vec_subs(value, vmx_wsize);
        vec_st(result, 0, p);

        p += 8;
        entries -= 8;
    } while (entries > 0);
}

void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
    uint16_t wsize = (uint16_t)s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
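Editor's note: slide_hash_chain() above relies on vec_subs() performing a saturating subtract on eight 16-bit hash entries at once. The sketch below is illustration only (not part of the diff); it restates the same operation in scalar form, with Pos mirrored locally as deflate's 16-bit position type and a hypothetical function name.

/* Scalar sketch of the saturating slide done by the vector template. */
#include <stdint.h>

typedef uint16_t Pos;  /* mirrors deflate's position type for this sketch */

static void slide_hash_chain_scalar_sketch(Pos *table, uint32_t entries, uint16_t wsize) {
    for (uint32_t i = 0; i < entries; i++)
        /* subtract the window size, clamping at zero (saturating subtract) */
        table[i] = (table[i] > wsize) ? (Pos)(table[i] - wsize) : 0;
}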