fix: breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s

tqcq
2025-08-25 15:24:22 +08:00
parent a58517497b
commit 68b2e7f763
728 changed files with 489652 additions and 1211 deletions


@@ -0,0 +1,93 @@
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
P8FLAGS=-mcpu=power8
P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: power_features.o \
     power_features.lo \
     adler32_power8.o \
     adler32_power8.lo \
     adler32_vmx.o \
     adler32_vmx.lo \
     chunkset_power8.o \
     chunkset_power8.lo \
     compare256_power9.o \
     compare256_power9.lo \
     crc32_power8.o \
     crc32_power8.lo \
     slide_hash_power8.o \
     slide_hash_power8.lo \
     slide_hash_vmx.o \
     slide_hash_vmx.lo

power_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c

power_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c

adler32_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_vmx.o:
	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c

adler32_vmx.lo:
	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c

chunkset_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

chunkset_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

compare256_power9.o:
	$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

compare256_power9.lo:
	$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

crc32_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

crc32_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

slide_hash_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_vmx.o:
	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c

slide_hash_vmx.lo:
	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile


@@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
* Copyright (C) 2020 IBM Corporation
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
* instructions.
*
* If adler32 is computed one byte at a time, then s1_0 (the suffix _n means
* "after iteration n") is the initial value of s1 taken from adler - 1 at
* the start unless a different initial adler value was supplied. After the
* first byte s1_1 = s1_0 + c[1], after the second s1_2 = s1_1 + c[2], and
* in general s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
*
* For s2 this gives, after iteration N:
* s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + c[N]
*
* Or, written more generally:
*
* s1_N = s1_0 + sum(i=1 to N) c[i]
* s2_N = s2_0 + N*s1_0 + sum(i=1 to N) (N-i+1)*c[i]
*
* where s1_N, s2_N are the values of s1, s2 after N iterations. So if we
* can process N bytes at a time, we can compute both sums in one step.
*
* Since VSX provides 16-byte vector operations, we process 16 bytes at a
* time; with N = 16:
*
* s1 = s1_16 = s1_0 + sum(i=1 to 16) c[i]
* s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16) (16-i+1)*c[i]
*
* so a single vector iteration advances the adler32 sums by 16 bytes.
*
* For more background about adler32 please check the RFC:
* https://www.ietf.org/rfc/rfc1950.txt
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"
/* Vector across sum unsigned int (saturate). */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
__b = vec_sld(__a, __a, 8);
__b = vec_add(__b, __a);
__a = vec_sld(__b, __b, 4);
__a = vec_add(__a, __b);
return __a;
}
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(s1, buf, s2);
/* If buffer is empty or len=0 we need to return adler initial value. */
if (UNLIKELY(buf == NULL))
return 1;
/* This is faster than VSX code for len < 64. */
if (len < 64)
return adler32_len_64(s1, buf, len, s2);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
6, 5, 4, 3, 2, 1};
const vector unsigned char vsh = vec_splat_u8(4);
const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
vector unsigned int vs1 = { 0 };
vector unsigned int vs2 = { 0 };
vector unsigned int vs1_save = { 0 };
vector unsigned int vsum1, vsum2;
vector unsigned char vbuf;
int n;
vs1[0] = s1;
vs2[0] = s2;
/* Do length bigger than NMAX in blocks of NMAX size. */
while (len >= NMAX) {
len -= NMAX;
n = NMAX / 16;
do {
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
} while (--n);
/* Do this once per block of NMAX size. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
vs1[0] = vs1[0] % BASE;
/* vs2[0] = s2_i + 16*s1_save +
sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
vs2[0] = vs2[0] % BASE;
vs1 = vec_and(vs1, vmask);
vs2 = vec_and(vs2, vmask);
vs1_save = v_zeros;
}
/* len is now less than NMAX, so only one modulo is needed. */
if (len >= 16) {
while (len >= 16) {
len -= 16;
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
}
/* Since the remaining size is always less than NMAX, we do this once. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
}
/* Copy result back to s1, s2 (mod 65521). */
s1 = vs1[0] % BASE;
s2 = vs2[0] % BASE;
/* Process tail (len < 16). */
return adler32_len_16(s1, buf, len, s2);
}
#endif /* POWER8_VSX */
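As a cross-check of the block recurrence derived in the header comment of this file, here is a minimal scalar sketch (plain C, no VSX; all names are illustrative) showing that the closed-form 16-byte update matches the byte-at-a-time definition:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BASE 65521u   /* largest prime smaller than 2^16 */

/* Update (s1, s2) over one 16-byte block, byte at a time. */
static void adler_block_scalar(uint32_t *s1, uint32_t *s2, const uint8_t *c) {
    for (int i = 0; i < 16; i++) {
        *s1 = (*s1 + c[i]) % BASE;
        *s2 = (*s2 + *s1) % BASE;
    }
}

/* Same update using the closed form from the comment:
 *   s1_16 = s1_0 + sum c[i]
 *   s2_16 = s2_0 + 16*s1_0 + sum (16-i+1)*c[i]   (i = 1..16) */
static void adler_block_closed(uint32_t *s1, uint32_t *s2, const uint8_t *c) {
    uint32_t sum = 0, wsum = 0;
    for (int i = 0; i < 16; i++) {
        sum  += c[i];
        wsum += (uint32_t)(16 - i) * c[i];   /* weights 16..1, matching v_mul above */
    }
    *s2 = (*s2 + 16 * *s1 + wsum) % BASE;    /* uses the old s1 (s1_0) */
    *s1 = (*s1 + sum) % BASE;
}

int main(void) {
    uint8_t buf[16];
    for (int i = 0; i < 16; i++) buf[i] = (uint8_t)(i * 7 + 3);
    uint32_t a1 = 1, b1 = 0, a2 = 1, b2 = 0;
    adler_block_scalar(&a1, &b1, buf);
    adler_block_closed(&a2, &b2, buf);
    assert(a1 == a2 && b1 == b2);
    printf("adler32 of block: 0x%08x\n", (b1 << 16) | a1);
    return 0;
}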


@@ -0,0 +1,186 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
* Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "zendian.h"
#include "adler32_p.h"
#define vmx_zero() (vec_splat_u32(0))
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
pair[1] += pair[0];
}
}
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
/* As silly and inefficient as it seems, creating 1 permutation vector to permute
* a 2 element vector from a single load + a subsequent shift is just barely faster
* than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
vector unsigned int adacc, s2acc;
vector unsigned int pair_vec = vec_ld(0, s);
adacc = vec_perm(pair_vec, pair_vec, s0_perm);
#if BYTE_ORDER == LITTLE_ENDIAN
s2acc = vec_sro(pair_vec, shift_vec);
#else
s2acc = vec_slo(pair_vec, shift_vec);
#endif
vector unsigned int zero = vmx_zero();
vector unsigned int s3acc = zero;
vector unsigned int s3acc_0 = zero;
vector unsigned int adacc_prev = adacc;
vector unsigned int adacc_prev_0 = zero;
vector unsigned int s2acc_0 = zero;
vector unsigned int s2acc_1 = zero;
vector unsigned int s2acc_2 = zero;
/* Maintain a running sum of a second half; this might help us break yet another
* data dependency bubble in the sum. */
vector unsigned int adacc_0 = zero;
int num_iter = len / 4;
int rem = len & 3;
for (int i = 0; i < num_iter; ++i) {
vector unsigned char d0 = vec_ld(0, buf);
vector unsigned char d1 = vec_ld(16, buf);
vector unsigned char d2 = vec_ld(32, buf);
vector unsigned char d3 = vec_ld(48, buf);
/* The core operation of the loop, basically
* what is being unrolled below */
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
s2acc = vec_msum(t0, d0, s2acc);
/* interleave dependent sums in here */
adacc_0 = vec_sum4s(d1, adacc_0);
s2acc_0 = vec_msum(t1, d1, s2acc_0);
adacc = vec_sum4s(d2, adacc);
s2acc_1 = vec_msum(t2, d2, s2acc_1);
s2acc_2 = vec_msum(t3, d3, s2acc_2);
adacc_0 = vec_sum4s(d3, adacc_0);
adacc_prev = adacc;
adacc_prev_0 = adacc_0;
buf += 64;
}
adacc = vec_add(adacc, adacc_0);
s3acc = vec_add(s3acc, s3acc_0);
s3acc = vec_sl(s3acc, vec_splat_u32(6));
if (rem) {
adacc_prev = vec_add(adacc_prev_0, adacc_prev);
adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
while (rem--) {
vector unsigned char d0 = vec_ld(0, buf);
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s2acc = vec_msum(t3, d0, s2acc);
adacc_prev = vec_sl(adacc, vec_splat_u32(4));
buf += 16;
}
}
/* Sum up independent second sums */
s2acc = vec_add(s2acc, s2acc_0);
s2acc_2 = vec_add(s2acc_1, s2acc_2);
s2acc = vec_add(s2acc, s2acc_2);
s2acc = vec_add(s2acc, s3acc);
adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
vec_ste(adacc, 0, s);
vec_ste(s2acc, 0, s+1);
}
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
int n = NMAX;
unsigned int done = 0, i;
/* Split Adler-32 into its component sums; the value may be supplied by
* the call sites (e.g. when checksumming a PNG file).
*/
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
pair[0] = adler;
pair[1] = sum2;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
// Align buffer
unsigned int al = 0;
if ((uintptr_t)buf & 0xf) {
al = 16-((uintptr_t)buf & 0xf);
if (al > len) {
al=len;
}
vmx_handle_head_or_tail(pair, buf, al);
done += al;
/* Rather than rebasing, we can reduce the max sums for the
* first round only */
n -= al;
}
for (i = al; i < len; i += n) {
int remaining = (int)(len-i);
n = MIN(remaining, (i == al) ? n : NMAX);
if (n < 16)
break;
vmx_accum32(pair, buf + i, n / 16);
pair[0] %= BASE;
pair[1] %= BASE;
done += (n / 16) * 16;
}
/* Handle the tail elements. */
if (done < len) {
vmx_handle_head_or_tail(pair, (buf + done), len - done);
pair[0] %= BASE;
pair[1] %= BASE;
}
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
return (pair[1] << 16) | pair[0];
}
#endif
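The chunking above (process at most NMAX bytes, reduce mod BASE, repeat) relies on NMAX being the largest number of bytes whose sums cannot overflow an unsigned 32-bit accumulator. A standalone sketch of where that constant comes from (it reproduces zlib's NMAX = 5552; the bound assumes worst-case 0xff input starting from already-reduced sums):

#include <stdint.h>
#include <stdio.h>

#define BASE 65521u

/* NMAX is the largest n such that 255*n*(n+1)/2 + (n+1)*(BASE-1) <= 2^32-1,
 * i.e. the worst-case s2 after n bytes of 0xff still fits in 32 bits. */
int main(void) {
    const uint64_t limit = UINT64_C(0xffffffff);
    uint32_t n = 1;
    while (255ull * n * (n + 1) / 2 + (uint64_t)(n + 1) * (BASE - 1) <= limit)
        n++;
    printf("NMAX = %u\n", n - 1);   /* prints 5552 */
    return 0;
}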


@@ -0,0 +1,50 @@
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"
typedef vector unsigned char chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = vec_xl(0, s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vec_xst(*chunk, 0, out);
}
#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_power8
#include "inffast_tpl.h"
#endif
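For context, a portable sketch of what the splat helpers above are used for: when inflate expands a back-reference whose distance is 2, 4 or 8 bytes, the repeating pattern can be broadcast into one 16-byte chunk and then stored chunk by chunk. Here memcpy stands in for vec_splats/vec_xst and the function names are illustrative:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define CHUNK_SIZE 16

/* Broadcast a 2-byte pattern across a 16-byte chunk (scalar stand-in for
 * chunkmemset_2 + vec_splats). */
static void chunkmemset_2_scalar(const uint8_t *from, uint8_t chunk[CHUNK_SIZE]) {
    for (int i = 0; i < CHUNK_SIZE; i += 2)
        memcpy(chunk + i, from, 2);
}

/* Expand a distance-2 repeat by storing whole chunks, then the tail. */
static void copy_dist2(uint8_t *out, const uint8_t *pattern, size_t len) {
    uint8_t chunk[CHUNK_SIZE];
    chunkmemset_2_scalar(pattern, chunk);
    while (len >= CHUNK_SIZE) {
        memcpy(out, chunk, CHUNK_SIZE);   /* scalar stand-in for storechunk/vec_xst */
        out += CHUNK_SIZE;
        len -= CHUNK_SIZE;
    }
    memcpy(out, chunk, len);
}

int main(void) {
    uint8_t dst[35];
    const uint8_t pat[2] = { 'a', 'b' };
    copy_dist2(dst, pat, 34);
    dst[34] = '\0';
    printf("%s\n", dst);   /* prints "abab..." (34 characters) */
    return 0;
}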


@@ -0,0 +1,66 @@
/* compare256_power9.c - Power9 version of compare256
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER9
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "zendian.h"
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
#if BYTE_ORDER == LITTLE_ENDIAN
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
#else
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
#endif
#else
# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
#endif
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0, cmplen;
do {
vector unsigned char vsrc0, vsrc1, vc;
vsrc0 = *((vector unsigned char *)src0);
vsrc1 = *((vector unsigned char *)src1);
/* Compare 16 bytes at a time. Each byte of vc will be either
* all ones or all zeroes, depending on the result of the comparison. */
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
/* Since matching byte positions in vc contain only zeroes (we used
* cmpne), counting the number of consecutive bytes, starting from the
* first, whose LSB == 0 is the same as counting the length of the match. */
zng_vec_vctzlsbb(vc, cmplen);
if (cmplen != 16)
return len + cmplen;
src0 += 16, src1 += 16, len += 16;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
return compare256_power9_static(src0, src1);
}
#define LONGEST_MATCH longest_match_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#endif
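A portable 64-bit sketch of the same idea as compare256_power9_static above: XOR fixed-size chunks and, on the first mismatch, convert the position of the lowest differing byte into the match length with a count-trailing-zeros (GCC/Clang's __builtin_ctzll stands in for vctzlsbb; little-endian is assumed):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Return the length of the common prefix of src0 and src1, up to 256 bytes.
 * On little-endian, the lowest-addressed differing byte shows up in the least
 * significant differing bits of the XOR, so ctz/8 is the byte index. */
static uint32_t compare256_scalar64(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    do {
        uint64_t a, b;
        memcpy(&a, src0, 8);
        memcpy(&b, src1, 8);
        uint64_t diff = a ^ b;
        if (diff != 0)
            return len + (uint32_t)(__builtin_ctzll(diff) >> 3);
        src0 += 8, src1 += 8, len += 8;
    } while (len < 256);
    return 256;
}

int main(void) {
    uint8_t x[256], y[256];
    memset(x, 0x55, sizeof(x));
    memcpy(y, x, sizeof(y));
    y[37] ^= 1;                                  /* first mismatch at offset 37 */
    printf("%u\n", compare256_scalar64(x, y));   /* prints 37 */
    return 0;
}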

File diff suppressed because it is too large.


@@ -0,0 +1,587 @@
/* crc32 for POWER8 using VSX instructions
* Copyright (C) 2021 IBM Corporation
*
* Author: Rogerio Alves <rogealve@br.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* This code uses gcc vector builtins instead of using assembly directly.
*/
#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"
#include "crc32_constants.h"
#include "crc32_braid_tbl.h"
#include "power_intrins.h"
#define MAX_SIZE 32768
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
while (len--)
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
return crc;
}
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
unsigned int prealign;
unsigned int tail;
unsigned long len = (unsigned long) _len;
if (p == (const unsigned char *) 0x0)
return 0;
crc ^= 0xffffffff;
if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
crc = crc32_align(crc, p, len);
goto out;
}
if ((unsigned long)p & VMX_ALIGN_MASK) {
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
crc = crc32_align(crc, p, prealign);
len -= prealign;
p += prealign;
}
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
tail = len & VMX_ALIGN_MASK;
if (tail) {
p += len & ~VMX_ALIGN_MASK;
crc = crc32_align(crc, p, tail);
}
out:
crc ^= 0xffffffff;
return crc;
}
/* When a load and a store in a single dispatch group have overlapping addresses
* such that store forwarding is not allowed (load-hit-store), the group must be
* flushed. A group-ending NOP prevents the flush.
*/
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif
#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
const __vector unsigned long long vmask_64bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
__vector unsigned long long vcrc;
__vector unsigned long long vconst1, vconst2;
/* vdata0-vdata7 will contain our data (p). */
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
/* v0-v7 will contain our checksums */
__vector unsigned long long v0 = {0,0};
__vector unsigned long long v1 = {0,0};
__vector unsigned long long v2 = {0,0};
__vector unsigned long long v3 = {0,0};
__vector unsigned long long v4 = {0,0};
__vector unsigned long long v5 = {0,0};
__vector unsigned long long v6 = {0,0};
__vector unsigned long long v7 = {0,0};
/* Vector auxiliary variables. */
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
unsigned int offset; /* Constant table offset. */
unsigned long i; /* Counter. */
unsigned long chunks;
unsigned long block_size;
int next_block = 0;
/* Round the length down to a multiple of 128 bytes; the last partial block is processed at the end. */
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
/* Short version. */
if (len < 256) {
/* Calculate where in the constant table we need to start. */
offset = 256 - len;
vconst1 = vec_ld(offset, vcrc_short_const);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
/* xor initial value */
vdata0 = vec_xor(vdata0, vcrc);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
for (i = 16; i < len; i += 16) {
vconst1 = vec_ld(offset + i, vcrc_short_const);
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
}
} else {
/* Load initial values. */
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
/* xor in initial value */
vdata0 = vec_xor(vdata0, vcrc);
p = (char *)p + 128;
do {
/* Checksum in blocks of MAX_SIZE. */
block_size = length;
if (block_size > MAX_SIZE) {
block_size = MAX_SIZE;
}
length = length - block_size;
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
offset = (MAX_SIZE/8) - (block_size/8);
/* We reduce our final 128 bytes in a separate step */
chunks = (block_size/128)-1;
vconst1 = vec_ld(offset, vcrc_const);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
if (chunks > 1) {
offset += 16;
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
/*
* main loop. Each iteration calculates the CRC for a 128-byte
* block.
*/
for (i = 0; i < chunks-2; i++) {
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
GROUP_ENDING_NOP;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst2);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst2);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
vdata2, (__vector unsigned long long)vconst2);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst2);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
}
/* First cool down */
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
}/* else */
/* Second cool down. */
v0 = vec_xor(v0, va0);
v1 = vec_xor(v1, va1);
v2 = vec_xor(v2, va2);
v3 = vec_xor(v3, va3);
v4 = vec_xor(v4, va4);
v5 = vec_xor(v5, va5);
v6 = vec_xor(v6, va6);
v7 = vec_xor(v7, va7);
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
(__vector unsigned char)vzero, 4);
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
(__vector unsigned char)vzero, 4);
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
(__vector unsigned char)vzero, 4);
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
(__vector unsigned char)vzero, 4);
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
(__vector unsigned char)vzero, 4);
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
(__vector unsigned char)vzero, 4);
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
(__vector unsigned char)vzero, 4);
/* xor with the last 1024 bits. */
va0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(va0, va0, va0, vperm_const);
va1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(va1, va1, va1, vperm_const);
va2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(va2, va2, va2, vperm_const);
va3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(va3, va3, va3, vperm_const);
va4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(va4, va4, va4, vperm_const);
va5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(va5, va5, va5, vperm_const);
va6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(va6, va6, va6, vperm_const);
va7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(va7, va7, va7, vperm_const);
p = (char *)p + 128;
vdata0 = vec_xor(v0, va0);
vdata1 = vec_xor(v1, va1);
vdata2 = vec_xor(v2, va2);
vdata3 = vec_xor(v3, va3);
vdata4 = vec_xor(v4, va4);
vdata5 = vec_xor(v5, va5);
vdata6 = vec_xor(v6, va6);
vdata7 = vec_xor(v7, va7);
/* Check if we have more blocks to process */
next_block = 0;
if (length != 0) {
next_block = 1;
/* zero v0-v7 */
v0 = vec_xor(v0, v0);
v1 = vec_xor(v1, v1);
v2 = vec_xor(v2, v2);
v3 = vec_xor(v3, v3);
v4 = vec_xor(v4, v4);
v5 = vec_xor(v5, v5);
v6 = vec_xor(v6, v6);
v7 = vec_xor(v7, v7);
}
length = length + 128;
} while (next_block);
/* Calculate how many bytes we have left. */
length = (len & 127);
/* Calculate where in (short) constant table we need to start. */
offset = 128 - length;
v0 = vec_ld(offset, vcrc_short_const);
v1 = vec_ld(offset + 16, vcrc_short_const);
v2 = vec_ld(offset + 32, vcrc_short_const);
v3 = vec_ld(offset + 48, vcrc_short_const);
v4 = vec_ld(offset + 64, vcrc_short_const);
v5 = vec_ld(offset + 80, vcrc_short_const);
v6 = vec_ld(offset + 96, vcrc_short_const);
v7 = vec_ld(offset + 112, vcrc_short_const);
offset += 128;
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)v0);
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata1, (__vector unsigned int)v1);
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata2, (__vector unsigned int)v2);
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata3, (__vector unsigned int)v3);
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata4, (__vector unsigned int)v4);
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata5, (__vector unsigned int)v5);
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata6, (__vector unsigned int)v6);
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata7, (__vector unsigned int)v7);
/* Now reduce the tail (0-112 bytes). */
for (i = 0; i < length; i+=16) {
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
va0 = vec_ld(offset + i,vcrc_short_const);
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)va0);
v0 = vec_xor(v0, va0);
}
/* xor all parallel chunks together. */
v0 = vec_xor(v0, v1);
v2 = vec_xor(v2, v3);
v4 = vec_xor(v4, v5);
v6 = vec_xor(v6, v7);
v0 = vec_xor(v0, v2);
v4 = vec_xor(v4, v6);
v0 = vec_xor(v0, v4);
}
/* Barrett Reduction */
vconst1 = vec_ld(0, v_Barrett_const);
vconst2 = vec_ld(16, v_Barrett_const);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)v0, 8);
v0 = vec_xor(v1,v0);
/* shift left one bit */
__vector unsigned char vsht_splat = vec_splat_u8 (1);
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
v0 = vec_and(v0, vmask_64bit);
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
/* bottom 32 bits of a */
v1 = vec_and(v0, vmask_32bit);
/* ma */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst1);
/* bottom 32bits of ma */
v1 = vec_and(v1, vmask_32bit);
/* qn */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Since we are bit reflected, the result (i.e. the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes:
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
/* Shift the result into the top 64 bits. */
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
#if BYTE_ORDER == BIG_ENDIAN
return v0[0];
#else
return v0[1];
#endif
}
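The folding and Barrett steps above are built on carry-less (GF(2)) multiplication: vpmsumd multiplies 64-bit lanes with XOR in place of addition and XORs the two lane products together. A scalar sketch of a single 64x64 -> 128-bit carry-less multiply, for illustration only (not how the builtin is implemented):

#include <stdint.h>
#include <stdio.h>

/* Carry-less multiply: ordinary long multiplication, but partial products
 * are combined with XOR instead of addition (GF(2) polynomial arithmetic). */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
    uint64_t rl = 0, rh = 0;
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            rl ^= a << i;
            if (i != 0)
                rh ^= a >> (64 - i);
        }
    }
    *hi = rh;
    *lo = rl;
}

int main(void) {
    uint64_t hi, lo;
    /* (x^2 + 1) * (x + 1) = x^3 + x^2 + x + 1 over GF(2): 0x5 clmul 0x3 = 0xF */
    clmul64(0x5, 0x3, &hi, &lo);
    printf("0x%016llx%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}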


@@ -0,0 +1,49 @@
/* power_features.c - POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef POWER_NEED_AUXVEC_H
# include <linux/auxvec.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "zbuild.h"
#include "power_features.h"
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
unsigned long hwcap;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
hwcap = getauxval(AT_HWCAP);
#endif
if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
features->has_altivec = 1;
#endif
#ifdef POWER_FEATURES
unsigned long hwcap2;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
hwcap2 = getauxval(AT_HWCAP2);
#endif
#ifdef POWER8_VSX
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
features->has_arch_3_00 = 1;
#endif
#endif
}


@@ -0,0 +1,18 @@
/* power_features.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_FEATURES_H_
#define POWER_FEATURES_H_
struct power_cpu_features {
int has_altivec;
int has_arch_2_07;
int has_arch_3_00;
};
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
#endif /* POWER_FEATURES_H_ */


@@ -0,0 +1,67 @@
/* power_functions.h -- POWER implementations for arch-specific functions.
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_FUNCTIONS_H_
#define POWER_FUNCTIONS_H_
#ifdef PPC_VMX
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
void slide_hash_vmx(deflate_state *s);
#endif
#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void);
uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER9
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Power - VMX
# if defined(PPC_VMX) && defined(__ALTIVEC__)
# undef native_adler32
# define native_adler32 adler32_vmx
# undef native_slide_hash
# define native_slide_hash slide_hash_vmx
# endif
// Power8 - VSX
# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_adler32
# define native_adler32 adler32_power8
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_power8
# undef native_chunksize
# define native_chunksize chunksize_power8
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_power8
# undef native_slide_hash
# define native_slide_hash slide_hash_power8
# endif
# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_crc32
# define native_crc32 crc32_power8
# endif
// Power9
# if defined(POWER9) && defined(_ARCH_PWR9)
# undef native_compare256
# define native_compare256 compare256_power9
# undef native_longest_match
# define native_longest_match longest_match_power9
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_power9
# endif
#endif
#endif /* POWER_FUNCTIONS_H_ */
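A minimal sketch of how the feature flags from power_features.h and the per-arch entry points declared above are typically tied together at start-up. The dispatcher below is hypothetical and the generic fallback name adler32_c is an assumption; zlib-ng's real runtime dispatch goes through its function table. The prototypes are restated locally (without Z_INTERNAL) so the sketch stands alone:

#include <stdint.h>
#include <stddef.h>

/* Restated from power_features.h / power_functions.h above; adler32_c is assumed. */
struct power_cpu_features { int has_altivec; int has_arch_2_07; int has_arch_3_00; };
void power_check_features(struct power_cpu_features *features);
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);

typedef uint32_t (*adler32_fn)(uint32_t adler, const uint8_t *buf, size_t len);

/* Pick an adler32 implementation once, based on the detected CPU features. */
static adler32_fn pick_adler32(void) {
    struct power_cpu_features f = {0, 0, 0};
    power_check_features(&f);
    if (f.has_arch_2_07)    /* POWER8 or newer: VSX path */
        return adler32_power8;
    if (f.has_altivec)      /* older PowerPC with AltiVec/VMX */
        return adler32_vmx;
    return adler32_c;       /* portable fallback */
}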


@@ -0,0 +1,34 @@
/* Helper functions to work around issues with clang builtins
* Copyright (C) 2021 IBM Corporation
*
* Authors:
* Daniel Black <daniel@linux.vnet.ibm.com>
* Rogerio Alves <rogealve@br.ibm.com>
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_INTRINS_H
#define POWER_INTRINS_H
#if defined (__clang__)
/*
* These stubs fix clang incompatibilities with GCC builtins.
*/
#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif
static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}
#endif
#endif


@@ -0,0 +1,12 @@
/* Optimized slide_hash for POWER processors
* Copyright (C) 2019-2020 IBM Corporation
* Author: Matheus Castanho <msc@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"
#endif /* POWER8_VSX */


@@ -0,0 +1,10 @@
/* Optimized slide_hash for PowerPC processors with VMX instructions
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"
#endif /* PPC_VMX */


@@ -0,0 +1,32 @@
/* Optimized slide_hash for PowerPC processors
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
const vector unsigned short vmx_wsize = vec_splats(wsize);
Pos *p = table;
do {
vector unsigned short value, result;
value = vec_ld(0, p);
result = vec_subs(value, vmx_wsize);
vec_st(result, 0, p);
p += 8;
entries -= 8;
} while (entries > 0);
}
void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
uint16_t wsize = (uint16_t)s->w_size;
slide_hash_chain(s->head, HASH_SIZE, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}
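For reference, a scalar sketch of what the vec_subs loop above computes: every hash-table entry is moved down by the window size with unsigned saturation, so entries that pointed before the new window become 0 (NIL). The names below are illustrative:

#include <stdint.h>
#include <stdio.h>

typedef uint16_t Pos;

/* Scalar equivalent of slide_hash_chain: subtract wsize from every entry
 * with unsigned saturation, exactly what vec_subs does 8 entries at a time. */
static void slide_hash_chain_scalar(Pos *table, uint32_t entries, uint16_t wsize) {
    for (uint32_t i = 0; i < entries; i++)
        table[i] = (table[i] >= wsize) ? (Pos)(table[i] - wsize) : 0;
}

int main(void) {
    Pos head[8] = { 0, 100, 32768, 32769, 40000, 65535, 5, 32767 };
    slide_hash_chain_scalar(head, 8, 32768);
    for (int i = 0; i < 8; i++)
        printf("%u ", head[i]);   /* prints: 0 0 0 1 7232 32767 0 0 */
    printf("\n");
    return 0;
}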