fix: breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s

tqcq
2025-08-25 15:24:22 +08:00
parent a58517497b
commit 68b2e7f763
728 changed files with 489652 additions and 1211 deletions


@@ -0,0 +1,93 @@
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
P8FLAGS=-mcpu=power8
P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: power_features.o \
     power_features.lo \
     adler32_power8.o \
     adler32_power8.lo \
     adler32_vmx.o \
     adler32_vmx.lo \
     chunkset_power8.o \
     chunkset_power8.lo \
     compare256_power9.o \
     compare256_power9.lo \
     crc32_power8.o \
     crc32_power8.lo \
     slide_hash_power8.o \
     slide_hash_power8.lo \
     slide_hash_vmx.o \
     slide_hash_vmx.lo

power_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c

power_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c

adler32_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_vmx.o:
	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c

adler32_vmx.lo:
	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c

chunkset_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

chunkset_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c

compare256_power9.o:
	$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

compare256_power9.lo:
	$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c

crc32_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

crc32_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

slide_hash_power8.o:
	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_power8.lo:
	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_vmx.o:
	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c

slide_hash_vmx.lo:
	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile


@@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
* Copyright (C) 2020 IBM Corporation
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
* instructions.
*
* If adler32 is computed one byte at a time, then s1_0 (the suffix _n means
* "after iteration n") is the initial value of s1 taken from adler - 1 at
* the start unless a different initial adler value was supplied. After the
* first byte s1_1 = s1_0 + c[1], after the second s1_2 = s1_1 + c[2], and
* in general s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
*
* For s2 this gives, after iteration N:
* s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + c[N]
*
* Or, written more generally:
*
* s1_N = s1_0 + sum(i=1 to N) c[i]
* s2_N = s2_0 + N*s1_0 + sum(i=1 to N) (N-i+1)*c[i]
*
* where s1_N, s2_N are the values of s1, s2 after N iterations. So if we
* can process N bytes at a time, we can compute both sums in one step.
*
* Since VSX provides 16-byte vector operations, we process 16 bytes at a
* time; with N = 16:
*
* s1 = s1_16 = s1_0 + sum(i=1 to 16) c[i]
* s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16) (16-i+1)*c[i]
*
* so a single vector iteration advances the adler32 sums by 16 bytes.
*
* For more background about adler32 please check the RFC:
* https://www.ietf.org/rfc/rfc1950.txt
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"
/* Vector across sum unsigned int (saturate). */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
__b = vec_sld(__a, __a, 8);
__b = vec_add(__b, __a);
__a = vec_sld(__b, __b, 4);
__a = vec_add(__a, __b);
return __a;
}
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(s1, buf, s2);
/* If buffer is empty or len=0 we need to return adler initial value. */
if (UNLIKELY(buf == NULL))
return 1;
/* This is faster than VSX code for len < 64. */
if (len < 64)
return adler32_len_64(s1, buf, len, s2);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
6, 5, 4, 3, 2, 1};
const vector unsigned char vsh = vec_splat_u8(4);
const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
vector unsigned int vs1 = { 0 };
vector unsigned int vs2 = { 0 };
vector unsigned int vs1_save = { 0 };
vector unsigned int vsum1, vsum2;
vector unsigned char vbuf;
int n;
vs1[0] = s1;
vs2[0] = s2;
/* Do length bigger than NMAX in blocks of NMAX size. */
while (len >= NMAX) {
len -= NMAX;
n = NMAX / 16;
do {
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
} while (--n);
/* Do this once per block of NMAX size. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
vs1[0] = vs1[0] % BASE;
/* vs2[0] = s2_i + 16*s1_save +
sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
vs2[0] = vs2[0] % BASE;
vs1 = vec_and(vs1, vmask);
vs2 = vec_and(vs2, vmask);
vs1_save = v_zeros;
}
/* len is now less than NMAX, so only one modulo is needed. */
if (len >= 16) {
while (len >= 16) {
len -= 16;
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
}
/* Since the remaining size is always less than NMAX, we do this once. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
}
/* Copy result back to s1, s2 (mod 65521). */
s1 = vs1[0] % BASE;
s2 = vs2[0] % BASE;
/* Process tail (len < 16). */
return adler32_len_16(s1, buf, len, s2);
}
#endif /* POWER8_VSX */
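As a cross-check of the block recurrence derived in the header comment of this file, here is a minimal scalar sketch (plain C, no VSX; all names are illustrative) showing that the closed-form 16-byte update matches the byte-at-a-time definition:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BASE 65521u   /* largest prime smaller than 2^16 */

/* Update (s1, s2) over one 16-byte block, byte at a time. */
static void adler_block_scalar(uint32_t *s1, uint32_t *s2, const uint8_t *c) {
    for (int i = 0; i < 16; i++) {
        *s1 = (*s1 + c[i]) % BASE;
        *s2 = (*s2 + *s1) % BASE;
    }
}

/* Same update using the closed form from the comment:
 *   s1_16 = s1_0 + sum c[i]
 *   s2_16 = s2_0 + 16*s1_0 + sum (16-i+1)*c[i]   (i = 1..16) */
static void adler_block_closed(uint32_t *s1, uint32_t *s2, const uint8_t *c) {
    uint32_t sum = 0, wsum = 0;
    for (int i = 0; i < 16; i++) {
        sum  += c[i];
        wsum += (uint32_t)(16 - i) * c[i];   /* weights 16..1, matching v_mul above */
    }
    *s2 = (*s2 + 16 * *s1 + wsum) % BASE;    /* uses the old s1 (s1_0) */
    *s1 = (*s1 + sum) % BASE;
}

int main(void) {
    uint8_t buf[16];
    for (int i = 0; i < 16; i++) buf[i] = (uint8_t)(i * 7 + 3);
    uint32_t a1 = 1, b1 = 0, a2 = 1, b2 = 0;
    adler_block_scalar(&a1, &b1, buf);
    adler_block_closed(&a2, &b2, buf);
    assert(a1 == a2 && b1 == b2);
    printf("adler32 of block: 0x%08x\n", (b1 << 16) | a1);
    return 0;
}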


@@ -0,0 +1,186 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
* Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "zendian.h"
#include "adler32_p.h"
#define vmx_zero() (vec_splat_u32(0))
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
pair[1] += pair[0];
}
}
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
/* As silly and inefficient as it seems, creating 1 permutation vector to permute
* a 2 element vector from a single load + a subsequent shift is just barely faster
* than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
vector unsigned int adacc, s2acc;
vector unsigned int pair_vec = vec_ld(0, s);
adacc = vec_perm(pair_vec, pair_vec, s0_perm);
#if BYTE_ORDER == LITTLE_ENDIAN
s2acc = vec_sro(pair_vec, shift_vec);
#else
s2acc = vec_slo(pair_vec, shift_vec);
#endif
vector unsigned int zero = vmx_zero();
vector unsigned int s3acc = zero;
vector unsigned int s3acc_0 = zero;
vector unsigned int adacc_prev = adacc;
vector unsigned int adacc_prev_0 = zero;
vector unsigned int s2acc_0 = zero;
vector unsigned int s2acc_1 = zero;
vector unsigned int s2acc_2 = zero;
/* Maintain a running sum of a second half; this might help us break yet another
* data dependency bubble in the sum. */
vector unsigned int adacc_0 = zero;
int num_iter = len / 4;
int rem = len & 3;
for (int i = 0; i < num_iter; ++i) {
vector unsigned char d0 = vec_ld(0, buf);
vector unsigned char d1 = vec_ld(16, buf);
vector unsigned char d2 = vec_ld(32, buf);
vector unsigned char d3 = vec_ld(48, buf);
/* The core operation of the loop, basically
* what is being unrolled below */
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
s2acc = vec_msum(t0, d0, s2acc);
/* interleave dependent sums in here */
adacc_0 = vec_sum4s(d1, adacc_0);
s2acc_0 = vec_msum(t1, d1, s2acc_0);
adacc = vec_sum4s(d2, adacc);
s2acc_1 = vec_msum(t2, d2, s2acc_1);
s2acc_2 = vec_msum(t3, d3, s2acc_2);
adacc_0 = vec_sum4s(d3, adacc_0);
adacc_prev = adacc;
adacc_prev_0 = adacc_0;
buf += 64;
}
adacc = vec_add(adacc, adacc_0);
s3acc = vec_add(s3acc, s3acc_0);
s3acc = vec_sl(s3acc, vec_splat_u32(6));
if (rem) {
adacc_prev = vec_add(adacc_prev_0, adacc_prev);
adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
while (rem--) {
vector unsigned char d0 = vec_ld(0, buf);
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s2acc = vec_msum(t3, d0, s2acc);
adacc_prev = vec_sl(adacc, vec_splat_u32(4));
buf += 16;
}
}
/* Sum up independent second sums */
s2acc = vec_add(s2acc, s2acc_0);
s2acc_2 = vec_add(s2acc_1, s2acc_2);
s2acc = vec_add(s2acc, s2acc_2);
s2acc = vec_add(s2acc, s3acc);
adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
vec_ste(adacc, 0, s);
vec_ste(s2acc, 0, s+1);
}
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
int n = NMAX;
unsigned int done = 0, i;
/* Split Adler-32 into its component sums; the value may be supplied by
* the call sites (e.g. when checksumming a PNG file).
*/
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
pair[0] = adler;
pair[1] = sum2;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
// Align buffer
unsigned int al = 0;
if ((uintptr_t)buf & 0xf) {
al = 16-((uintptr_t)buf & 0xf);
if (al > len) {
al=len;
}
vmx_handle_head_or_tail(pair, buf, al);
done += al;
/* Rather than rebasing, we can reduce the max sums for the
* first round only */
n -= al;
}
for (i = al; i < len; i += n) {
int remaining = (int)(len-i);
n = MIN(remaining, (i == al) ? n : NMAX);
if (n < 16)
break;
vmx_accum32(pair, buf + i, n / 16);
pair[0] %= BASE;
pair[1] %= BASE;
done += (n / 16) * 16;
}
/* Handle the tail elements. */
if (done < len) {
vmx_handle_head_or_tail(pair, (buf + done), len - done);
pair[0] %= BASE;
pair[1] %= BASE;
}
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
return (pair[1] << 16) | pair[0];
}
#endif
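The chunking above (process at most NMAX bytes, reduce mod BASE, repeat) relies on NMAX being the largest number of bytes whose sums cannot overflow an unsigned 32-bit accumulator. A standalone sketch of where that constant comes from (it reproduces zlib's NMAX = 5552; the bound assumes worst-case 0xff input starting from already-reduced sums):

#include <stdint.h>
#include <stdio.h>

#define BASE 65521u

/* NMAX is the largest n such that 255*n*(n+1)/2 + (n+1)*(BASE-1) <= 2^32-1,
 * i.e. the worst-case s2 after n bytes of 0xff still fits in 32 bits. */
int main(void) {
    const uint64_t limit = UINT64_C(0xffffffff);
    uint32_t n = 1;
    while (255ull * n * (n + 1) / 2 + (uint64_t)(n + 1) * (BASE - 1) <= limit)
        n++;
    printf("NMAX = %u\n", n - 1);   /* prints 5552 */
    return 0;
}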


@@ -0,0 +1,50 @@
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"
typedef vector unsigned char chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = vec_xl(0, s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vec_xst(*chunk, 0, out);
}
#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_power8
#include "inffast_tpl.h"
#endif
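For context, a portable sketch of what the splat helpers above are used for: when inflate expands a back-reference whose distance is 2, 4 or 8 bytes, the repeating pattern can be broadcast into one 16-byte chunk and then stored chunk by chunk. Here memcpy stands in for vec_splats/vec_xst and the function names are illustrative:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define CHUNK_SIZE 16

/* Broadcast a 2-byte pattern across a 16-byte chunk (scalar stand-in for
 * chunkmemset_2 + vec_splats). */
static void chunkmemset_2_scalar(const uint8_t *from, uint8_t chunk[CHUNK_SIZE]) {
    for (int i = 0; i < CHUNK_SIZE; i += 2)
        memcpy(chunk + i, from, 2);
}

/* Expand a distance-2 repeat by storing whole chunks, then the tail. */
static void copy_dist2(uint8_t *out, const uint8_t *pattern, size_t len) {
    uint8_t chunk[CHUNK_SIZE];
    chunkmemset_2_scalar(pattern, chunk);
    while (len >= CHUNK_SIZE) {
        memcpy(out, chunk, CHUNK_SIZE);   /* scalar stand-in for storechunk/vec_xst */
        out += CHUNK_SIZE;
        len -= CHUNK_SIZE;
    }
    memcpy(out, chunk, len);
}

int main(void) {
    uint8_t dst[35];
    const uint8_t pat[2] = { 'a', 'b' };
    copy_dist2(dst, pat, 34);
    dst[34] = '\0';
    printf("%s\n", dst);   /* prints "abab..." (34 characters) */
    return 0;
}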


@@ -0,0 +1,66 @@
/* compare256_power9.c - Power9 version of compare256
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER9
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "zendian.h"
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
#if BYTE_ORDER == LITTLE_ENDIAN
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
#else
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
#endif
#else
# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
#endif
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0, cmplen;
do {
vector unsigned char vsrc0, vsrc1, vc;
vsrc0 = *((vector unsigned char *)src0);
vsrc1 = *((vector unsigned char *)src1);
/* Compare 16 bytes at a time. Each byte of vc will be either
* all ones or all zeroes, depending on the result of the comparison. */
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
/* Since matching byte positions in vc contain only zeroes (we used
* cmpne), counting the number of consecutive bytes, starting from the
* first, whose LSB == 0 is the same as counting the length of the match. */
zng_vec_vctzlsbb(vc, cmplen);
if (cmplen != 16)
return len + cmplen;
src0 += 16, src1 += 16, len += 16;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
return compare256_power9_static(src0, src1);
}
#define LONGEST_MATCH longest_match_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#endif
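A portable 64-bit sketch of the same idea as compare256_power9_static above: XOR fixed-size chunks and, on the first mismatch, convert the position of the lowest differing byte into the match length with a count-trailing-zeros (GCC/Clang's __builtin_ctzll stands in for vctzlsbb; little-endian is assumed):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Return the length of the common prefix of src0 and src1, up to 256 bytes.
 * On little-endian, the lowest-addressed differing byte shows up in the least
 * significant differing bits of the XOR, so ctz/8 is the byte index. */
static uint32_t compare256_scalar64(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    do {
        uint64_t a, b;
        memcpy(&a, src0, 8);
        memcpy(&b, src1, 8);
        uint64_t diff = a ^ b;
        if (diff != 0)
            return len + (uint32_t)(__builtin_ctzll(diff) >> 3);
        src0 += 8, src1 += 8, len += 8;
    } while (len < 256);
    return 256;
}

int main(void) {
    uint8_t x[256], y[256];
    memset(x, 0x55, sizeof(x));
    memcpy(y, x, sizeof(y));
    y[37] ^= 1;                                  /* first mismatch at offset 37 */
    printf("%u\n", compare256_scalar64(x, y));   /* prints 37 */
    return 0;
}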

File diff suppressed because it is too large.


@@ -0,0 +1,587 @@
/* crc32 for POWER8 using VSX instructions
* Copyright (C) 2021 IBM Corporation
*
* Author: Rogerio Alves <rogealve@br.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* This code uses gcc vector builtins instead of using assembly directly.
*/
#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"
#include "crc32_constants.h"
#include "crc32_braid_tbl.h"
#include "power_intrins.h"
#define MAX_SIZE 32768
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
while (len--)
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
return crc;
}
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
unsigned int prealign;
unsigned int tail;
unsigned long len = (unsigned long) _len;
if (p == (const unsigned char *) 0x0)
return 0;
crc ^= 0xffffffff;
if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
crc = crc32_align(crc, p, len);
goto out;
}
if ((unsigned long)p & VMX_ALIGN_MASK) {
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
crc = crc32_align(crc, p, prealign);
len -= prealign;
p += prealign;
}
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
tail = len & VMX_ALIGN_MASK;
if (tail) {
p += len & ~VMX_ALIGN_MASK;
crc = crc32_align(crc, p, tail);
}
out:
crc ^= 0xffffffff;
return crc;
}
/* When a load and a store in a single dispatch group have overlapping addresses
* such that store forwarding is not allowed (load-hit-store), the group must be
* flushed. A group-ending NOP prevents the flush.
*/
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif
#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
const __vector unsigned long long vmask_64bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
__vector unsigned long long vcrc;
__vector unsigned long long vconst1, vconst2;
/* vdata0-vdata7 will contain our data (p). */
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
/* v0-v7 will contain our checksums */
__vector unsigned long long v0 = {0,0};
__vector unsigned long long v1 = {0,0};
__vector unsigned long long v2 = {0,0};
__vector unsigned long long v3 = {0,0};
__vector unsigned long long v4 = {0,0};
__vector unsigned long long v5 = {0,0};
__vector unsigned long long v6 = {0,0};
__vector unsigned long long v7 = {0,0};
/* Vector auxiliary variables. */
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
unsigned int offset; /* Constant table offset. */
unsigned long i; /* Counter. */
unsigned long chunks;
unsigned long block_size;
int next_block = 0;
/* Round the length down to a multiple of 128 bytes; the last partial block is processed at the end. */
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
/* Short version. */
if (len < 256) {
/* Calculate where in the constant table we need to start. */
offset = 256 - len;
vconst1 = vec_ld(offset, vcrc_short_const);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
/* xor initial value */
vdata0 = vec_xor(vdata0, vcrc);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
for (i = 16; i < len; i += 16) {
vconst1 = vec_ld(offset + i, vcrc_short_const);
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
}
} else {
/* Load initial values. */
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
/* xor in initial value */
vdata0 = vec_xor(vdata0, vcrc);
p = (char *)p + 128;
do {
/* Checksum in blocks of MAX_SIZE. */
block_size = length;
if (block_size > MAX_SIZE) {
block_size = MAX_SIZE;
}
length = length - block_size;
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
offset = (MAX_SIZE/8) - (block_size/8);
/* We reduce our final 128 bytes in a separate step */
chunks = (block_size/128)-1;
vconst1 = vec_ld(offset, vcrc_const);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
if (chunks > 1) {
offset += 16;
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
/*
* main loop. Each iteration calculates the CRC for a 128-byte
* block.
*/
for (i = 0; i < chunks-2; i++) {
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
GROUP_ENDING_NOP;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst2);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst2);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
vdata2, (__vector unsigned long long)vconst2);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst2);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
}
/* First cool down */
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
}/* else */
/* Second cool down. */
v0 = vec_xor(v0, va0);
v1 = vec_xor(v1, va1);
v2 = vec_xor(v2, va2);
v3 = vec_xor(v3, va3);
v4 = vec_xor(v4, va4);
v5 = vec_xor(v5, va5);
v6 = vec_xor(v6, va6);
v7 = vec_xor(v7, va7);
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
(__vector unsigned char)vzero, 4);
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
(__vector unsigned char)vzero, 4);
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
(__vector unsigned char)vzero, 4);
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
(__vector unsigned char)vzero, 4);
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
(__vector unsigned char)vzero, 4);
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
(__vector unsigned char)vzero, 4);
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
(__vector unsigned char)vzero, 4);
/* xor with the last 1024 bits. */
va0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(va0, va0, va0, vperm_const);
va1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(va1, va1, va1, vperm_const);
va2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(va2, va2, va2, vperm_const);
va3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(va3, va3, va3, vperm_const);
va4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(va4, va4, va4, vperm_const);
va5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(va5, va5, va5, vperm_const);
va6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(va6, va6, va6, vperm_const);
va7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(va7, va7, va7, vperm_const);
p = (char *)p + 128;
vdata0 = vec_xor(v0, va0);
vdata1 = vec_xor(v1, va1);
vdata2 = vec_xor(v2, va2);
vdata3 = vec_xor(v3, va3);
vdata4 = vec_xor(v4, va4);
vdata5 = vec_xor(v5, va5);
vdata6 = vec_xor(v6, va6);
vdata7 = vec_xor(v7, va7);
/* Check if we have more blocks to process */
next_block = 0;
if (length != 0) {
next_block = 1;
/* zero v0-v7 */
v0 = vec_xor(v0, v0);
v1 = vec_xor(v1, v1);
v2 = vec_xor(v2, v2);
v3 = vec_xor(v3, v3);
v4 = vec_xor(v4, v4);
v5 = vec_xor(v5, v5);
v6 = vec_xor(v6, v6);
v7 = vec_xor(v7, v7);
}
length = length + 128;
} while (next_block);
/* Calculate how many bytes we have left. */
length = (len & 127);
/* Calculate where in (short) constant table we need to start. */
offset = 128 - length;
v0 = vec_ld(offset, vcrc_short_const);
v1 = vec_ld(offset + 16, vcrc_short_const);
v2 = vec_ld(offset + 32, vcrc_short_const);
v3 = vec_ld(offset + 48, vcrc_short_const);
v4 = vec_ld(offset + 64, vcrc_short_const);
v5 = vec_ld(offset + 80, vcrc_short_const);
v6 = vec_ld(offset + 96, vcrc_short_const);
v7 = vec_ld(offset + 112, vcrc_short_const);
offset += 128;
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)v0);
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata1, (__vector unsigned int)v1);
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata2, (__vector unsigned int)v2);
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata3, (__vector unsigned int)v3);
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata4, (__vector unsigned int)v4);
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata5, (__vector unsigned int)v5);
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata6, (__vector unsigned int)v6);
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata7, (__vector unsigned int)v7);
/* Now reduce the tail (0-112 bytes). */
for (i = 0; i < length; i+=16) {
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
va0 = vec_ld(offset + i,vcrc_short_const);
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)va0);
v0 = vec_xor(v0, va0);
}
/* xor all parallel chunks together. */
v0 = vec_xor(v0, v1);
v2 = vec_xor(v2, v3);
v4 = vec_xor(v4, v5);
v6 = vec_xor(v6, v7);
v0 = vec_xor(v0, v2);
v4 = vec_xor(v4, v6);
v0 = vec_xor(v0, v4);
}
/* Barrett Reduction */
vconst1 = vec_ld(0, v_Barrett_const);
vconst2 = vec_ld(16, v_Barrett_const);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)v0, 8);
v0 = vec_xor(v1,v0);
/* shift left one bit */
__vector unsigned char vsht_splat = vec_splat_u8 (1);
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
v0 = vec_and(v0, vmask_64bit);
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
/* bottom 32 bits of a */
v1 = vec_and(v0, vmask_32bit);
/* ma */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst1);
/* bottom 32bits of ma */
v1 = vec_and(v1, vmask_32bit);
/* qn */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Since we are bit reflected, the result (i.e. the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes:
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
/* Shift the result into the top 64 bits. */
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
#if BYTE_ORDER == BIG_ENDIAN
return v0[0];
#else
return v0[1];
#endif
}
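The folding and Barrett steps above are built on carry-less (GF(2)) multiplication: vpmsumd multiplies 64-bit lanes with XOR in place of addition and XORs the two lane products together. A scalar sketch of a single 64x64 -> 128-bit carry-less multiply, for illustration only (not how the builtin is implemented):

#include <stdint.h>
#include <stdio.h>

/* Carry-less multiply: ordinary long multiplication, but partial products
 * are combined with XOR instead of addition (GF(2) polynomial arithmetic). */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
    uint64_t rl = 0, rh = 0;
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            rl ^= a << i;
            if (i != 0)
                rh ^= a >> (64 - i);
        }
    }
    *hi = rh;
    *lo = rl;
}

int main(void) {
    uint64_t hi, lo;
    /* (x^2 + 1) * (x + 1) = x^3 + x^2 + x + 1 over GF(2): 0x5 clmul 0x3 = 0xF */
    clmul64(0x5, 0x3, &hi, &lo);
    printf("0x%016llx%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}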


@@ -0,0 +1,49 @@
/* power_features.c - POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef POWER_NEED_AUXVEC_H
# include <linux/auxvec.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "zbuild.h"
#include "power_features.h"
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
unsigned long hwcap;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
hwcap = getauxval(AT_HWCAP);
#endif
if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
features->has_altivec = 1;
#endif
#ifdef POWER_FEATURES
unsigned long hwcap2;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
hwcap2 = getauxval(AT_HWCAP2);
#endif
#ifdef POWER8_VSX
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
features->has_arch_3_00 = 1;
#endif
#endif
}


@@ -0,0 +1,18 @@
/* power_features.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_FEATURES_H_
#define POWER_FEATURES_H_
struct power_cpu_features {
int has_altivec;
int has_arch_2_07;
int has_arch_3_00;
};
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
#endif /* POWER_FEATURES_H_ */


@@ -0,0 +1,67 @@
/* power_functions.h -- POWER implementations for arch-specific functions.
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_FUNCTIONS_H_
#define POWER_FUNCTIONS_H_
#ifdef PPC_VMX
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
void slide_hash_vmx(deflate_state *s);
#endif
#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void);
uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER9
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Power - VMX
# if defined(PPC_VMX) && defined(__ALTIVEC__)
# undef native_adler32
# define native_adler32 adler32_vmx
# undef native_slide_hash
# define native_slide_hash slide_hash_vmx
# endif
// Power8 - VSX
# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_adler32
# define native_adler32 adler32_power8
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_power8
# undef native_chunksize
# define native_chunksize chunksize_power8
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_power8
# undef native_slide_hash
# define native_slide_hash slide_hash_power8
# endif
# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_crc32
# define native_crc32 crc32_power8
# endif
// Power9
# if defined(POWER9) && defined(_ARCH_PWR9)
# undef native_compare256
# define native_compare256 compare256_power9
# undef native_longest_match
# define native_longest_match longest_match_power9
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_power9
# endif
#endif
#endif /* POWER_FUNCTIONS_H_ */
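A minimal sketch of how the feature flags from power_features.h and the per-arch entry points declared above are typically tied together at start-up. The dispatcher below is hypothetical and the generic fallback name adler32_c is an assumption; zlib-ng's real runtime dispatch goes through its function table. The prototypes are restated locally (without Z_INTERNAL) so the sketch stands alone:

#include <stdint.h>
#include <stddef.h>

/* Restated from power_features.h / power_functions.h above; adler32_c is assumed. */
struct power_cpu_features { int has_altivec; int has_arch_2_07; int has_arch_3_00; };
void power_check_features(struct power_cpu_features *features);
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);

typedef uint32_t (*adler32_fn)(uint32_t adler, const uint8_t *buf, size_t len);

/* Pick an adler32 implementation once, based on the detected CPU features. */
static adler32_fn pick_adler32(void) {
    struct power_cpu_features f = {0, 0, 0};
    power_check_features(&f);
    if (f.has_arch_2_07)    /* POWER8 or newer: VSX path */
        return adler32_power8;
    if (f.has_altivec)      /* older PowerPC with AltiVec/VMX */
        return adler32_vmx;
    return adler32_c;       /* portable fallback */
}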


@@ -0,0 +1,34 @@
/* Helper functions to work around issues with clang builtins
* Copyright (C) 2021 IBM Corporation
*
* Authors:
* Daniel Black <daniel@linux.vnet.ibm.com>
* Rogerio Alves <rogealve@br.ibm.com>
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_INTRINS_H
#define POWER_INTRINS_H
#if defined (__clang__)
/*
* These stubs fix clang incompatibilities with GCC builtins.
*/
#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif
static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}
#endif
#endif


@@ -0,0 +1,12 @@
/* Optimized slide_hash for POWER processors
* Copyright (C) 2019-2020 IBM Corporation
* Author: Matheus Castanho <msc@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"
#endif /* POWER8_VSX */


@@ -0,0 +1,10 @@
/* Optimized slide_hash for PowerPC processors with VMX instructions
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"
#endif /* PPC_VMX */


@@ -0,0 +1,32 @@
/* Optimized slide_hash for PowerPC processors
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
const vector unsigned short vmx_wsize = vec_splats(wsize);
Pos *p = table;
do {
vector unsigned short value, result;
value = vec_ld(0, p);
result = vec_subs(value, vmx_wsize);
vec_st(result, 0, p);
p += 8;
entries -= 8;
} while (entries > 0);
}
void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
uint16_t wsize = (uint16_t)s->w_size;
slide_hash_chain(s->head, HASH_SIZE, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}
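For reference, a scalar sketch of what the vec_subs loop above computes: every hash-table entry is moved down by the window size with unsigned saturation, so entries that pointed before the new window become 0 (NIL). The names below are illustrative:

#include <stdint.h>
#include <stdio.h>

typedef uint16_t Pos;

/* Scalar equivalent of slide_hash_chain: subtract wsize from every entry
 * with unsigned saturation, exactly what vec_subs does 8 entries at a time. */
static void slide_hash_chain_scalar(Pos *table, uint32_t entries, uint16_t wsize) {
    for (uint32_t i = 0; i < entries; i++)
        table[i] = (table[i] >= wsize) ? (Pos)(table[i] - wsize) : 0;
}

int main(void) {
    Pos head[8] = { 0, 100, 32768, 32769, 40000, 65535, 5, 32767 };
    slide_hash_chain_scalar(head, 8, 32768);
    for (int i = 0; i < 8; i++)
        printf("%u ", head[i]);   /* prints: 0 0 0 1 7232 32767 0 0 */
    printf("\n");
    return 0;
}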