fix: breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s

This commit is contained in:
tqcq
2025-08-25 15:24:22 +08:00
parent a58517497b
commit 68b2e7f763
728 changed files with 489652 additions and 1211 deletions

View File

@@ -0,0 +1,116 @@
# Build script for the zlib-ng benchmark suite.
# Uses a preinstalled Google Benchmark when available, otherwise fetches it.
cmake_minimum_required(VERSION 3.12)

include(FetchContent)

# Default to C++11 only when the enclosing project has not already chosen a
# standard, so a parent build's settings win.
if(NOT DEFINED CMAKE_CXX_STANDARD)
    set(CMAKE_CXX_STANDARD 11)
endif()
if(NOT DEFINED CMAKE_CXX_STANDARD_REQUIRED)
    set(CMAKE_CXX_STANDARD_REQUIRED ON)
endif()
if(NOT DEFINED CMAKE_CXX_EXTENSIONS)
    set(CMAKE_CXX_EXTENSIONS ON)
endif()

enable_language(CXX)

# Search for Google benchmark package
find_package(benchmark QUIET)
if(NOT benchmark_FOUND)
    # Fetch Google benchmark source code from the official repository
    set(BENCHMARK_ENABLE_TESTING OFF)

    # Allow specifying an alternative Google benchmark repository and tag
    if(NOT DEFINED GBENCHMARK_REPOSITORY)
        set(GBENCHMARK_REPOSITORY https://github.com/google/benchmark.git)
    endif()
    if(NOT DEFINED GBENCHMARK_TAG)
        set(GBENCHMARK_TAG v1.7.1)
    endif()

    FetchContent_Declare(benchmark
        GIT_REPOSITORY ${GBENCHMARK_REPOSITORY}
        GIT_TAG ${GBENCHMARK_TAG})

    FetchContent_GetProperties(benchmark)
    if(NOT benchmark_POPULATED)
        FetchContent_Populate(benchmark)
        add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR} EXCLUDE_FROM_ALL)
    endif()
endif()

add_executable(benchmark_zlib
    benchmark_adler32.cc
    benchmark_adler32_copy.cc
    benchmark_compare256.cc
    benchmark_compare256_rle.cc
    benchmark_compress.cc
    benchmark_crc32.cc
    benchmark_main.cc
    benchmark_slidehash.cc
    benchmark_uncompress.cc
    )

# Compile definitions take plain names; the -D prefix belongs only on raw
# compiler flags.
target_compile_definitions(benchmark_zlib PRIVATE BENCHMARK_STATIC_DEFINE)
target_include_directories(benchmark_zlib PRIVATE
    ${PROJECT_SOURCE_DIR}
    ${PROJECT_BINARY_DIR}
    ${benchmark_SOURCE_DIR}/benchmark/include)

target_link_libraries(benchmark_zlib PRIVATE zlibstatic benchmark::benchmark)
if(WIN32)
    target_link_libraries(benchmark_zlib PRIVATE shlwapi)
endif()

add_test(NAME benchmark_zlib
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib>)

if(WITH_BENCHMARK_APPS)
    option(BUILD_ALT_BENCH "Link against alternative zlib implementation" OFF)

    # Search for libpng package
    find_package(PNG QUIET)
    if(NOT PNG_FOUND)
        FetchContent_Declare(PNG
            GIT_REPOSITORY https://github.com/glennrp/libpng.git)
        FetchContent_GetProperties(PNG)
        if(NOT PNG_POPULATED)
            FetchContent_Populate(PNG)
            set(PNG_INCLUDE_DIR ${png_SOURCE_DIR})
            add_subdirectory(${png_SOURCE_DIR} ${png_BINARY_DIR})
        endif()
    endif()

    set(BENCH_APP_SRCS
        benchmark_png_encode.cc
        benchmark_png_decode.cc
        benchmark_main.cc
        )

    add_executable(benchmark_zlib_apps ${BENCH_APP_SRCS})

    # Fix: option() above always DEFINES the variable, so testing
    # `if(DEFINED BUILD_ALT_BENCH)` was unconditionally true and the _alt
    # target was built even with the option OFF.  Test the value instead.
    if(BUILD_ALT_BENCH)
        set(ZLIB_ALT_LIB "libz.a" CACHE FILEPATH "Optional alternative zlib implementation (defaults to stock zlib)")
        add_executable(benchmark_zlib_apps_alt ${BENCH_APP_SRCS})
        target_link_libraries(benchmark_zlib_apps_alt PRIVATE libpng.a ${ZLIB_ALT_LIB} benchmark::benchmark)
        target_compile_definitions(benchmark_zlib_apps_alt PRIVATE BUILD_ALT=1)
        target_include_directories(benchmark_zlib_apps_alt PRIVATE
            ${PROJECT_SOURCE_DIR}
            ${PROJECT_BINARY_DIR}
            ${PNG_INCLUDE_DIR}
            ${benchmark_SOURCE_DIR}/benchmark/include)
    endif()

    target_include_directories(benchmark_zlib_apps PRIVATE
        ${PROJECT_SOURCE_DIR}
        ${PROJECT_BINARY_DIR}
        ${PNG_INCLUDE_DIR}
        ${benchmark_SOURCE_DIR}/benchmark/include)

    # We need the static png library if we're statically linking to zlib,
    # otherwise it will resolve these things in the system provided dynamic
    # libraries (likely linked to stock zlib)
    target_link_libraries(benchmark_zlib_apps PRIVATE libpng.a zlibstatic benchmark::benchmark)
endif()

View File

@@ -0,0 +1,47 @@
## Benchmarks
These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark).
*Repetitions*
To increase the number of times each benchmark iteration is run use:
```
--benchmark_repetitions=20
```
*Filters*
To filter out which benchmarks are performed use:
```
--benchmark_filter="adler32*"
```
There are two different benchmarks, micro and macro.
### Benchmark benchmark_zlib
These are microbenchmarks intended to test lower level subfunctions of the library.
Benchmarks include implementations of:
- Adler32
- CRC
- 256 byte comparisons
- SIMD accelerated "slide hash" routine
By default these benchmarks report things on the nanosecond scale and are small enough
to measure very minute differences.
### Benchmark benchmark_zlib_apps
These benchmarks measure applications of zlib as a whole. Currently the only examples
are PNG encoding and decoding. The PNG encode and decode tests leverage procedurally
generated and highly compressible image data.
Additionally, a test called `png_decode_realistic` will decode any RGB 8 BPP encoded
set of PNGs in the working directory under a directory named "test_pngs", with files named
{0..9}.png. If these images do not exist, the test will report an error and the benchmark
will move on to the next set of benchmarks.
*benchmark_zlib_apps_alt*
The user can compile a comparison benchmark application that links against any
zlib-compatible implementation of their choosing.

View File

@@ -0,0 +1,100 @@
/* benchmark_adler32.cc -- benchmark adler32 variants
* Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <assert.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
#define MAX_RANDOM_INTS (1024 * 1024)
#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
/* Fixture owning a large buffer of pseudo-random words used as checksum input. */
class adler32: public benchmark::Fixture {
private:
    uint32_t *input_words;

public:
    void SetUp(const ::benchmark::State& state) {
        /* Allocate through the library allocator so buffer alignment is the
         * same for every run.  With AVX512, an unluckily aligned allocation
         * crosses a cacheline boundary on every load, which would make one
         * run incomparable to the next.  The point is consistency of the
         * results, not to favor any particular SIMD width. */
        input_words = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
        assert(input_words != NULL);
        int32_t n = 0;
        while (n < MAX_RANDOM_INTS) {
            input_words[n++] = rand();
        }
    }

    /* Run one adler32 implementation over the first range(0) bytes. */
    void Bench(benchmark::State& state, adler32_func hash_fn) {
        uint32_t sum = 0;
        for (auto _ : state) {
            sum = hash_fn(sum, (const unsigned char *)input_words, (size_t)state.range(0));
        }
        benchmark::DoNotOptimize(sum);
    }

    void TearDown(const ::benchmark::State& state) {
        zng_free(input_words);
    }
};
/* Declares a fixture benchmark named <name> running <fptr> and registers it
 * over input sizes from 1 byte to 4 MiB.  When <support_flag> is false the
 * run is skipped with an error instead of executing instructions the host
 * CPU cannot run (SkipWithError makes the state loop in Bench exit without
 * iterating). */
#define BENCHMARK_ADLER32(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, fptr); \
} \
BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10)
/* The portable C implementation is always available. */
BENCHMARK_ADLER32(c, adler32_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
/* Without runtime dispatch only the natively selected variant exists. */
BENCHMARK_ADLER32(native, native_adler32, 1);
#else
/* Architecture-specific variants, each gated on a compile-time macro plus a
 * runtime CPU-feature flag. */
#ifdef ARM_NEON
BENCHMARK_ADLER32(neon, adler32_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef PPC_VMX
BENCHMARK_ADLER32(vmx, adler32_vmx, test_cpu_features.power.has_altivec);
#endif
#ifdef POWER8_VSX
BENCHMARK_ADLER32(power8, adler32_power8, test_cpu_features.power.has_arch_2_07);
#endif
#ifdef RISCV_RVV
BENCHMARK_ADLER32(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
#endif
#ifdef X86_SSSE3
BENCHMARK_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3);
#endif
#ifdef X86_AVX2
BENCHMARK_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2);
#endif
#ifdef X86_AVX512
BENCHMARK_ADLER32(avx512, adler32_avx512, test_cpu_features.x86.has_avx512_common);
#endif
#ifdef X86_AVX512VNNI
BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
#endif
#endif

View File

@@ -0,0 +1,130 @@
/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
* Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
#define MAX_RANDOM_INTS (1024 * 1024)
#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
/* Fixture for adler32 variants that fold a memcpy into the checksum pass. */
class adler32_copy: public benchmark::Fixture {
private:
    uint32_t *src_words;
    uint32_t *dst_words;

public:
    void SetUp(const ::benchmark::State& state) {
        /* zng_alloc keeps allocation alignment identical across runs; with
         * AVX512 an unluckily aligned buffer crosses a cacheline on every
         * load.  Controlled alignment keeps benchmark results comparable to
         * each other rather than favoring any one SIMD flavor. */
        src_words = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
        dst_words = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
        assert(src_words != NULL);
        assert(dst_words != NULL);
        int32_t n = 0;
        while (n < MAX_RANDOM_INTS) {
            src_words[n++] = rand();
        }
    }

    /* Checksum the first range(0) bytes of src while copying them to dst. */
    void Bench(benchmark::State& state, adler32_cpy_func checksum_copy) {
        uint32_t sum = 0;
        for (auto _ : state) {
            sum = checksum_copy(sum, (unsigned char *)dst_words,
                                (const unsigned char*)src_words, (size_t)state.range(0));
        }
        benchmark::DoNotOptimize(sum);
    }

    void TearDown(const ::benchmark::State& state) {
        zng_free(src_words);
        zng_free(dst_words);
    }
};
/* Registers a benchmark for a fused checksum+copy implementation over input
 * sizes 8 KiB .. MAX_RANDOM_INTS_SIZE; skipped when the CPU lacks support. */
#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, fptr); \
} \
BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
/* Baseline variant: a plain checksum implementation wrapped in a lambda that
 * performs the copy with memcpy first, for comparison against the fused
 * copy implementations above. */
#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, [](uint32_t init_sum, unsigned char *dst, \
const uint8_t *buf, size_t len) -> uint32_t { \
memcpy(dst, buf, (size_t)len); \
return fptr(init_sum, buf, len); \
}); \
} \
BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_ADLER32_BASELINE_COPY(native, native_adler32, 1);
#else
#ifdef ARM_NEON
/* If we inline this copy for neon, the function would go here */
//BENCHMARK_ADLER32_COPY(neon, adler32_neon, test_cpu_features.arm.has_neon);
BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef PPC_VMX
//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, test_cpu_features.power.has_altivec);
BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, test_cpu_features.power.has_altivec);
#endif
#ifdef POWER8_VSX
//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, test_cpu_features.power.has_arch_2_07);
BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, test_cpu_features.power.has_arch_2_07);
#endif
#ifdef RISCV_RVV
//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
#endif
#ifdef X86_SSE42
BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, test_cpu_features.x86.has_sse42);
#endif
#ifdef X86_AVX2
BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, test_cpu_features.x86.has_avx2);
#endif
#ifdef X86_AVX512
BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common);
BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, test_cpu_features.x86.has_avx512_common);
#endif
#ifdef X86_AVX512VNNI
BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
#endif
#endif

View File

@@ -0,0 +1,93 @@
/* benchmark_compare256.cc -- benchmark compare256 variants
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
# include "arch/generic/compare256_p.h"
}
#define MAX_COMPARE_SIZE (256)
/* Fixture holding two identical 256-byte strings; a mismatch byte is
 * planted at a chosen offset before each timed run. */
class compare256: public benchmark::Fixture {
private:
    uint8_t *lhs;
    uint8_t *rhs;

public:
    void SetUp(const ::benchmark::State& state) {
        lhs = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
        assert(lhs != NULL);
        memset(lhs, 'a', MAX_COMPARE_SIZE);
        rhs = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
        assert(rhs != NULL);
        memset(rhs, 'a', MAX_COMPARE_SIZE);
    }

    /* Plant a mismatch at offset range(0)-1, time the comparison, then
     * restore the byte so the buffers can be reused. */
    void Bench(benchmark::State& state, compare256_func cmp_fn) {
        int32_t mismatch_at = (int32_t)state.range(0) - 1;
        uint32_t matched = 0;
        rhs[mismatch_at] = 0;
        for (auto _ : state) {
            matched = cmp_fn((const uint8_t *)lhs, (const uint8_t *)rhs);
        }
        rhs[mismatch_at] = 'a';
        benchmark::DoNotOptimize(matched);
    }

    void TearDown(const ::benchmark::State& state) {
        zng_free(lhs);
        zng_free(rhs);
    }
};
/* Registers a compare256 variant over mismatch offsets 1..256; skipped with
 * an error when the CPU lacks the needed feature. */
#define BENCHMARK_COMPARE256(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, fptr); \
} \
BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_COMPARE256(native, native_compare256, 1);
#else
/* Generic implementations compared 8/16/32/64 bits at a time. */
BENCHMARK_COMPARE256(8, compare256_8, 1);
BENCHMARK_COMPARE256(16, compare256_16, 1);
#if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(32, compare256_32, 1);
#endif
#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(64, compare256_64, 1);
#endif
/* SIMD implementations, gated on compile-time and runtime support. */
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef POWER9
BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00);
#endif
#ifdef RISCV_RVV
BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
#endif
#endif

View File

@@ -0,0 +1,69 @@
/* benchmark_compare256_rle.cc -- benchmark compare256_rle variants
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# include "compare256_rle.h"
}
#define MAX_COMPARE_SIZE (256)
/* Fixture for the RLE flavor of compare256: two identical buffers with a
 * mismatch byte inserted at a controlled offset per run. */
class compare256_rle: public benchmark::Fixture {
private:
    uint8_t *lhs;
    uint8_t *rhs;

public:
    void SetUp(const ::benchmark::State& state) {
        lhs = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
        assert(lhs != NULL);
        memset(lhs, 'a', MAX_COMPARE_SIZE);
        rhs = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
        assert(rhs != NULL);
        memset(rhs, 'a', MAX_COMPARE_SIZE);
    }

    /* Plant a mismatch at offset range(0)-1, time the comparison, then
     * restore the byte for the next run. */
    void Bench(benchmark::State& state, compare256_rle_func rle_cmp) {
        int32_t mismatch_at = (int32_t)state.range(0) - 1;
        uint32_t matched = 0;
        rhs[mismatch_at] = 0;
        for (auto _ : state) {
            matched = rle_cmp((const uint8_t *)lhs, (const uint8_t *)rhs);
        }
        rhs[mismatch_at] = 'a';
        benchmark::DoNotOptimize(matched);
    }

    void TearDown(const ::benchmark::State& state) {
        zng_free(lhs);
        zng_free(rhs);
    }
};
/* Registers a compare256_rle variant over mismatch offsets 1..256; skipped
 * with an error when unsupported. */
#define BENCHMARK_COMPARE256_RLE(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(compare256_rle, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, fptr); \
} \
BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE);
/* Generic implementations compared 8/16/32/64 bits at a time. */
BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
#if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
#endif
#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
#endif

View File

@@ -0,0 +1,67 @@
/* benchmark_compress.cc -- benchmark compress()
* Copyright (C) 2024 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <assert.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# if defined(ZLIB_COMPAT)
# include "zlib.h"
# else
# include "zlib-ng.h"
# endif
}
#define MAX_SIZE (32 * 1024)
/* Fixture benchmarking PREFIX(compress)() on a repeating, highly
 * compressible text buffer of up to MAX_SIZE bytes. */
class compress_bench: public benchmark::Fixture {
private:
    size_t maxlen;      /* capacity of outbuff */
    uint8_t *inbuff;
    uint8_t *outbuff;

public:
    void SetUp(const ::benchmark::State& state) {
        const char teststr[42] = "Hello hello World broken Test tast mello.";
        maxlen = MAX_SIZE;
        inbuff = (uint8_t *)zng_alloc(MAX_SIZE + 1);
        assert(inbuff != NULL);
        outbuff = (uint8_t *)zng_alloc(MAX_SIZE + 1);
        assert(outbuff != NULL);
        /* Fill the input with the repeating 42-byte text pattern. */
        int pos = 0;
        for (int32_t i = 0; i < MAX_SIZE - 42 ; i+=42){
            pos += sprintf((char *)inbuff+pos, "%s", teststr);
        }
    }

    /* Compress the first range(0) bytes of the input per iteration. */
    void Bench(benchmark::State& state) {
        int err = 0;
        for (auto _ : state) {
            /* compress() rewrites *destLen with the compressed size on
             * return, so the available output capacity must be reset every
             * iteration.  Previously &maxlen was passed directly, shrinking
             * the reported capacity after the first call. */
            size_t out_len = maxlen;
            err = PREFIX(compress)(outbuff, &out_len, inbuff, (size_t)state.range(0));
        }
        benchmark::DoNotOptimize(err);
    }

    void TearDown(const ::benchmark::State& state) {
        zng_free(inbuff);
        zng_free(outbuff);
    }
};
/* Defines and registers the compress() benchmark over sizes 1 B .. 32 KiB. */
#define BENCHMARK_COMPRESS(name) \
BENCHMARK_DEFINE_F(compress_bench, name)(benchmark::State& state) { \
Bench(state); \
} \
BENCHMARK_REGISTER_F(compress_bench, name)->Arg(1)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10);
BENCHMARK_COMPRESS(compress_bench);

View File

@@ -0,0 +1,83 @@
/* benchmark_crc32.cc -- benchmark crc32 variants
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <assert.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
#define MAX_RANDOM_INTS (1024 * 1024)
#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
/* Fixture owning a buffer of pseudo-random words used as CRC input. */
class crc32: public benchmark::Fixture {
private:
    uint32_t *input_words;

public:
    void SetUp(const ::benchmark::State& state) {
        input_words = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
        assert(input_words != NULL);
        int32_t n = 0;
        while (n < MAX_RANDOM_INTS) {
            input_words[n++] = rand();
        }
    }

    /* Run one crc32 implementation over the first range(0) bytes. */
    void Bench(benchmark::State& state, crc32_func crc_fn) {
        uint32_t sum = 0;
        for (auto _ : state) {
            sum = crc_fn(sum, (const unsigned char *)input_words, (size_t)state.range(0));
        }
        benchmark::DoNotOptimize(sum);
    }

    void TearDown(const ::benchmark::State& state) {
        zng_free(input_words);
    }
};
/* Declares and registers a crc32 benchmark over sizes 1 B .. 4 MiB, skipping
 * with an error when the CPU lacks the required feature. */
#define BENCHMARK_CRC32(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, fptr); \
} \
BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
/* The portable braid implementation is always available. */
BENCHMARK_CRC32(braid, PREFIX(crc32_braid), 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_CRC32(native, native_crc32, 1);
#else
#ifdef ARM_ACLE
BENCHMARK_CRC32(acle, crc32_acle, test_cpu_features.arm.has_crc32);
#endif
#ifdef POWER8_VSX_CRC32
BENCHMARK_CRC32(power8, crc32_power8, test_cpu_features.power.has_arch_2_07);
#endif
#ifdef S390_CRC32_VX
BENCHMARK_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx);
#endif
#ifdef X86_PCLMULQDQ_CRC
/* CRC32 fold does a memory copy while hashing */
BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq);
#endif
#ifdef X86_VPCLMULQDQ_CRC
/* CRC32 fold does a memory copy while hashing */
BENCHMARK_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
#endif
#endif

View File

@@ -0,0 +1,32 @@
/* benchmark_main.cc -- benchmark suite main entry point
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <benchmark/benchmark.h>
#ifndef BUILD_ALT
extern "C" {
# include "zbuild.h"
# include "../test_cpu_features.h"
# ifndef DISABLE_RUNTIME_CPU_DETECTION
struct cpu_features test_cpu_features;
# endif
}
#endif
/* Benchmark driver entry point: probes CPU features when built against this
 * project's runtime dispatch (not for the alternative-zlib build), then
 * hands control to Google Benchmark. */
int main(int argc, char** argv) {
#ifndef BUILD_ALT
# ifndef DISABLE_RUNTIME_CPU_DETECTION
/* Populate the global feature flags each registered benchmark consults. */
cpu_check_features(&test_cpu_features);
# endif
#endif
::benchmark::Initialize(&argc, argv);
::benchmark::RunSpecifiedBenchmarks();
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,126 @@
#include <stdio.h>
#include <benchmark/benchmark.h>
#include "benchmark_png_shared.h"
#include <assert.h>
/* Benchmarks PNG decoding of pre-encoded in-memory images, one per zlib
 * compression level (0-9). */
class png_decode: public benchmark::Fixture {
protected:
    png_dat inpng[10];
    /* Backing this on the heap is a more realistic benchmark */
    uint8_t *output_img_buf = NULL;

public:
    /* Let's make the vanilla version have something extremely compressible */
    virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
        init_compressible(img_bytes, width*height);
    }

    void SetUp(const ::benchmark::State& state) {
        output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
        assert(output_img_buf != NULL);
        init_img(output_img_buf, IMWIDTH, IMHEIGHT);
        /* Author one encoded PNG per compression level to decode later. */
        for (int lvl = 0; lvl < 10; ++lvl) {
            inpng[lvl] = {NULL, 0, 0};
            encode_png(output_img_buf, &inpng[lvl], lvl, IMWIDTH, IMHEIGHT);
        }
    }

    /* state.range(0) selects which compression level's stream to decode. */
    void Bench(benchmark::State &state) {
        for (auto _ : state) {
            const int level = (int)state.range(0);
            png_parse_dat stream = { inpng[level].buf };
            uint32_t width, height;
            decode_png(&stream, (png_bytepp)&output_img_buf, IMWIDTH * IMHEIGHT * 3, width, height);
        }
    }

    void TearDown(const ::benchmark::State &state) {
        free(output_img_buf);
        for (int lvl = 0; lvl < 10; ++lvl) {
            free(inpng[lvl].buf);
        }
    }
};
/* Decodes real PNG files named test_pngs/{0..9}.png from the working
 * directory.  When any file is missing, SetUp leaves test_files_found false
 * and Bench skips with an error. */
class png_decode_realistic: public png_decode {
private:
    bool test_files_found = false;

public:
    void SetUp(const ::benchmark::State &state) {
        output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
        assert(output_img_buf != NULL);
        /* Initialize every slot up front so TearDown's free() is always safe,
         * even when we bail out early.  The previous code only touched slots
         * [i..10) on the error path (and wrote inpng[i] instead of inpng[j]),
         * leaving earlier slots uninitialized. */
        for (size_t i = 0; i < 10; ++i) {
            inpng[i] = { NULL, 0, 0 };
        }
        char test_fname[25];
        FILE *files[10];
        /* Zero the whole pointer array, not just the first element as the
         * previous memset(files, 0, sizeof(FILE*)) did. */
        memset(files, 0, sizeof(files));
        for (size_t i = 0; i < 10; ++i) {
            /* %zu is the portable conversion for size_t. */
            snprintf(test_fname, sizeof(test_fname), "test_pngs/%zu.png", i);
            FILE *in_img = fopen(test_fname, "r");
            if (in_img == NULL) {
                /* Missing file: close everything opened so far and bail. */
                for (size_t j = 0; j < i; ++j) {
                    if (files[j])
                        fclose(files[j]);
                }
                return;
            }
            files[i] = in_img;
        }
        test_files_found = true;
        /* All ten files exist: slurp each one fully into memory. */
        for (size_t i = 0; i < 10; ++i) {
            FILE *in_file = files[i];
            fseek(in_file, 0, SEEK_END);
            size_t num_bytes = ftell(in_file);
            rewind(in_file);
            uint8_t *raw_file = (uint8_t*)malloc(num_bytes);
            if (raw_file == NULL)
                abort();
            inpng[i].buf = raw_file;
            inpng[i].len = num_bytes;
            inpng[i].buf_rem = 0;
            size_t bytes_read = fread(raw_file, 1, num_bytes, in_file);
            if (bytes_read != num_bytes) {
                fprintf(stderr, "couldn't read all of the bytes for file test_pngs/%zu.png", i);
                abort();
            }
            fclose(in_file);
        }
    }

    void Bench(benchmark::State &state) {
        if (!test_files_found) {
            state.SkipWithError("Test imagery in test_pngs not found");
        }
        png_decode::Bench(state);
    }
};
/* Register both decode fixtures across compression levels 0-9; results are
 * reported in microseconds. */
BENCHMARK_DEFINE_F(png_decode, png_decode)(benchmark::State &state) {
Bench(state);
}
BENCHMARK_REGISTER_F(png_decode, png_decode)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
BENCHMARK_DEFINE_F(png_decode_realistic, png_decode_realistic)(benchmark::State &state) {
Bench(state);
}
BENCHMARK_REGISTER_F(png_decode_realistic, png_decode_realistic)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);

View File

@@ -0,0 +1,54 @@
#include <stdio.h>
#include <assert.h>
#include <benchmark/benchmark.h>
#include "benchmark_png_shared.h"
#define IMWIDTH 1024
#define IMHEIGHT 1024
/* Benchmarks PNG encoding of a procedurally generated, highly compressible
 * RGB image at each zlib compression level. */
class png_encode: public benchmark::Fixture {
private:
    png_dat encoded;
    /* Backing this on the heap is a more realistic benchmark */
    uint8_t *src_pixels = NULL;

public:
    /* Subclasses may override this to supply different image content. */
    virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
        init_compressible(img_bytes, width * height);
    }

    void SetUp(const ::benchmark::State& state) {
        const size_t img_bytes = IMWIDTH * IMHEIGHT * 3;
        src_pixels = (uint8_t*)malloc(img_bytes);
        /* Plain malloc (not zng_alloc) because the write callback may need
         * to realloc this buffer.  width*height*3 is more than enough: even
         * a simple run-length encoding gets well under that. */
        encoded.buf = (uint8_t*)malloc(img_bytes);
        encoded.len = 0;
        encoded.buf_rem = img_bytes;
        assert(src_pixels != NULL);
        assert(encoded.buf != NULL);
        init_img(src_pixels, IMWIDTH, IMHEIGHT);
    }

    /* state.range(0) selects the zlib compression level. */
    void Bench(benchmark::State &state) {
        for (auto _ : state) {
            encode_png((png_bytep)src_pixels, &encoded, state.range(0), IMWIDTH, IMHEIGHT);
            /* Recycle the output buffer for the next iteration. */
            encoded.buf_rem = encoded.len;
            encoded.len = 0;
        }
    }

    void TearDown(const ::benchmark::State &state) {
        free(src_pixels);
        free(encoded.buf);
    }
};
/* Register the encode fixture across compression levels 0-9, reported in
 * microseconds. */
BENCHMARK_DEFINE_F(png_encode, encode_compressible)(benchmark::State &state) {
Bench(state);
}
BENCHMARK_REGISTER_F(png_encode, encode_compressible)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);

View File

@@ -0,0 +1,146 @@
#pragma once
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#define IMWIDTH 1024
#define IMHEIGHT 1024
extern "C" {
# include <png.h>
}
/* Growable in-memory buffer holding an encoded PNG byte stream. */
typedef struct _png_dat {
uint8_t *buf;     /* heap-allocated output bytes */
int64_t len;      /* bytes written so far */
size_t buf_rem;   /* spare capacity remaining beyond len */
} png_dat;
/* Read cursor over an in-memory PNG byte stream. */
typedef struct _png_parse_dat {
uint8_t *cur_pos; /* next byte to hand to libpng */
} png_parse_dat;
/* Custom libpng write callback that appends to an in-memory png_dat buffer
 * instead of a file, keeping disk IO out of the measurements. */
static void png_write_cb(png_structp pngp, png_bytep data, png_size_t len) {
    png_dat *dat = (png_dat*)png_get_io_ptr(pngp);
    size_t new_len = dat->len + len;
    if (dat->buf_rem < len) {
        /* Grow by twice the requested length so reallocs stay rare. */
        size_t new_cap = dat->len + dat->buf_rem + 2 * len;
        dat->buf = (uint8_t*)realloc(dat->buf, new_cap);
        if (dat->buf == NULL) {
            /* Pretty unlikely but we'll put it here just in case */
            fprintf(stderr, "realloc failed, exiting\n");
            exit(1);
        }
        dat->buf_rem += 2 * len;
    }
    memcpy(dat->buf + dat->len, data, len);
    dat->len = new_len;
    dat->buf_rem -= len;
}
/* Fill buf with an extremely compressible RGB (no alpha) test pattern: the
 * first third of the pixels is a solid red channel, the middle third has
 * the second channel set, and the final third the third channel. */
static void init_compressible(png_bytep buf, size_t num_pix) {
    /* The exact content doesn't matter; solid channel stripes make a
     * reasonable, highly compressible test image. */
    int32_t first_stop = num_pix / 3;
    int32_t second_stop = 2 * num_pix / 3;
    int32_t total = num_pix;
    for (int32_t i = 0; i < total; ++i) {
        uint8_t c0 = 0, c1 = 0, c2 = 0;
        if (i < first_stop)
            c0 = 255;
        else if (i < second_stop)
            c1 = 255;
        else
            c2 = 255;
        buf[3 * i] = c0;
        buf[3 * i + 1] = c1;
        buf[3 * i + 2] = c2;
    }
}
/* Encode a width x height 8-bit RGB image from buf into the in-memory
 * outpng stream at the given zlib compression level.
 * Most of this error handling is _likely_ not necessary, and much of the
 * setup could move to the fixture SetUp to avoid measuring it, but for now
 * it lives here. */
static inline void encode_png(png_bytep buf, png_dat *outpng, int32_t comp_level, uint32_t width, uint32_t height) {
    png_structp png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
    if (!png) abort();
    png_infop info = png_create_info_struct(png);
    if (!info) abort();
    png_set_write_fn(png, outpng, png_write_cb, NULL);
    png_bytep *png_row_ptrs = new png_bytep[height];
    /* Use the width/height parameters consistently: the previous code
     * hardcoded the IMWIDTH/IMHEIGHT macros in the row loop and in
     * png_set_IHDR, silently ignoring the arguments. */
    for (uint32_t i = 0; i < height; ++i) {
        png_row_ptrs[i] = (png_bytep)&buf[3*i*width];
    }
    png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
                 PNG_FILTER_TYPE_DEFAULT);
    png_write_info(png, info);
    png_set_compression_level(png, comp_level);
    png_set_filter(png, 0, PNG_FILTER_NONE);
    png_write_image(png, (png_bytepp)png_row_ptrs);
    png_write_end(png, NULL);
    png_destroy_write_struct(&png, &info);
    delete[] png_row_ptrs;
}
/* libpng read callback: hand the next bytes_to_read bytes of the in-memory
 * stream to libpng and advance the cursor. */
static void read_from_pngdat(png_structp png, png_bytep out, png_size_t bytes_to_read) {
    png_parse_dat *stream = (png_parse_dat*)png_get_io_ptr(png);
    uint8_t *src = stream->cur_pos;
    memcpy(out, src, bytes_to_read);
    stream->cur_pos = src + bytes_to_read;
}
/* Decode an in-memory 8 bpp RGB PNG stream into *out_bytes (capacity
 * in_size, grown with realloc when too small).  Width and height are
 * returned through the reference parameters; returns the decoded image
 * size in bytes.  Aborts on non-RGB input or allocation failure. */
static inline int decode_png(png_parse_dat *dat, png_bytepp out_bytes, size_t in_size, uint32_t &width, uint32_t &height) {
    png_structp png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
    if (!png) abort();
    png_infop info = png_create_info_struct(png);
    if (!info) abort();
    png_set_read_fn(png, dat, read_from_pngdat);
    png_read_info(png, info);
    int bit_depth = 0, color_type = -1;
    png_get_IHDR(png, info, &width, &height, &bit_depth, &color_type, NULL, NULL, NULL);
    /* Widen before multiplying to avoid 32-bit overflow on large images. */
    size_t im_size = (size_t)width * height * (bit_depth/8) * 3;
    if (color_type != PNG_COLOR_TYPE_RGB) {
        fprintf(stderr, "expected an 8 bpp RGB image\n");
        abort();
    }
    if (im_size > in_size) {
        /* Check the realloc result before use: the previous code assigned
         * it unconditionally and would deref NULL on failure. */
        png_bytep grown = (png_bytep)realloc(*out_bytes, im_size);
        if (grown == NULL) {
            fprintf(stderr, "realloc failed\n");
            abort();
        }
        *out_bytes = grown;
    }
    png_bytep *out_rows = new png_bytep[height];
    for (size_t i = 0; i < height; ++i)
        out_rows[i] = *out_bytes + (width*i*3);
    png_read_rows(png, out_rows, NULL, height);
    png_destroy_read_struct(&png, &info, NULL);
    delete[] out_rows;
    return im_size;
}

View File

@@ -0,0 +1,98 @@
/* benchmark_slidehash.cc -- benchmark slide_hash variants
* Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <limits.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# include "deflate.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
#define MAX_RANDOM_INTS 32768
/* Fixture providing a minimally initialized deflate_state whose head and
 * prev tables are filled with random values for slide_hash variants. */
class slide_hash: public benchmark::Fixture {
private:
    uint16_t *l0;        /* hash head table */
    uint16_t *l1;        /* prev chain table */
    deflate_state *s_g;  /* state handed to the function under test */

public:
    void SetUp(const ::benchmark::State& state) {
        l0 = (uint16_t *)zng_alloc(HASH_SIZE * sizeof(uint16_t));
        for (uint32_t i = 0; i < HASH_SIZE; i++) {
            l0[i] = rand();
        }
        l1 = (uint16_t *)zng_alloc(MAX_RANDOM_INTS * sizeof(uint16_t));
        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
            l1[i] = rand();
        }
        /* Only head/prev (and w_size, set in Bench) are consumed here; the
         * rest of the state is left uninitialized on purpose. */
        deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state));
        s->head = l0;
        s->prev = l1;
        s_g = s;
    }

    /* state.range(0) supplies the window size to slide over. */
    void Bench(benchmark::State& state, slide_hash_func slide_hash) {
        s_g->w_size = (uint32_t)state.range(0);
        for (auto _ : state) {
            slide_hash(s_g);
            benchmark::DoNotOptimize(s_g);
        }
    }

    void TearDown(const ::benchmark::State& state) {
        /* Release the deflate_state too: it was previously malloc'd in
         * SetUp and leaked on every fixture teardown. */
        free(s_g);
        s_g = NULL;
        zng_free(l0);
        zng_free(l1);
    }
};
/* Registers a slide_hash variant over window sizes 1024..32768 (powers of
 * two); skipped with an error when the CPU lacks the needed feature. */
#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \
BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \
if (!support_flag) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, fptr); \
} \
BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(1024, MAX_RANDOM_INTS);
/* The portable C implementation is always available. */
BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_SLIDEHASH(native, native_slide_hash, 1);
#else
#ifdef ARM_SIMD
BENCHMARK_SLIDEHASH(armv6, slide_hash_armv6, test_cpu_features.arm.has_simd);
#endif
#ifdef ARM_NEON
BENCHMARK_SLIDEHASH(neon, slide_hash_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef POWER8_VSX
BENCHMARK_SLIDEHASH(power8, slide_hash_power8, test_cpu_features.power.has_arch_2_07);
#endif
#ifdef PPC_VMX
BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, test_cpu_features.power.has_altivec);
#endif
#ifdef RISCV_RVV
BENCHMARK_SLIDEHASH(rvv, slide_hash_rvv, test_cpu_features.riscv.has_rvv);
#endif
#ifdef X86_SSE2
BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
#endif
#ifdef X86_AVX2
BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
#endif
#endif

View File

@@ -0,0 +1,94 @@
/* benchmark_uncompress.cc -- benchmark uncompress()
* Copyright (C) 2024 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <stdio.h>
#include <assert.h>
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil_p.h"
# if defined(ZLIB_COMPAT)
# include "zlib.h"
# else
# include "zlib-ng.h"
# endif
}
#define MAX_SIZE (1024 * 1024)
#define NUM_TESTS 6
/* Fixture that pre-compresses several input sizes once during SetUp, so the
 * timed loop measures PREFIX(uncompress)() only. */
class uncompress_bench: public benchmark::Fixture {
private:
size_t maxlen;                        /* capacity of inbuff/outbuff */
uint8_t *inbuff;
uint8_t *outbuff;
uint8_t *compressed_buff[NUM_TESTS];  /* one pre-compressed stream per size */
uLong compressed_sizes[NUM_TESTS];
/* Source sizes; must stay in sync with the Arg() list registered below. */
int64_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024};
public:
void SetUp(const ::benchmark::State& state) {
const char teststr[42] = "Hello hello World broken Test tast mello.";
maxlen = MAX_SIZE;
inbuff = (uint8_t *)zng_alloc(MAX_SIZE + 1);
assert(inbuff != NULL);
outbuff = (uint8_t *)zng_alloc(MAX_SIZE + 1);
assert(outbuff != NULL);
// Initialize input buffer
int pos = 0;
for (int32_t i = 0; i < MAX_SIZE - 42 ; i+=42){
pos += sprintf((char *)inbuff+pos, "%s", teststr);
}
// Compress data into different buffers
for (size_t i = 0; i < NUM_TESTS; ++i) {
compressed_buff[i] = (uint8_t *)zng_alloc(MAX_SIZE + 1);
assert(compressed_buff[i] != NULL);
uLong compressed_size = maxlen;
int err = PREFIX(compress)(compressed_buff[i], &compressed_size, inbuff, sizes[i]);
if (err != Z_OK) {
fprintf(stderr, "Compression failed with error %d\n", err);
abort();
}
compressed_sizes[i] = compressed_size;
}
}
void Bench(benchmark::State& state) {
int err = 0;
for (auto _ : state) {
/* Map the requested source size to its pre-compressed stream. */
int index = 0;
while (sizes[index] != state.range(0)) ++index;
/* uncompress() rewrites out_size, so reset capacity each pass. */
uLong out_size = maxlen;
err = PREFIX(uncompress)(outbuff, &out_size, compressed_buff[index], compressed_sizes[index]);
}
benchmark::DoNotOptimize(err);
}
void TearDown(const ::benchmark::State& state) {
zng_free(inbuff);
zng_free(outbuff);
for (size_t i = 0; i < NUM_TESTS; ++i) {
zng_free(compressed_buff[i]);
}
}
};
/* Defines and registers the uncompress() benchmark; the Arg() list must
 * match the sizes[] array inside the fixture. */
#define BENCHMARK_UNCOMPRESS(name) \
BENCHMARK_DEFINE_F(uncompress_bench, name)(benchmark::State& state) { \
Bench(state); \
} \
BENCHMARK_REGISTER_F(uncompress_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10);
BENCHMARK_UNCOMPRESS(uncompress_bench);