From 34897eeb406af69868ad92fbb490a5c260a68bf0 Mon Sep 17 00:00:00 2001
From: tqcq <99722391+tqcq@users.noreply.github.com>
Date: Sun, 31 Mar 2024 00:51:44 +0800
Subject: [PATCH] feat update picobench

---
 CMakeLists.txt                       |   22 +-
 README.md                            |   22 +
 src/sled/random_bench.cc             |   83 +-
 src/sled/strings/base64_bench.cc     |   29 +-
 src/sled/system/fiber/fiber_bench.cc |    1 -
 src/sled/system/thread_bench.cc      |   24 +
 src/sled/system/thread_pool_bench.cc |   18 +-
 src/sled/system_time_bench.cc        |    6 +-
 src/sled/testing/benchmark.cc        |    3 +
 src/sled/testing/benchmark.h         |    6 +
 src/sled/testing/benchmark_main.cc   |   11 +
 src/sled/testing/picobench.h         | 1393 ++++++++++++++++++++++++++
 12 files changed, 1535 insertions(+), 83 deletions(-)
 create mode 100644 src/sled/system/thread_bench.cc
 create mode 100644 src/sled/testing/benchmark.cc
 create mode 100644 src/sled/testing/benchmark.h
 create mode 100644 src/sled/testing/benchmark_main.cc
 create mode 100644 src/sled/testing/picobench.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 259b802..773055c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,6 +18,10 @@ set(BUILD_RTTR_DYNAMIC OFF)
 set(BUILD_UNIT_TESTS OFF)
 set(BUILD_EXAMPLES OFF)
 add_library(test_main STATIC src/sled/testing/test_main.cc)
+add_library(benchmark_main STATIC src/sled/testing/benchmark_main.cc)
+target_include_directories(test_main PUBLIC src/)
+target_include_directories(benchmark_main PUBLIC src/)
+
 add_library(sled STATIC "")
 
 add_subdirectory(3party/gperftools EXCLUDE_FROM_ALL)
@@ -43,7 +47,6 @@ endif()
 # add_subdirectory(3party/eigen EXCLUDE_FROM_ALL)
 target_include_directories(sled PUBLIC src/ 3party/eigen 3party/inja 3party/rxcpp)
-target_include_directories(test_main PUBLIC src/)
 target_sources(
   sled
   PRIVATE src/sled/async/async.cc
@@ -71,6 +74,7 @@
           src/sled/task_queue/pending_task_safety_flag.cc
           src/sled/task_queue/task_queue_base.cc
           src/sled/testing/test.cc
+          src/sled/testing/benchmark.cc
           src/sled/timer/task_queue_timeout.cc
           src/sled/timer/timer.cc
           src/sled/units/time_delta.cc
@@ -104,17 +108,19 @@ target_link_libraries(
 set_target_properties(sled PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 if(SLED_BUILD_BENCHMARK)
-  if(NOT TARGET benchmark)
-    find_package(benchmark REQUIRED)
-  endif()
+  # if(NOT TARGET benchmark) find_package(benchmark REQUIRED) endif()
   add_executable(
     sled_benchmark
-    src/sled/random_bench.cc src/sled/strings/base64_bench.cc
-    src/sled/system/fiber/fiber_bench.cc src/sled/system/thread_pool_bench.cc
+    src/sled/random_bench.cc
+    src/sled/strings/base64_bench.cc
+    # src/sled/system/fiber/fiber_bench.cc
+    src/sled/system/thread_bench.cc
+    src/sled/system/thread_pool_bench.cc
     src/sled/system_time_bench.cc)
-  target_link_libraries(sled_benchmark PRIVATE sled benchmark::benchmark
-                                               benchmark::benchmark_main)
+  target_link_libraries(sled_benchmark PRIVATE sled benchmark_main)
+  target_compile_options(sled_benchmark PRIVATE -include
+                                                sled/testing/benchmark.h)
 endif(SLED_BUILD_BENCHMARK)
 
 function(sled_add_test)
diff --git a/README.md b/README.md
index 8569112..e9d014d 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,25 @@
 - [ ] add log module
 - [ ] add http module(cpp-httplib)
 - [ ] add ORM module
+
+
+## Benchmark
+ Name (* = baseline)                      |  ns/op |  Baseline |  Ops/second
+------------------------------------------|-------:|----------:|------------:
+ Random bool *                            |      9 |         - | 109123586.4
+ Random int8_t                            |      7 |     0.778 | 134375880.8
+ Random int32_t                           |      8 |     0.889 | 120300189.7
+ Random uint32_t                          |      7 |     0.778 | 131234452.1
+ Random uint32_t range                    |      8 |     0.889 | 123079276.7
+ Gaussian(0, 1)                           |     17 |     1.889 |  58742994.8
+ Exponential(1)                           |     12 |     1.333 |  81447219.4
+ Random float                             |      7 |     0.778 | 136571495.3
+ Random double                            |      7 |     0.778 | 131796121.5
+ Base64Encode                             | 106299 | 11811.000 |      9407.4
+ Base64Decode                             | 104897 | 11655.222 |      9533.1
+ ThreadBlockingCallByDefaultSocketServer  |   6624 |   736.000 |    150950.1
+ ThreadBlockingCallByNullSocketServer     |   5309 |   589.889 |    188358.8
+ ThreadPoolBench                          |   3096 |   344.000 |    322941.4
+ SystemTimeNanos                          |     24 |     2.667 |  40659163.6
+
+
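The table above is picobench's concise report format. Every benchmark converted below follows the same shape: a `void(picobench::state &)` function (or capture-less lambda), a range-based for over the state marking the timed region, and registration through the `PICOBENCH` macro. A minimal, self-contained sketch of that pattern (the `string_append` benchmark is hypothetical and not part of this patch; in the sled tree, `PICOBENCH_IMPLEMENT_WITH_MAIN` is already provided by benchmark_main.cc):

```cpp
// Standalone sketch of the picobench pattern used throughout this patch.
#define PICOBENCH_IMPLEMENT_WITH_MAIN // exactly one TU may define this;
                                      // in sled it is benchmark_main.cc
#include "sled/testing/benchmark.h"   // thin wrapper over the vendored picobench.h

#include <string>

static void string_append(picobench::state &s) // hypothetical example benchmark
{
    for (auto _ : s) {                // only this loop is timed
        std::string str;
        for (int i = 0; i < 64; ++i) { str += 'x'; }
        s.set_result(str.size());     // value sink so the work is not optimized away
    }
}
// label() renames the report row; samples()/iterations() override the defaults
PICOBENCH(string_append).label("std::string append").samples(2).iterations({8, 64, 512});
```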
diff --git a/src/sled/random_bench.cc b/src/sled/random_bench.cc
index 6defd91..91e3c98 100644
--- a/src/sled/random_bench.cc
+++ b/src/sled/random_bench.cc
@@ -1,56 +1,47 @@
-#include <benchmark/benchmark.h>
 #include <sled/random.h>
 
-class RandomFixture : public benchmark::Fixture {
-    void SetUp(::benchmark::State &state) { rand_ = new sled::Random(1314); }
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { bool b = rand.Rand<bool>(); }
+}).label("Random bool");
 
-    void TearDown(::benchmark::State &state) { delete rand_; }
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { int32_t i = rand.Rand<int8_t>(); }
+}).label("Random int8_t");
 
-protected:
-    sled::Random *rand_;
-};
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { int32_t i = rand.Rand(-1000, 1000); }
+}).label("Random int32_t");
 
-BENCHMARK_F(RandomFixture, bool)(benchmark::State &state)
-{
-    for (auto _ : state) { bool b = rand_->Rand<bool>(); }
-}
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { uint32_t i = rand.Rand<uint32_t>(); }
+}).label("Random uint32_t");
 
-BENCHMARK_F(RandomFixture, int32_t)(benchmark::State &state)
-{
-    for (auto _ : state) { int32_t i = rand_->Rand<int32_t>(); }
-}
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { uint32_t i = rand.Rand(0u, 1000u); }
+}).label("Random uint32_t range");
 
-BENCHMARK_F(RandomFixture, int32_t_range)(benchmark::State &state)
-{
-    for (auto _ : state) { int32_t i = rand_->Rand(-1000, 1000); }
-}
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { double d = rand.Gaussian(0, 1); }
+}).label("Gaussian(0, 1)");
 
-BENCHMARK_F(RandomFixture, uint32_t)(benchmark::State &state)
-{
-    for (auto _ : state) { uint32_t i = rand_->Rand<uint32_t>(); }
-}
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { double d = rand.Exponential(1); }
+}).label("Exponential(1)");
 
-BENCHMARK_F(RandomFixture, uint32_t_range)(benchmark::State &state)
-{
-    for (auto _ : state) { uint32_t i = rand_->Rand(0u, 1000u); }
-}
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { float f = rand.Rand<float>(); }
+}).label("Random float");
 
-BENCHMARK_F(RandomFixture, Gaussian)(benchmark::State &state)
-{
-    for (auto _ : state) { double d = rand_->Gaussian(0, 1); }
-}
-
-BENCHMARK_F(RandomFixture, Exponential)(benchmark::State &state)
-{
-    for (auto _ : state) { double d = rand_->Exponential(1); }
-}
-
-BENCHMARK_F(RandomFixture, float)(benchmark::State &state)
-{
-    for (auto _ : state) { float f = rand_->Rand<float>(); }
-}
-
-BENCHMARK_F(RandomFixture, double)(benchmark::State &state)
-{
-    for (auto _ : state) { double d = rand_->Rand<double>(); }
-}
+PICOBENCH([](picobench::state &s) {
+    sled::Random rand(s.user_data());
+    for (auto _ : s) { double d = rand.Rand<double>(); }
+}).label("Random double");
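The base64 conversion below swaps Google Benchmark's `PauseTiming()`/`ResumeTiming()` for picobench's `pause_timer()`/`resume_timer()`, which exclude per-iteration setup from the measurement. A sketch of that pattern (the `make_input` helper and `count_matches` benchmark are hypothetical; picobench declarations are assumed visible via the `-include sled/testing/benchmark.h` force-include that the CMakeLists.txt change above sets up):

```cpp
// Not part of the patch: illustrates excluding setup with the state's timer controls.
#include <string>

static std::string make_input(size_t n) { return std::string(n, 'a'); } // hypothetical setup

static void count_matches(picobench::state &s)
{
    for (auto _ : s) {
        s.pause_timer();                         // setup is not measured
        std::string input = make_input(10000);
        s.resume_timer();                        // timed region resumes here
        uintptr_t hits = 0;
        for (char c : input) { hits += (c == 'a'); }
        s.set_result(hits);                      // also lets --compare-results check samples
    }
}
PICOBENCH(count_matches);
```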
diff --git a/src/sled/strings/base64_bench.cc b/src/sled/strings/base64_bench.cc
index f9cf85f..2cb8066 100644
--- a/src/sled/strings/base64_bench.cc
+++ b/src/sled/strings/base64_bench.cc
@@ -1,14 +1,15 @@
-#include <benchmark/benchmark.h>
 #include <sled/random.h>
 #include <sled/strings/base64.h>
+#include <sled/testing/benchmark.h>
 #include <sstream>
 
 static std::string
 RandomString(size_t length)
 {
-    static const char chars[] = "0123456789"
-                                "abcdefghijklmnopqrstuvwxyz"
-                                "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    static const char chars[]
+        = "0123456789"
+          "abcdefghijklmnopqrstuvwxyz"
+          "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
     std::stringstream ss;
     sled::Random rand(1314);
     while (length--) { ss << chars[rand.Rand(sizeof(chars))]; }
@@ -16,27 +17,27 @@ RandomString(size_t length)
 }
 
 static void
-Base64Encode(benchmark::State &state)
+Base64Encode(picobench::state &state)
 {
     for (auto _ : state) {
-        state.PauseTiming();
-        std::string input = RandomString(state.range(0));
-        state.ResumeTiming();
+        state.pause_timer();
+        std::string input = RandomString(10000);
+        state.resume_timer();
         (void) sled::Base64::Encode(input);
     }
 }
 
 static void
-Base64Decode(benchmark::State &state)
+Base64Decode(picobench::state &state)
 {
     for (auto _ : state) {
-        state.PauseTiming();
-        std::string input = RandomString(state.range(0));
+        state.pause_timer();
+        std::string input = RandomString(10000);
         std::string base64_input = sled::Base64::Encode(input);
-        state.ResumeTiming();
+        state.resume_timer();
         (void) sled::Base64::Decode(base64_input);
     }
 }
 
-BENCHMARK(Base64Encode)->RangeMultiplier(100)->Range(10, 100000);
-BENCHMARK(Base64Decode)->RangeMultiplier(100)->Range(10, 100000);
+PICOBENCH(Base64Encode);
+PICOBENCH(Base64Decode);
diff --git a/src/sled/system/fiber/fiber_bench.cc b/src/sled/system/fiber/fiber_bench.cc
index 2908d26..d4e3791 100644
--- a/src/sled/system/fiber/fiber_bench.cc
+++ b/src/sled/system/fiber/fiber_bench.cc
@@ -1,4 +1,3 @@
-#include <benchmark/benchmark.h>
 #include <sled/system/fiber/scheduler.h>
 #include <sled/system/fiber/wait_group.h>
 
diff --git a/src/sled/system/thread_bench.cc b/src/sled/system/thread_bench.cc
new file mode 100644
index 0000000..70a931b
--- /dev/null
+++ b/src/sled/system/thread_bench.cc
@@ -0,0 +1,24 @@
+#include <sled/system/thread.h>
+
+void
+ThreadBlockingCallByDefaultSocketServer(picobench::state &s)
+{
+    auto thread = sled::Thread::CreateWithSocketServer();
+    thread->Start();
+    for (auto _ : s) {
+        (void) thread->BlockingCall([] { return 1; });
+    }
+}
+
+void
+ThreadBlockingCallByNullSocketServer(picobench::state &s)
+{
+    auto thread = sled::Thread::Create();
+    thread->Start();
+    for (auto _ : s) {
+        (void) thread->BlockingCall([] { return 1; });
+    }
+}
+
+PICOBENCH(ThreadBlockingCallByDefaultSocketServer);
+PICOBENCH(ThreadBlockingCallByNullSocketServer);
diff --git a/src/sled/system/thread_pool_bench.cc b/src/sled/system/thread_pool_bench.cc
index ab0d577..0bd6697 100644
--- a/src/sled/system/thread_pool_bench.cc
+++ b/src/sled/system/thread_pool_bench.cc
@@ -1,20 +1,16 @@
-#include "sled/system/fiber/wait_group.h"
-#include <benchmark/benchmark.h>
-#include <vector>
+#include <future>
 #include <sled/system/thread_pool.h>
 
 static void
-ThreadPoolBench(benchmark::State &state)
+ThreadPoolBench(picobench::state &state)
 {
     sled::ThreadPool pool(-1);
     for (auto _ : state) {
-        std::vector<std::future<int>> futures;
-        for (int i = 0; i < state.range(0); i++) {
-            std::future<int> f = pool.submit([]() { return 1; });
-            futures.push_back(std::move(f));
-        }
-        for (auto &f : futures) { f.get(); }
+        std::future<int> f = pool.submit([]() { return 1; });
+        (void) f.get();
     }
 }
 
-BENCHMARK(ThreadPoolBench)->RangeMultiplier(10)->Range(10, 10000);
+// BENCHMARK(ThreadPoolBench)->RangeMultiplier(10)->Range(10, 10000);
+PICOBENCH(ThreadPoolBench);
diff --git a/src/sled/system_time_bench.cc b/src/sled/system_time_bench.cc
index 9af5e7f..245dd91 100644
--- a/src/sled/system_time_bench.cc
+++ b/src/sled/system_time_bench.cc
@@ -1,10 +1,10 @@
-#include <benchmark/benchmark.h>
 #include <sled/system_time.h>
+#include <sled/testing/benchmark.h>
 
 static void
-SystemTimeNanos(benchmark::State &state)
+SystemTimeNanos(picobench::state &state)
 {
     for (auto _ : state) { (void) sled::SystemTimeNanos(); }
 }
 
-BENCHMARK(SystemTimeNanos);
+PICOBENCH(SystemTimeNanos);
diff --git a/src/sled/testing/benchmark.cc b/src/sled/testing/benchmark.cc
new file mode 100644
index 0000000..332f170
--- /dev/null
+++ b/src/sled/testing/benchmark.cc
@@ -0,0 +1,3 @@
+#define SLED_TESTING_BENCHMARK_H
+// #define DOCTEST_CONFIG_IMPLEMENT
+// #include "sled/testing/benchmark.h"
diff --git a/src/sled/testing/benchmark.h b/src/sled/testing/benchmark.h
new file mode 100644
index 0000000..ed09284
--- /dev/null
+++ b/src/sled/testing/benchmark.h
@@ -0,0 +1,6 @@
+#ifndef SLED_TESTING_BENCHAMRK_H
+#define SLED_TESTING_BENCHAMRK_H
+
+#include "sled/testing/picobench.h"
+
+#endif// SLED_TESTING_BENCHAMRK_H
diff --git a/src/sled/testing/benchmark_main.cc b/src/sled/testing/benchmark_main.cc
new file mode 100644
index 0000000..0ba8c37
--- /dev/null
+++ b/src/sled/testing/benchmark_main.cc
@@ -0,0 +1,11 @@
+#define SLED_TESTING_BENCHMARK_H
+#define PICOBENCH_IMPLEMENT_WITH_MAIN
+#include "sled/testing/benchmark.h"
+
+// int
+// main(int argc, char *argv[])
+// {
+//     PICOBENCH_NAMESPACE::runner r;
+//     r.parse_cmd_line(argc, argv);
+//     return r.run();
+// }
diff --git a/src/sled/testing/picobench.h b/src/sled/testing/picobench.h
new file mode 100644
index 0000000..17e933c
--- /dev/null
+++ b/src/sled/testing/picobench.h
@@ -0,0 +1,1393 @@
+// picobench v2.07
+// https://github.com/iboB/picobench
+//
+// A micro microbenchmarking library in a single header file
+//
+// SPDX-License-Identifier: MIT
+//
+// MIT License
+//
+// Copyright(c) 2017-2024 Borislav Stanimirov
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+//
+// VERSION HISTORY
+//
+// 2.07 (2024-03-06) * Text output is now markdown compatible
+//                   * Allow including picobench.hpp before defining
+//                     PICOBENCH_IMPLEMENT
+// 2.06 (2023-11-24) Internal.
This file was not affected +// 2.05 (2023-04-26) Fixed MinGW build +// 2.04 (2023-04-12) Added CLI args to run specific benchmarks or suites +// 2.03 (2023-03-21) * Added PICOBENCH_UNIQUE_SYM_SUFFIX +// * Fixed several warnings +// 2.02 (2023-02-16) * Fixed same-func warning if user data is different +// * Macro PICOBENCH_NAMESPACE to change namespace +// * Changed marking of baseline in human-readable reports +// * Minor internal changes in strings +// 2.01 (2019-03-03) * Fixed android build when binding to a signle core +// * Minor doc fixes +// 2.00 (2018-10-30) * Breaking change! runner::run_benchmarks doesn't return +// a report anymore. The report is generated by +// runner::generate_report instead +// * Breaking change! report_output_format doesn't accept +// output streams as arguments. Use set_output_streams. +// * Potentially breaking change (gcc and clang)! Always set +// thread affinity to first core. Macro to turn this off. +// * Added runner::run which performs a full execution +// * Added benchmark results and results comparison +// * Added error enum +// * Macro option to allow a std::function as a benchmark +// * Macros for default iterations and samples +// * Allowing local registration of benchmarks in a runner +// * Added local_runner which doesn't consume registry +// * More force-inline functions in states +// * Fixed some potential compilation warnings +// * Removed tests from header +// * Anonymous namespace for impl-only classes and funcs +// * Added setters and getters for every config option +// 1.05 (2018-07-17) * Counting iterations of state +// * Optionally set thread affinity when running benchmarks +// so as not to miss cpu cycles with the high res clock +// 1.04 (2018-02-06) * User data for benchmarks, which can be seen from states +// * `add_custom_duration` to states so the user can modify time +// * Text table format fixes +// * Custom cmd opts in runner +// * --version CLI command +// 1.03 (2018-01-05) Added helper methods for easier browsing of reports +// 1.02 (2018-01-04) Added parsing of command line +// 1.01 (2018-01-03) * Only taking the fastest sample into account +// * Set default number of samples to 2 +// * Added CSV output +// 1.00 (2018-01-01) Initial release +// 0.01 (2017-12-28) Initial prototype release +// +// +// EXAMPLE +// +// void my_function(); // the function you want to benchmark +// +// // write your benchmarking code in a function like this +// static void benchmark_my_function(picobench::state& state) +// { +// // use the state in a range-based for loop to call your code +// for (auto _ : state) +// my_function(); +// } +// // create a picobench with your benchmarking code +// PICOBENCH(benchmark_my_function); +// +// +// BASIC DOCUMENTATION +// +// A very brief usage guide follows. For more detailed documentation see the +// README here: https://github.com/iboB/picobench/blob/master/README.md +// +// Simply include this file wherever you need. +// You need to define PICOBENCH_IMPLEMENT_WITH_MAIN (or PICOBENCH_IMPLEMENT if +// you want to write your own main function) in one compilation unit to have +// the implementation compiled there. +// +// The benchmark code must be a `void (picobench::state&)` function which +// you have written. Benchmarks are registered using the `PICOBENCH` macro +// where the only argument is the function's name. +// +// You can have multiple benchmarks in multiple files. All will be run when the +// executable starts. +// +// Typically a benchmark has a loop. 
To run the loop use the state argument in +// a range-based for loop in your function. The time spent looping is measured +// for the benchmark. You can have initialization/deinitialization code outside +// of the loop and it won't be measured. +// +#if !defined(PICOBENCH_HPP_INCLUDED) +#define PICOBENCH_HPP_INCLUDED +#ifndef SLED_TESTING_BENCHAMRK_H +#error "This file should only be included from sled/testing/benchmark.h" +#endif// !SLED_TESTING_BENCHAMRK_H + +#include +#include +#include + +#if defined(PICOBENCH_STD_FUNCTION_BENCHMARKS) +#include +#endif + +#define PICOBENCH_VERSION 2.05 +#define PICOBENCH_VERSION_STR "2.05" + +#if defined(PICOBENCH_DEBUG) +#include +#define I_PICOBENCH_ASSERT assert +#else +#define I_PICOBENCH_ASSERT(...) +#endif + +#if defined(__GNUC__) +#define PICOBENCH_INLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define PICOBENCH_INLINE __forceinline +#else +#define PICOBENCH_INLINE inline +#endif + +#if !defined(PICOBENCH_NAMESPACE) +#define PICOBENCH_NAMESPACE picobench +#endif + +namespace PICOBENCH_NAMESPACE { + +#if defined(_MSC_VER) || defined(__MINGW32__) || defined(PICOBENCH_TEST) +struct high_res_clock { + typedef long long rep; + typedef std::nano period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + static const bool is_steady = true; + + static time_point now(); +}; +#else +using high_res_clock = std::chrono::high_resolution_clock; +#endif + +using result_t = intptr_t; + +class state { +public: + explicit state(int num_iterations, uintptr_t user_data = 0) : _user_data(user_data), _iterations(num_iterations) + { + I_PICOBENCH_ASSERT(_iterations > 0); + } + + int iterations() const { return _iterations; } + + int64_t duration_ns() const { return _duration_ns; } + + void add_custom_duration(int64_t duration_ns) { _duration_ns += duration_ns; } + + uintptr_t user_data() const { return _user_data; } + + // optionally set result of benchmark + // this can be used as a value sync to prevent optimizations + // or a way to check whether benchmarks produce the same results + void set_result(uintptr_t data) { _result = data; } + + result_t result() const { return _result; } + + PICOBENCH_INLINE + void start_timer() + { + _start = high_res_clock::now(); + _duration_ns = 0; + _pause = false; + } + + PICOBENCH_INLINE + void resume_timer() + { + _start = high_res_clock::now(); + _pause = false; + } + + PICOBENCH_INLINE + void pause_timer() + { + auto duration = high_res_clock::now() - _start; + _duration_ns += std::chrono::duration_cast(duration).count(); + _pause = true; + } + + PICOBENCH_INLINE + void stop_timer() + { + auto duration = high_res_clock::now() - _start; + if (_pause) { duration = duration.zero(); } + _duration_ns += std::chrono::duration_cast(duration).count(); + } + + template + PICOBENCH_INLINE void timer_add(std::chrono::duration duration) + { + _duration_ns += duration; + } + + struct iterator { + PICOBENCH_INLINE + iterator(state *parent) : _counter(0), _lim(parent->iterations()), _state(parent) + { + I_PICOBENCH_ASSERT(_counter < _lim); + } + + PICOBENCH_INLINE + iterator() : _counter(0), _lim(0), _state(nullptr) {} + + PICOBENCH_INLINE + iterator &operator++() + { + I_PICOBENCH_ASSERT(_counter < _lim); + ++_counter; + return *this; + } + + PICOBENCH_INLINE + bool operator!=(const iterator &) const + { + if (_counter < _lim) return true; + _state->stop_timer(); + return false; + } + + PICOBENCH_INLINE + int operator*() const { return _counter; } + + private: + int _counter; + const 
int _lim; + state *_state; + }; + + PICOBENCH_INLINE + iterator begin() + { + start_timer(); + return iterator(this); + } + + PICOBENCH_INLINE + iterator end() { return iterator(); } + +private: + high_res_clock::time_point _start; + int64_t _duration_ns = 0; + uintptr_t _user_data; + int _iterations; + result_t _result = 0; + bool _pause; +}; + +// this can be used for manual measurement +class scope { +public: + PICOBENCH_INLINE + scope(state &s) : _state(s) { _state.start_timer(); } + + PICOBENCH_INLINE + ~scope() { _state.stop_timer(); } + +private: + state &_state; +}; + +#if defined(PICOBENCH_STD_FUNCTION_BENCHMARKS) +using benchmark_proc = std::function; +#else +using benchmark_proc = void (*)(state &); +#endif + +class benchmark { +public: + const char *name() const { return _name; } + + benchmark &iterations(std::vector data) + { + _state_iterations = std::move(data); + return *this; + } + + benchmark &samples(int n) + { + _samples = n; + return *this; + } + + benchmark &label(const char *label) + { + _name = label; + return *this; + } + + benchmark &baseline(bool b = true) + { + _baseline = b; + return *this; + } + + benchmark &user_data(uintptr_t data) + { + _user_data = data; + return *this; + } + +protected: + friend class runner; + + benchmark(const char *name, benchmark_proc proc); + + const char *_name; + const benchmark_proc _proc; + bool _baseline = false; + + uintptr_t _user_data = 0; + std::vector _state_iterations; + int _samples = 0; +}; + +// used for globally functions +// note that you can instantiate a runner and register local benchmarks for it alone +class global_registry { +public: + static int set_bench_suite(const char *name); + static benchmark &new_benchmark(const char *name, benchmark_proc proc); +}; + +}// namespace PICOBENCH_NAMESPACE + +// Optionally define PICOBENCH_UNIQUE_SYM_SUFFIX to replace __LINE__ with something +// non standard like __COUNTER__ in case you need multiple PICOBENCH macros in a +// macro of yours +#if !defined(PICOBENCH_UNIQUE_SYM_SUFFIX) +#define PICOBENCH_UNIQUE_SYM_SUFFIX __LINE__ +#endif + +#define I_PICOBENCH_PP_CAT(a, b) I_PICOBENCH_PP_INTERNAL_CAT(a, b) +#define I_PICOBENCH_PP_INTERNAL_CAT(a, b) a##b + +#define PICOBENCH_SUITE(name) \ + static int I_PICOBENCH_PP_CAT(picobench_suite, PICOBENCH_UNIQUE_SYM_SUFFIX) \ + = PICOBENCH_NAMESPACE::global_registry::set_bench_suite(name) + +#define PICOBENCH(func) \ + static auto &I_PICOBENCH_PP_CAT(picobench, PICOBENCH_UNIQUE_SYM_SUFFIX) \ + = PICOBENCH_NAMESPACE::global_registry::new_benchmark(#func, func) + +#if defined(PICOBENCH_IMPLEMENT_WITH_MAIN) +#define PICOBENCH_IMPLEMENT +#define PICOBENCH_IMPLEMENT_MAIN +#endif + +#endif// PICOBENCH_HPP_INCLUDED + +#if defined(PICOBENCH_IMPLEMENT) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +#else +#if !defined(PICOBENCH_DONT_BIND_TO_ONE_CORE) +#if defined(__APPLE__) +#include +#else +#include +#endif +#endif +#endif + +namespace PICOBENCH_NAMESPACE { + +// namespace +// { + +enum error_t { + no_error, + error_bad_cmd_line_argument, // ill-formed command-line argument + error_unknown_cmd_line_argument,// command argument looks like a picobench one, but isn't + error_sample_compare, // benchmark produced different results across samples + error_benchmark_compare, // two benchmarks of the same suite and dimension produced different results +}; + +class report { +public: + struct benchmark_problem_space { + int dimension; // number of 
iterations for the problem space + int samples; // number of samples taken + int64_t total_time_ns;// fastest sample!!! + result_t result; // result of fastest sample + }; + + struct benchmark { + const char *name; + bool is_baseline; + std::vector data; + }; + + struct suite { + const char *name; + std::vector benchmarks;// benchmark view + + const benchmark *find_benchmark(const char *bname) const + { + for (auto &b : benchmarks) { + if (strcmp(b.name, bname) == 0) return &b; + } + + return nullptr; + } + + const benchmark *find_baseline() const + { + for (auto &b : benchmarks) { + if (b.is_baseline) return &b; + } + + return nullptr; + } + }; + + std::vector suites; + error_t error = no_error; + + const suite *find_suite(const char *name) const + { + for (auto &s : suites) { + if (strcmp(s.name, name) == 0) return &s; + } + + return nullptr; + } + + void to_text(std::ostream &out) const + { + using namespace std; + for (auto &suite : suites) { + if (suite.name) { out << "## " << suite.name << ":\n"; } + + out.put('\n'); + out << " Name (* = baseline) | Dim | Total ms | ns/op |Baseline| Ops/second\n"; + out << "--------------------------|--------:|----------:|--------:|-------:|----------:\n"; + + auto problem_space_view = get_problem_space_view(suite); + for (auto &ps : problem_space_view) { + const problem_space_benchmark *baseline = nullptr; + for (auto &bm : ps.second) { + if (bm.is_baseline) { + baseline = &bm; + break; + } + } + + for (auto &bm : ps.second) { + out << ' ' << bm.name; + auto pad = 24 - int(strlen(bm.name)); + if (bm.is_baseline) { + out << " *"; + pad -= 2; + } + for (int i = 0; i < pad; ++i) { out.put(' '); } + + out << " |" << setw(8) << ps.first << " |" << setw(10) << fixed << setprecision(3) + << double(bm.total_time_ns) / 1000000.0 << " |"; + + auto ns_op = (bm.total_time_ns / ps.first); + if (ns_op > 99999999) { + int e = 0; + while (ns_op > 999999) { + ++e; + ns_op /= 10; + } + out << ns_op << 'e' << e; + } else { + out << setw(8) << ns_op; + } + + out << " |"; + + if (baseline == &bm) { + out << " - |"; + } else if (baseline) { + out << setw(7) << fixed << setprecision(3) + << double(bm.total_time_ns) / double(baseline->total_time_ns) << " |"; + } else { + // no baseline to compare to + out << " ??? 
|"; + } + + auto ops_per_sec = ps.first * (1000000000.0 / double(bm.total_time_ns)); + out << setw(11) << fixed << setprecision(1) << ops_per_sec << "\n"; + } + } + out.put('\n'); + } + } + + void to_text_concise(std::ostream &out) + { + using namespace std; + for (auto &suite : suites) { + if (suite.name) { out << "## " << suite.name << ":\n"; } + + out.put('\n'); + out << " Name (* = baseline) | ns/op | Baseline | Ops/second\n"; + out << "--------------------------|--------:|---------:|-----------:\n"; + + const benchmark *baseline = nullptr; + for (auto &bm : suite.benchmarks) { + if (bm.is_baseline) { + baseline = &bm; + break; + } + } + I_PICOBENCH_ASSERT(baseline); + int64_t baseline_total_time = 0; + int baseline_total_iterations = 0; + for (auto &d : baseline->data) { + baseline_total_time += d.total_time_ns; + baseline_total_iterations += d.dimension; + } + int64_t baseline_ns_per_op = baseline_total_time / baseline_total_iterations; + + for (auto &bm : suite.benchmarks) { + out << ' ' << bm.name; + auto pad = 24 - int(strlen(bm.name)); + if (bm.is_baseline) { + out << " *"; + pad -= 2; + } + for (int i = 0; i < pad; ++i) { out.put(' '); } + + int64_t total_time = 0; + int total_iterations = 0; + for (auto &d : bm.data) { + total_time += d.total_time_ns; + total_iterations += d.dimension; + } + int64_t ns_per_op = total_time / total_iterations; + + out << " |" << setw(8) << ns_per_op << " |"; + + if (&bm == baseline) { + out << " - |"; + } else { + out << setw(9) << fixed << setprecision(3) << double(ns_per_op) / double(baseline_ns_per_op) + << " |"; + } + + auto ops_per_sec = total_iterations * (1000000000.0 / double(total_time)); + out << setw(12) << fixed << setprecision(1) << ops_per_sec << "\n"; + } + + out.put('\n'); + } + } + + void to_csv(std::ostream &out, bool header = true) const + { + using namespace std; + + if (header) { out << "Suite,Benchmark,b,D,S,\"Total ns\",Result,\"ns/op\",Baseline\n"; } + + for (auto &suite : suites) { + const benchmark *baseline = nullptr; + for (auto &bm : suite.benchmarks) { + if (bm.is_baseline) { + baseline = &bm; + break; + } + } + I_PICOBENCH_ASSERT(baseline); + + for (auto &bm : suite.benchmarks) { + for (auto &d : bm.data) { + if (suite.name) { + out << '"' << suite.name << '"'; + ; + } + out << ",\"" << bm.name << "\","; + if (&bm == baseline) { out << '*'; } + out << ',' << d.dimension << ',' << d.samples << ',' << d.total_time_ns << ',' << d.result << ',' + << (d.total_time_ns / d.dimension) << ','; + + if (baseline) { + for (auto &bd : baseline->data) { + if (bd.dimension == d.dimension) { + out << fixed << setprecision(3) << (double(d.total_time_ns) / double(bd.total_time_ns)); + } + } + } + + out << '\n'; + } + } + } + } + + struct problem_space_benchmark { + const char *name; + bool is_baseline; + int64_t total_time_ns;// fastest sample!!! 
+ result_t result; // result of fastest sample + }; + + static std::map> get_problem_space_view(const suite &s) + { + std::map> res; + for (auto &bm : s.benchmarks) { + for (auto &d : bm.data) { + auto &pvbs = res[d.dimension]; + pvbs.push_back({bm.name, bm.is_baseline, d.total_time_ns, d.result}); + } + } + return res; + } + +private: +}; + +class benchmark_impl : public benchmark { +public: + benchmark_impl(const char *name, benchmark_proc proc) : benchmark(name, proc) {} + +private: + friend class runner; + + // state + std::vector _states;// length is _samples * _state_iterations.size() + std::vector::iterator _istate; +}; + +class picostring { +public: + picostring() = default; + + explicit picostring(const char *text) : str(text), len(int(strlen(text))) {} + + picostring(const char *text, int len) : str(text), len(len) {} + + const char *str; + int len = 0; + + // checks whether other begins with this string + bool is_start_of(const char *other) const { return strncmp(str, other, size_t(len)) == 0; } + + bool operator==(const picostring &other) const + { + if (len != other.len) return false; + return strncmp(str, other.str, size_t(len)) == 0; + } + + bool operator==(const char *other) const { return operator==(picostring(other)); } +}; + +class null_streambuf : public std::streambuf { +public: + virtual int overflow(int c) override { return c; } +}; + +struct null_stream : public std::ostream { + null_stream() : std::ostream(&_buf) {} + +private: + null_streambuf _buf; +} cnull; + +enum class report_output_format { + text, + concise_text, + csv, +}; + +#if !defined(PICOBENCH_DEFAULT_ITERATIONS) +#define PICOBENCH_DEFAULT_ITERATIONS \ + { \ + 8, 64, 512, 4096, 8192 \ + } +#endif + +#if !defined(PICOBENCH_DEFAULT_SAMPLES) +#define PICOBENCH_DEFAULT_SAMPLES 2 +#endif + +using benchmarks_vector = std::vector>; + +struct rsuite { + const char *name; + benchmarks_vector benchmarks; +}; + +class registry { +public: + benchmark &add_benchmark(const char *name, benchmark_proc proc) + { + auto b = new benchmark_impl(name, proc); + benchmarks_for_current_suite().emplace_back(b); + return *b; + } + + void set_suite(const char *name) { _current_suite_name = name; } + + const char *¤t_suite_name() { return _current_suite_name; } + + benchmarks_vector &benchmarks_for_current_suite() + { + for (auto &s : _suites) { + if (s.name == _current_suite_name) return s.benchmarks; + + if (s.name && _current_suite_name && strcmp(s.name, _current_suite_name) == 0) return s.benchmarks; + } + _suites.push_back({_current_suite_name, {}}); + return _suites.back().benchmarks; + } + +protected: + friend class runner; + const char *_current_suite_name = nullptr; + std::vector _suites; +}; + +registry & +g_registry() +{ + static registry r; + return r; +} + +class runner : public registry { +public: + runner(bool local = false) + : _default_state_iterations(PICOBENCH_DEFAULT_ITERATIONS), + _default_samples(PICOBENCH_DEFAULT_SAMPLES) + { + if (!local) { _suites = std::move(g_registry()._suites); } + } + + int run(int benchmark_random_seed = -1) + { + if (should_run()) { + run_benchmarks(benchmark_random_seed); + auto report = generate_report(); + std::ostream *out = _stdout; + std::ofstream fout; + if (preferred_output_filename()) { + fout.open(preferred_output_filename()); + if (!fout.is_open()) { + std::cerr << "Error: Could not open output file `" << preferred_output_filename() << "`\n"; + return 1; + } + out = &fout; + } + + switch (preferred_output_format()) { + case report_output_format::text: + 
report.to_text(*out); + break; + case report_output_format::concise_text: + report.to_text_concise(*out); + break; + case report_output_format::csv: + report.to_csv(*out); + break; + } + } + return error(); + } + + void run_benchmarks(int random_seed = -1) + { + I_PICOBENCH_ASSERT(_error == no_error && _should_run); + + if (random_seed == -1) { random_seed = int(std::random_device()()); } + + std::minstd_rand rnd(random_seed); + + // vector of all benchmarks + std::vector benchmarks; + for (auto &suite : _suites) { + // also identify a baseline in this loop + // if there is no explicit one, set the first one as a baseline + bool found_baseline = false; + for (auto irb = suite.benchmarks.begin(); irb != suite.benchmarks.end(); ++irb) { + auto &rb = *irb; + rb->_states.clear();// clear states so we can safely call run_benchmarks multiple times + benchmarks.push_back(rb.get()); + if (rb->_baseline) { found_baseline = true; } + +#if !defined(PICOBENCH_STD_FUNCTION_BENCHMARKS) + // check for same func + for (auto ib = irb + 1; ib != suite.benchmarks.end(); ++ib) { + auto &b = *ib; + if (rb->_proc == b->_proc && rb->_user_data == b->_user_data) { + *_stdwarn << "Warning: " << rb->name() << " and " << b->name() + << " are benchmarks of the same function.\n"; + } + } +#endif + } + + if (!found_baseline && !suite.benchmarks.empty()) { suite.benchmarks.front()->_baseline = true; } + } + + // initialize benchmarks + for (auto b : benchmarks) { + const std::vector &state_iterations + = b->_state_iterations.empty() ? _default_state_iterations : b->_state_iterations; + + if (b->_samples == 0) b->_samples = _default_samples; + + b->_states.reserve(state_iterations.size() * size_t(b->_samples)); + + // fill states while random shuffling them + for (auto iters : state_iterations) { + for (int i = 0; i < b->_samples; ++i) { + auto index = rnd() % (b->_states.size() + 1); + auto pos = b->_states.begin() + long(index); + b->_states.emplace(pos, iters, b->_user_data); + } + } + + b->_istate = b->_states.begin(); + } + +#if !defined(PICOBENCH_DONT_BIND_TO_ONE_CORE) + // set thread affinity to first cpu + // so the high resolution clock doesn't miss cycles + { +#if defined(_WIN32) + SetThreadAffinityMask(GetCurrentThread(), 1); +#elif defined(__APPLE__) + thread_affinity_policy_data_t policy = {0}; + thread_policy_set(pthread_mach_thread_np(pthread_self()), + THREAD_AFFINITY_POLICY, + (thread_policy_t) &policy, + 1); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); +#endif + } +#endif + + // we run a random benchmark from it incrementing _istate for each + // when _istate reaches _states.end(), we erase the benchmark + // when the vector becomes empty, we're done + while (!benchmarks.empty()) { + auto i = benchmarks.begin() + long(rnd() % benchmarks.size()); + auto &b = *i; + + b->_proc(*b->_istate); + + ++b->_istate; + + if (b->_istate == b->_states.end()) { benchmarks.erase(i); } + } + } + + // function to compare results + template> + report generate_report(CompareResult cmp = std::equal_to()) const + { + report rpt; + + rpt.suites.resize(_suites.size()); + auto rpt_suite = rpt.suites.begin(); + + for (auto &suite : _suites) { + rpt_suite->name = suite.name; + + // build benchmark view + rpt_suite->benchmarks.resize(suite.benchmarks.size()); + auto rpt_benchmark = rpt_suite->benchmarks.begin(); + + for (auto &b : suite.benchmarks) { + rpt_benchmark->name = b->_name; + rpt_benchmark->is_baseline = b->_baseline; + + const std::vector 
&state_iterations + = b->_state_iterations.empty() ? _default_state_iterations : b->_state_iterations; + + rpt_benchmark->data.reserve(state_iterations.size()); + for (auto d : state_iterations) { rpt_benchmark->data.push_back({d, 0, 0ll, result_t(0)}); } + + for (auto &state : b->_states) { + for (auto &d : rpt_benchmark->data) { + if (state.iterations() == d.dimension) { + if (d.total_time_ns == 0 || d.total_time_ns > state.duration_ns()) { + d.total_time_ns = state.duration_ns(); + d.result = state.result(); + } + + if (_compare_results_across_samples) { + if (d.result != state.result() && !cmp(d.result, state.result())) { + *_stderr << "Error: Two samples of " << b->name() << " @" << d.dimension + << " produced different results: " << d.result << " and " << state.result() + << '\n'; + _error = error_sample_compare; + } + } + + ++d.samples; + } + } + } + +#if defined(PICOBENCH_DEBUG) + for (auto &d : rpt_benchmark->data) { I_PICOBENCH_ASSERT(d.samples == b->_samples); } +#endif + + ++rpt_benchmark; + } + + ++rpt_suite; + } + + if (_compare_results_across_benchmarks) { + for (auto &suite : rpt.suites) { + auto psview = report::get_problem_space_view(suite); + + for (auto &space : psview) { + I_PICOBENCH_ASSERT(!space.second.empty()); + + if (space.second.size() == 1) { + auto &b = space.second.front(); + *_stdwarn << "Warning: Benchmark " << b.name << " @" << space.first + << " has a single instance and cannot be compared to others.\n"; + continue; + } + + auto result0 = space.second.front().result; + + for (auto &b : space.second) { + if (result0 != b.result && !cmp(result0, b.result)) { + auto &f = space.second.front(); + *_stderr << "Error: Benchmarks " << f.name << " and " << b.name << " @" << space.first + << " produce different results: " << result0 << " and " << b.result << '\n'; + _error = error_benchmark_compare; + } + } + } + } + } + + return rpt; + } + + void set_default_state_iterations(const std::vector &data) { _default_state_iterations = data; } + + const std::vector &default_state_iterations() const { return _default_state_iterations; } + + void set_default_samples(int n) { _default_samples = n; } + + int default_samples() const { return _default_samples; } + + void add_cmd_opt(const char *cmd, + const char *arg_desc, + const char *cmd_desc, + bool (*handler)(uintptr_t, const char *), + uintptr_t user_data = 0) + { + cmd_line_option opt; + opt.cmd = picostring(cmd); + opt.arg_desc = picostring(arg_desc); + opt.desc = cmd_desc; + opt.handler = nullptr; + opt.user_data = user_data; + opt.user_handler = handler; + _opts.push_back(opt); + } + + // returns false if there were errors parsing the command line + // all args starting with prefix are parsed + // the others are ignored + bool parse_cmd_line(int argc, const char *const argv[], const char *cmd_prefix = "-") + { + _cmd_prefix = picostring(cmd_prefix); + + if (!_has_opts) { + _opts.emplace_back("-iters=", + "", + "Sets default iterations for benchmarks", + &runner::cmd_iters); + _opts.emplace_back("-samples=", + "", + "Sets default number of samples for benchmarks", + &runner::cmd_samples); + _opts.emplace_back("-out-fmt=", "", "Outputs text or concise or csv", &runner::cmd_out_fmt); + _opts.emplace_back("-output=", "", "Sets output filename or `stdout`", &runner::cmd_output); + _opts.emplace_back("-compare-results", "", "Compare benchmark results", &runner::cmd_compare_results); + _opts.emplace_back("-no-run", "", "Doesn't run benchmarks", &runner::cmd_no_run); + _opts.emplace_back("-run-suite=", "", "Runs only 
benchmarks from suite", &runner::cmd_run_suite); + _opts.emplace_back("-run-only=", "", "Runs only selected benchmarks", &runner::cmd_run_only); + _opts.emplace_back("-list", "", "Lists available benchmarks", &runner::cmd_list); + _opts.emplace_back("-version", "", "Show version info", &runner::cmd_version); + _opts.emplace_back("-help", "", "Prints help", &runner::cmd_help); + _has_opts = true; + } + + for (int i = 1; i < argc; ++i) { + if (!_cmd_prefix.is_start_of(argv[i])) continue; + + auto arg = argv[i] + _cmd_prefix.len; + + bool found = false; + for (auto &opt : _opts) { + if (opt.cmd.is_start_of(arg)) { + found = true; + bool success = false; + if (opt.handler) { + success = (this->*opt.handler)(arg + opt.cmd.len); + } else { + I_PICOBENCH_ASSERT(opt.user_handler); + success = opt.user_handler(opt.user_data, arg + opt.cmd.len); + } + + if (!success) { + *_stderr << "Error: Bad command-line argument: " << argv[i] << "\n"; + _error = error_bad_cmd_line_argument; + return false; + } + break; + } + } + + if (!found) { + *_stderr << "Error: Unknown command-line argument: " << argv[i] << "\n"; + _error = error_unknown_cmd_line_argument; + return false; + } + } + + return true; + } + + void set_should_run(bool set) { _should_run = set; } + + bool should_run() const { return _error == no_error && _should_run; } + + void set_error(error_t e) { _error = e; } + + error_t error() const { return _error; } + + void set_output_streams(std::ostream &out, std::ostream &err) + { + _stdout = &out; + _stderr = &err; + _stdwarn = &out; + } + + void set_preferred_output_format(report_output_format fmt) { _output_format = fmt; } + + report_output_format preferred_output_format() const { return _output_format; } + + // can be nullptr (run will interpret it as stdout) + void set_preferred_output_filename(const char *path) { _output_file = path; } + + const char *preferred_output_filename() const { return _output_file; } + + void set_compare_results_across_samples(bool b) { _compare_results_across_samples = b; } + + bool compare_results_across_samples() const { return _compare_results_across_samples; } + + void set_compare_results_across_benchmarks(bool b) { _compare_results_across_benchmarks = b; } + + bool compare_results_across_benchmarks() const { return _compare_results_across_benchmarks; } + +private: + // runner's suites and benchmarks come from its parent: registry + + // state and configuration + mutable error_t _error = no_error; + bool _should_run = true; + + bool _compare_results_across_samples = false; + bool _compare_results_across_benchmarks = false; + + report_output_format _output_format = report_output_format::text; + const char *_output_file = nullptr;// nullptr means stdout + + std::ostream *_stdout = &std::cout; + std::ostream *_stderr = &std::cerr; + std::ostream *_stdwarn = &std::cout; + + // default data + + // default iterations per state per benchmark + std::vector _default_state_iterations; + + // default samples per benchmark + int _default_samples; + + // command line parsing + picostring _cmd_prefix; + typedef bool (runner::*cmd_handler)(const char *); // internal handler + typedef bool (*ext_handler)(uintptr_t user_data, const char *cmd_line);// external (user) handler + + struct cmd_line_option { + cmd_line_option() = default; + + cmd_line_option(const char *c, const char *a, const char *d, cmd_handler h) + : cmd(c), + arg_desc(a), + desc(d), + handler(h), + user_data(0), + user_handler(nullptr) + {} + + picostring cmd; + picostring arg_desc; + const char *desc; + cmd_handler 
handler;// may be nullptr for external handlers + uintptr_t user_data;// passed as an argument to user handlers + ext_handler user_handler; + }; + + bool _has_opts = false;// have opts been added to list + std::vector _opts; + + bool cmd_iters(const char *line) + { + std::vector iters; + auto p = line; + while (true) { + auto i = int(strtoul(p, nullptr, 10)); + if (i <= 0) return false; + iters.push_back(i); + p = strchr(p + 1, ','); + if (!p) break; + ++p; + } + if (iters.empty()) return false; + _default_state_iterations = iters; + return true; + } + + bool cmd_samples(const char *line) + { + int samples = int(strtol(line, nullptr, 10)); + if (samples <= 0) return false; + _default_samples = samples; + return true; + } + + bool cmd_no_run(const char *line) + { + if (*line) return false; + _should_run = false; + return true; + } + + bool cmd_run_suite(const char *line) + { + auto new_end = std::remove_if(_suites.begin(), _suites.end(), [line](const rsuite &s) { + return !s.name || strcmp(s.name, line) != 0; + }); + _suites.erase(new_end, _suites.end()); + return true; + } + + bool cmd_run_only(const char *line) + { + std::vector names; + + auto p = line; + while (true) { + const char *q = strchr(p, ','); + if (!q) q = p + strlen(p); + names.emplace_back(p, int(q - p)); + if (!*q) break; + p = q + 1; + } + + for (auto &s : _suites) { + auto new_end = std::remove_if(s.benchmarks.begin(), + s.benchmarks.end(), + [&names](const std::unique_ptr &b) { + auto f = std::find(names.begin(), names.end(), b->name()); + return f == names.end(); + }); + s.benchmarks.erase(new_end, s.benchmarks.end()); + } + return true; + } + + bool cmd_list(const char *line) + { + if (*line) return false; + _should_run = false; + for (auto &suite : _suites) { + if (suite.name) { + *_stdout << " " << suite.name << ":\n"; + } else { + *_stdout << " :\n"; + } + for (auto &bench : suite.benchmarks) { *_stdout << " " << bench->name() << "\n"; } + } + _should_run = false; + return true; + } + + bool cmd_version(const char *line) + { + if (*line) return false; + *_stdout << "picobench " PICOBENCH_VERSION_STR << "\n"; + _should_run = false; + return true; + } + + bool cmd_help(const char *line) + { + if (*line) return false; + cmd_version(line); + auto &cout = *_stdout; + for (auto &opt : _opts) { + cout << ' ' << _cmd_prefix.str << opt.cmd.str << opt.arg_desc.str; + int w = 27 - (_cmd_prefix.len + opt.cmd.len + opt.arg_desc.len); + for (int i = 0; i < w; ++i) { cout.put(' '); } + cout << opt.desc << "\n"; + } + _should_run = false; + return true; + } + + bool cmd_out_fmt(const char *line) + { + if (strcmp(line, "txt") == 0) { + _output_format = report_output_format::text; + } else if (strcmp(line, "con") == 0) { + _output_format = report_output_format::concise_text; + } else if (strcmp(line, "csv") == 0) { + _output_format = report_output_format::csv; + } else { + return false; + } + return true; + } + + bool cmd_output(const char *line) + { + if (strcmp(line, "stdout") != 0) { + _output_file = line; + } else { + _output_file = nullptr; + } + return true; + } + + bool cmd_compare_results(const char *line) + { + if (*line) return false; + _compare_results_across_samples = true; + _compare_results_across_benchmarks = true; + return true; + } +}; + +class local_runner : public runner { +public: + local_runner() : runner(true) {} +}; + +// } // anonymous namespace + +benchmark::benchmark(const char *name, benchmark_proc proc) : _name(name), _proc(proc) {} + +benchmark & +global_registry::new_benchmark(const char *name, 
benchmark_proc proc)
+{
+    return g_registry().add_benchmark(name, proc);
+}
+
+int
+global_registry::set_bench_suite(const char *name)
+{
+    g_registry().current_suite_name() = name;
+    return 0;
+}
+
+#if (defined(_MSC_VER) || defined(__MINGW32__)) && !defined(PICOBENCH_TEST)
+
+static const long long high_res_clock_freq = []() -> long long {
+    LARGE_INTEGER frequency;
+    QueryPerformanceFrequency(&frequency);
+    return frequency.QuadPart;
+}();
+
+high_res_clock::time_point
+high_res_clock::now()
+{
+    LARGE_INTEGER t;
+    QueryPerformanceCounter(&t);
+    return time_point(duration((t.QuadPart * rep(period::den)) / high_res_clock_freq));
+}
+#endif
+}// namespace PICOBENCH_NAMESPACE
+
+#endif
+
+#if defined(PICOBENCH_IMPLEMENT_MAIN)
+int
+main(int argc, char *argv[])
+{
+    PICOBENCH_NAMESPACE::runner r;
+    r.parse_cmd_line(argc, argv);
+    return r.run();
+}
+#endif
+
+#if defined(PICOBENCH_TEST)
+
+// fake time keeping functions for the tests
+namespace PICOBENCH_NAMESPACE {
+namespace test {
+
+void this_thread_sleep_for_ns(uint64_t ns);
+
+template <class Rep, class Period>
+void
+this_thread_sleep_for(const std::chrono::duration<Rep, Period> &duration)
+{
+    this_thread_sleep_for_ns(std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count());
+}
+
+#if defined(PICOBENCH_IMPLEMENT)
+static struct fake_time {
+    uint64_t now;
+} the_time;
+
+void
+this_thread_sleep_for_ns(uint64_t ns)
+{
+    the_time.now += ns;
+}
+
+}// namespace test
+
+high_res_clock::time_point
+high_res_clock::now()
+{
+    auto ret = time_point(duration(test::the_time.now));
+    return ret;
+}
+#else
+}// namespace test
+#endif
+}// namespace PICOBENCH_NAMESPACE
+#endif
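For reference, benchmark_main.cc above keeps a manual `main()` commented out in favor of `PICOBENCH_IMPLEMENT_WITH_MAIN`. A sketch of what driving the runner by hand would look like, using only the `runner` API declared in the vendored header (the defaults shown mirror `PICOBENCH_DEFAULT_ITERATIONS` and `PICOBENCH_DEFAULT_SAMPLES`; this would replace benchmark_main.cc, not sit alongside it):

```cpp
#define PICOBENCH_IMPLEMENT           // implementation without picobench's own main()
#include "sled/testing/benchmark.h"

int main(int argc, char *argv[])
{
    picobench::runner r;                              // takes over globally registered benchmarks
    r.set_default_samples(2);                         // same as PICOBENCH_DEFAULT_SAMPLES
    r.set_default_state_iterations({8, 64, 512, 4096, 8192});
    if (!r.parse_cmd_line(argc, argv)) {              // handles --iters=, --samples=, --out-fmt=,
        return r.error();                             // --run-suite=, --run-only=, --list, ...
    }
    return r.run();                                   // runs, prints the report, returns error()
}
```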