init repo.

tqcq
2024-12-19 13:14:37 +08:00
commit 7d7845acb5
1412 changed files with 596214 additions and 0 deletions


@@ -0,0 +1,507 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_ATOMIC_H
#define MIMALLOC_ATOMIC_H
// include windows.h or pthreads.h
#if defined(_WIN32)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__))
#define MI_USE_PTHREADS
#include <pthread.h>
#endif
// --------------------------------------------------------------------------------------------
// Atomics
// We need to be portable between C, C++, and MSVC.
// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode.
// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
// To gain better insight into the range of used atomics, we use explicitly named memory order operations
// instead of passing the memory order as a parameter.
// -----------------------------------------------------------------------------------------------
#if defined(__cplusplus)
// Use C++ atomics
#include <atomic>
#define _Atomic(tp) std::atomic<tp>
#define mi_atomic(name) std::atomic_##name
#define mi_memory_order(name) std::memory_order_##name
#if (__cplusplus >= 202002L) // c++20, see issue #571
#define MI_ATOMIC_VAR_INIT(x) x
#elif !defined(ATOMIC_VAR_INIT)
#define MI_ATOMIC_VAR_INIT(x) x
#else
#define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
#endif
#elif defined(_MSC_VER)
// Use MSVC C wrapper for C11 atomics
#define _Atomic(tp) tp
#define MI_ATOMIC_VAR_INIT(x) x
#define mi_atomic(name) mi_atomic_##name
#define mi_memory_order(name) mi_memory_order_##name
#else
// Use C11 atomics
#include <stdatomic.h>
#define mi_atomic(name) atomic_##name
#define mi_memory_order(name) memory_order_##name
#if (__STDC_VERSION__ >= 201710L) // c17, see issue #735
#define MI_ATOMIC_VAR_INIT(x) x
#elif !defined(ATOMIC_VAR_INIT)
#define MI_ATOMIC_VAR_INIT(x) x
#else
#define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
#endif
#endif
// Various defines for all used memory orders in mimalloc
#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail) \
mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)
#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail) \
mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)
#define mi_atomic_load_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
#define mi_atomic_decrement_relaxed(p) mi_atomic_sub_relaxed(p,(uintptr_t)1)
#define mi_atomic_increment_acq_rel(p) mi_atomic_add_acq_rel(p,(uintptr_t)1)
#define mi_atomic_decrement_acq_rel(p) mi_atomic_sub_acq_rel(p,(uintptr_t)1)
static inline void mi_atomic_yield(void);
static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
#if defined(__cplusplus) || !defined(_MSC_VER)
// In C++/C11 atomics we have polymorphic atomics so we can use the typed `ptr` variants (where `tp` is the type of the atomic value)
// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
#define mi_atomic_load_ptr_acquire(tp,p) mi_atomic_load_acquire(p)
#define mi_atomic_load_ptr_relaxed(tp,p) mi_atomic_load_relaxed(p)
// In C++ we need to add casts to help resolve templates if NULL is passed
#if defined(__cplusplus)
#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,(tp*)x)
#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,(tp*)x)
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des)
#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x)
#else
#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,x)
#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,x)
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des)
#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x)
#endif
// These are used by the statistics
static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
}
static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
}
// Used by timers
#define mi_atomic_loadi64_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_loadi64_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_casi64_strong_acq_rel(p,e,d) mi_atomic_cas_strong_acq_rel(p,e,d)
#define mi_atomic_addi64_acq_rel(p,i) mi_atomic_add_acq_rel(p,i)
#elif defined(_MSC_VER)
// Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics.
#include <intrin.h>
#ifdef _WIN64
typedef LONG64 msc_intptr_t;
#define MI_64(f) f##64
#else
typedef LONG msc_intptr_t;
#define MI_64(f) f
#endif
typedef enum mi_memory_order_e {
mi_memory_order_relaxed,
mi_memory_order_consume,
mi_memory_order_acquire,
mi_memory_order_release,
mi_memory_order_acq_rel,
mi_memory_order_seq_cst
} mi_memory_order;
static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
}
static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
}
static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
(void)(mo1); (void)(mo2);
uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
if (read == *expected) {
return true;
}
else {
*expected = read;
return false;
}
}
static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
}
static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
}
static inline void mi_atomic_thread_fence(mi_memory_order mo) {
(void)(mo);
_Atomic(uintptr_t) x = 0;
mi_atomic_exchange_explicit(&x, 1, mo);
}
static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
(void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
return *p;
#else
uintptr_t x = *p;
if (mo > mi_memory_order_relaxed) {
while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
}
return x;
#endif
}
static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
(void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
*p = x;
#else
mi_atomic_exchange_explicit(p, x, mo);
#endif
}
static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
(void)(mo);
#if defined(_M_X64)
return *p;
#else
int64_t old = *p;
int64_t x = old;
while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
x = old;
}
return x;
#endif
}
static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
(void)(mo);
#if defined(_M_X64)  // a plain 64-bit store is not atomic on 32-bit x86; fall through to the interlocked exchange there
*p = x;
#else
InterlockedExchange64(p, x);
#endif
}
// These are used by the statistics
static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) {
#ifdef _WIN64
return (int64_t)mi_atomic_addi((int64_t*)p, add);
#else
int64_t current;
int64_t sum;
do {
current = *p;
sum = current + add;
} while (_InterlockedCompareExchange64(p, sum, current) != current);
return current;
#endif
}
static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
int64_t current;
do {
current = *p;
} while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
}
static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t)*p, int64_t i) {
mi_atomic_addi64_relaxed(p, i);
}
static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t)*p, int64_t* exp, int64_t des) {
int64_t read = _InterlockedCompareExchange64(p, des, *exp);
if (read == *exp) {
return true;
}
else {
*exp = read;
return false;
}
}
// The pointer macros cast to `uintptr_t`.
#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
#define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_loadi64_acquire(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_loadi64_relaxed(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed))
#endif
// Atomically add a signed value; returns the previous value.
static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) {
return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add);
}
// Atomically subtract a signed value; returns the previous value.
static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
return (intptr_t)mi_atomic_addi(p, -sub);
}
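// Illustrative sketch (not part of the original header): a typical use of the explicit
// memory-order macros above -- a monotonic maximum updated with a weak-CAS retry loop.
// The name `mi_example_update_max` is hypothetical and exists only for this example.
static inline void mi_example_update_max(_Atomic(uintptr_t)* max, uintptr_t value) {
  uintptr_t current = mi_atomic_load_relaxed(max);
  while (current < value && !mi_atomic_cas_weak_acq_rel(max, &current, value)) {
    // on failure `current` is refreshed with the observed value; retry until it is >= `value`
  }
}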
// ----------------------------------------------------------------------
// Once and Guard
// ----------------------------------------------------------------------
typedef _Atomic(uintptr_t) mi_atomic_once_t;
// Returns true only on the first invocation
static inline bool mi_atomic_once( mi_atomic_once_t* once ) {
if (mi_atomic_load_relaxed(once) != 0) return false; // quick test
uintptr_t expected = 0;
return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1
}
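// Illustrative sketch (not part of the original header): the `once` variable is typically a
// zero-initialized static; only the first caller across all threads observes `true`.
// `mi_example_do_once` is a hypothetical name used only for this example.
static inline void mi_example_do_once(mi_atomic_once_t* once) {
  if (mi_atomic_once(once)) {
    // exactly one thread runs the one-time initialization here
  }
}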
typedef _Atomic(uintptr_t) mi_atomic_guard_t;
// Allows only one thread at a time to execute the guarded block; if the guard is already held, the block is skipped
#define mi_atomic_guard(guard) \
uintptr_t _mi_guard_expected = 0; \
for(bool _mi_guard_once = true; \
_mi_guard_once && mi_atomic_cas_strong_acq_rel(guard,&_mi_guard_expected,(uintptr_t)1); \
(mi_atomic_store_release(guard,(uintptr_t)0), _mi_guard_once = false) )
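// Illustrative sketch (not part of the original header): `mi_atomic_guard` expands to a
// for-loop header, so it is followed by a block; concurrent callers skip the block rather
// than wait for it. `mi_example_guarded_purge` is a hypothetical name.
static inline void mi_example_guarded_purge(mi_atomic_guard_t* guard) {
  mi_atomic_guard(guard) {
    // at most one thread executes this block at a time; other callers skip it entirely
  }
}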
// ----------------------------------------------------------------------
// Yield
// ----------------------------------------------------------------------
#if defined(__cplusplus)
#include <thread>
static inline void mi_atomic_yield(void) {
std::this_thread::yield();
}
#elif defined(_WIN32)
static inline void mi_atomic_yield(void) {
YieldProcessor();
}
#elif defined(__SSE2__)
#include <emmintrin.h>
static inline void mi_atomic_yield(void) {
_mm_pause();
}
#elif (defined(__GNUC__) || defined(__clang__)) && \
(defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__))
#if defined(__x86_64__) || defined(__i386__)
static inline void mi_atomic_yield(void) {
__asm__ volatile ("pause" ::: "memory");
}
#elif defined(__aarch64__)
static inline void mi_atomic_yield(void) {
__asm__ volatile("wfe");
}
#elif (defined(__arm__) && __ARM_ARCH >= 7)
static inline void mi_atomic_yield(void) {
__asm__ volatile("yield" ::: "memory");
}
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)
#ifdef __APPLE__
static inline void mi_atomic_yield(void) {
__asm__ volatile ("or r27,r27,r27" ::: "memory");
}
#else
static inline void mi_atomic_yield(void) {
__asm__ __volatile__ ("or 27,27,27" ::: "memory");
}
#endif
#elif defined(__armel__) || defined(__ARMEL__)
static inline void mi_atomic_yield(void) {
__asm__ volatile ("nop" ::: "memory");
}
#endif
#elif defined(__sun)
// Solaris / illumos
#include <synch.h>
static inline void mi_atomic_yield(void) {
smt_pause();
}
#elif defined(__wasi__)
#include <sched.h>
static inline void mi_atomic_yield(void) {
sched_yield();
}
#else
#include <unistd.h>
static inline void mi_atomic_yield(void) {
sleep(0);
}
#endif
// ----------------------------------------------------------------------
// Locks are only used for abandoned segment visiting in `arena.c`
// ----------------------------------------------------------------------
#if defined(_WIN32)
#define mi_lock_t CRITICAL_SECTION
static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
return TryEnterCriticalSection(lock);
}
static inline bool mi_lock_acquire(mi_lock_t* lock) {
EnterCriticalSection(lock);
return true;
}
static inline void mi_lock_release(mi_lock_t* lock) {
LeaveCriticalSection(lock);
}
static inline void mi_lock_init(mi_lock_t* lock) {
InitializeCriticalSection(lock);
}
static inline void mi_lock_done(mi_lock_t* lock) {
DeleteCriticalSection(lock);
}
#elif defined(MI_USE_PTHREADS)
#define mi_lock_t pthread_mutex_t
static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
return (pthread_mutex_trylock(lock) == 0);
}
static inline bool mi_lock_acquire(mi_lock_t* lock) {
return (pthread_mutex_lock(lock) == 0);
}
static inline void mi_lock_release(mi_lock_t* lock) {
pthread_mutex_unlock(lock);
}
static inline void mi_lock_init(mi_lock_t* lock) {
pthread_mutex_init(lock, NULL);
}
static inline void mi_lock_done(mi_lock_t* lock) {
pthread_mutex_destroy(lock);
}
/*
#elif defined(__cplusplus)
#include <mutex>
#define mi_lock_t std::mutex
static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
return lock->try_lock();
}
static inline bool mi_lock_acquire(mi_lock_t* lock) {
lock->lock();
return true;
}
static inline void mi_lock_release(mi_lock_t* lock) {
lock->unlock();
}
static inline void mi_lock_init(mi_lock_t* lock) {
(void)(lock);
}
static inline void mi_lock_done(mi_lock_t* lock) {
(void)(lock);
}
*/
#else
// fall back to poor man's locks.
// this should only be the case in a single-threaded environment (like __wasi__)
#define mi_lock_t _Atomic(uintptr_t)
static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
uintptr_t expected = 0;
return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1);
}
static inline bool mi_lock_acquire(mi_lock_t* lock) {
for (int i = 0; i < 1000; i++) { // for at most 1000 tries?
if (mi_lock_try_acquire(lock)) return true;
mi_atomic_yield();
}
return true;
}
static inline void mi_lock_release(mi_lock_t* lock) {
mi_atomic_store_release(lock, (uintptr_t)0);
}
static inline void mi_lock_init(mi_lock_t* lock) {
mi_lock_release(lock);
}
static inline void mi_lock_done(mi_lock_t* lock) {
(void)(lock);
}
#endif
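// Illustrative sketch (not part of the original header): the lock interface above is identical
// on every platform branch, so portable code can use it as below. The lock is assumed to have
// been set up with `mi_lock_init`; `mi_example_visit_locked` is a hypothetical name.
static inline void mi_example_visit_locked(mi_lock_t* lock) {
  if (!mi_lock_acquire(lock)) return;  // only the pthread variant can report failure here
  // ... critical section, e.g. visiting the abandoned segment list ...
  mi_lock_release(lock);
}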
#endif // MIMALLOC_ATOMIC_H

File diff suppressed because it is too large.


@@ -0,0 +1,375 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_PRIM_H
#define MIMALLOC_PRIM_H
// --------------------------------------------------------------------------
// This file specifies the primitive portability API.
// Each OS/host needs to implement these primitives, see `src/prim`
// for implementations on Windows, macOS, WASI, and Linux/Unix.
//
// note: on all primitive functions, we always have result parameters != NULL, and:
// addr != NULL and page aligned
// size > 0 and page aligned
// the return value is an error code as an `int` where 0 is success
// --------------------------------------------------------------------------
// OS memory configuration
typedef struct mi_os_mem_config_s {
size_t page_size; // default to 4KiB
size_t large_page_size; // 0 if not supported, usually 2MiB (4MiB on Windows)
size_t alloc_granularity; // smallest allocation size (usually 4KiB, on Windows 64KiB)
size_t physical_memory; // physical memory size
size_t virtual_address_bits; // usually 48 or 56 bits on 64-bit systems. (used to determine secure randomization)
bool has_overcommit; // can we reserve more memory than can be actually committed?
bool has_partial_free; // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc)
bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
} mi_os_mem_config_t;
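// Illustrative sketch (not part of this header, nor the actual code in `src/prim`): what a
// minimal Unix-style initialization of `mi_os_mem_config_t` could look like. The name
// `mi_example_prim_mem_init` and the hard-coded values are assumptions; real ports query the
// OS (e.g. via `sysconf` or `GetSystemInfo`).
static inline void mi_example_prim_mem_init(mi_os_mem_config_t* config) {
  config->page_size            = 4096;              // assume 4KiB OS pages
  config->large_page_size      = 2*1024*1024;       // assume 2MiB large pages are available
  config->alloc_granularity    = 4096;
  config->virtual_address_bits = 48;                // common on current 64-bit hardware
  config->has_overcommit       = true;              // e.g. Linux with default overcommit
  config->has_partial_free     = true;              // mmap ranges can be partially unmapped
  config->has_virtual_reserve  = true;              // PROT_NONE reservations are possible
}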
// Initialize
void _mi_prim_mem_init( mi_os_mem_config_t* config );
// Free OS memory
int _mi_prim_free(void* addr, size_t size );
// Allocate OS memory. Return NULL on error.
// The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
// which will later be committed explicitly using `_mi_prim_commit`.
// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// The `hint_addr` address is either `NULL` or a preferred allocation address but can be ignored.
// pre: !commit => !allow_large
// try_alignment >= _mi_os_page_size() and a power of 2
int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);
// Commit memory. Returns error code or 0 on success.
// For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true
// if the memory would need to be re-committed. For example, on Windows this is always true,
// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
// pre: needs_recommit != NULL
int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
// Reset memory. The range keeps being accessible but the content might be reset.
// Returns error code or 0 on success.
int _mi_prim_reset(void* addr, size_t size);
// Protect memory. Returns error code or 0 on success.
int _mi_prim_protect(void* addr, size_t size, bool protect);
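// Illustrative sketch (not part of this header, nor the actual code in `src/prim`): on a POSIX
// system, commit and protect map naturally onto `mprotect`. The `mi_example_*` names are
// hypothetical; the real ports do more careful error handling and reporting.
#if defined(__unix__) || defined(__APPLE__)
#include <sys/mman.h>   // mprotect, PROT_*
#include <errno.h>
static inline int mi_example_prim_commit(void* addr, size_t size, bool* is_zero) {
  *is_zero = false;   // be conservative: do not claim the memory is zero initialized
  return (mprotect(addr, size, PROT_READ | PROT_WRITE) == 0 ? 0 : errno);
}
static inline int mi_example_prim_protect(void* addr, size_t size, bool protect) {
  return (mprotect(addr, size, (protect ? PROT_NONE : (PROT_READ | PROT_WRITE))) == 0 ? 0 : errno);
}
#endif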
// Allocate huge (1GiB) pages possibly associated with a NUMA node.
// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// pre: size > 0 and a multiple of 1GiB.
// numa_node is either negative (don't care), or a numa node number.
int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);
// Return the current NUMA node
size_t _mi_prim_numa_node(void);
// Return the number of logical NUMA nodes
size_t _mi_prim_numa_node_count(void);
// Clock ticks
mi_msecs_t _mi_prim_clock_now(void);
// Return process information (only for statistics)
typedef struct mi_process_info_s {
mi_msecs_t elapsed;
mi_msecs_t utime;
mi_msecs_t stime;
size_t current_rss;
size_t peak_rss;
size_t current_commit;
size_t peak_commit;
size_t page_faults;
} mi_process_info_t;
void _mi_prim_process_info(mi_process_info_t* pinfo);
// Default stderr output. (only for warnings etc. with verbose enabled)
// msg != NULL && _mi_strlen(msg) > 0
void _mi_prim_out_stderr( const char* msg );
// Get an environment variable. (only for options)
// name != NULL, result != NULL, result_size >= 64
bool _mi_prim_getenv(const char* name, char* result, size_t result_size);
// Fill a buffer with strong randomness; return `false` on error or if
// there is no strong randomization available.
bool _mi_prim_random_buf(void* buf, size_t buf_len);
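// Illustrative sketch (not part of this header, nor the actual code in `src/prim`): on Linux,
// strong randomness can be obtained from `getrandom`. `mi_example_prim_random_buf` is a
// hypothetical name; the real ports also cover BSD/macOS (`arc4random_buf`) and Windows.
#if defined(__linux__)
#include <sys/random.h>   // getrandom (glibc >= 2.25 and musl)
static inline bool mi_example_prim_random_buf(void* buf, size_t buf_len) {
  ssize_t n = getrandom(buf, buf_len, 0);
  return (n >= 0 && (size_t)n == buf_len);   // false on error or a short read
}
#endif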
// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination.
void _mi_prim_thread_init_auto_done(void);
// Called on process exit and may take action to clean up resources associated with the thread auto done.
void _mi_prim_thread_done_auto_done(void);
// Called when the default heap for a thread changes
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
//-------------------------------------------------------------------
// Thread id: `_mi_prim_thread_id()`
//
// Getting the thread id should be performant as it is called in the
// fast path of `_mi_free` and we specialize for various platforms as
// inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
// We only require _mi_prim_thread_id() to return a unique id
// for each thread (unequal to zero).
//-------------------------------------------------------------------
// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot.
// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform.
// If you test on another platform and it works please send a PR :-)
// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
//
// Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly,
// but unfortunately we cannot detect support reliably (see issue #883)
// We also use it on Apple OS as we use a TLS slot for the default heap there.
#if defined(__GNUC__) && ( \
(defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
|| (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \
|| (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
|| (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
|| (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
)
#define MI_HAS_TLS_SLOT
static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
void* res;
const size_t ofs = (slot*sizeof(void*));
#if defined(__i386__)
__asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS
#elif defined(__APPLE__) && defined(__x86_64__)
__asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
__asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI
#elif defined(__x86_64__)
__asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS
#elif defined(__arm__)
void** tcb; MI_UNUSED(ofs);
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
res = tcb[slot];
#elif defined(__aarch64__)
void** tcb; MI_UNUSED(ofs);
#if defined(__APPLE__) // M1, issue #343
__asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
#else
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
#endif
res = tcb[slot];
#elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
MI_UNUSED(ofs);
res = pthread_getspecific(slot);
#endif
return res;
}
// setting a tls slot is only used on macOS for now
static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
const size_t ofs = (slot*sizeof(void*));
#if defined(__i386__)
__asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS
#elif defined(__APPLE__) && defined(__x86_64__)
__asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
__asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI
#elif defined(__x86_64__)
__asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS
#elif defined(__arm__)
void** tcb; MI_UNUSED(ofs);
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
tcb[slot] = value;
#elif defined(__aarch64__)
void** tcb; MI_UNUSED(ofs);
#if defined(__APPLE__) // M1, issue #343
__asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
#else
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
#endif
tcb[slot] = value;
#elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
MI_UNUSED(ofs);
pthread_setspecific(slot, value);
#endif
}
#endif
// Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id
// but unfortunately, it seems we cannot test for this reliably at this time (see issue #883)
// Nevertheless, it seems needed on older graviton platforms (see issue #851).
// For now, we only enable this for specific platforms.
#if !defined(__APPLE__) /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly (<https://github.com/microsoft/mimalloc/issues/343#issuecomment-763272369>)*/ \
&& !defined(MI_LIBC_MUSL) \
&& (!defined(__clang_major__) || __clang_major__ >= 14) /* older clang versions emit bad code; fall back to using the TLS slot (<https://lore.kernel.org/linux-arm-kernel/202110280952.352F66D8@keescook/T/>) */
#if (defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \
|| (defined(__GNUC__) && (__GNUC__ >= 11) && defined(__x86_64__)) \
|| (defined(__clang_major__) && (__clang_major__ >= 14) && (defined(__aarch64__) || defined(__x86_64__)))
#define MI_USE_BUILTIN_THREAD_POINTER 1
#endif
#endif
// defined in `init.c`; do not use these directly
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern bool _mi_process_is_initialized; // has mi_process_init been called?
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
// Get a unique id for the current thread.
#if defined(MI_PRIM_THREAD_ID)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488)
}
#elif defined(_WIN32)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
// Windows: works on Intel and ARM in both 32- and 64-bit
return (uintptr_t)NtCurrentTeb();
}
#elif MI_USE_BUILTIN_THREAD_POINTER
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
// Works on most Unix based platforms with recent compilers
return (uintptr_t)__builtin_thread_pointer();
}
#elif defined(MI_HAS_TLS_SLOT)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
#if defined(__BIONIC__)
// issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
// see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
return (uintptr_t)mi_prim_tls_slot(1);
#else
// in all our other targets, slot 0 is the thread id
// glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
// apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
return (uintptr_t)mi_prim_tls_slot(0);
#endif
}
#else
// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
return (uintptr_t)&_mi_heap_default;
}
#endif
/* ----------------------------------------------------------------------------------------
The thread local default heap: `_mi_prim_get_default_heap()`
This is inlined here as it is on the fast path for allocation functions.
On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
that the storage will always be available (allocated on the thread stacks).
On some platforms though we cannot use that when overriding `malloc` since the underlying
TLS implementation (or the loader) will itself call `malloc` on first access and recurse.
We try to circumvent this in an efficient way:
- macOS  : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On macOS, the
loader itself calls `malloc` even before the modules are initialized.
- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
- DragonFly: defaults are working but seem slow compared to FreeBSD (see PR #323)
------------------------------------------------------------------------------------------- */
static inline mi_heap_t* mi_prim_get_default_heap(void);
#if defined(MI_MALLOC_OVERRIDE)
#if defined(__APPLE__) // macOS
#define MI_TLS_SLOT 89 // seems unused?
// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
#elif defined(__OpenBSD__)
// use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24)
// #elif defined(__DragonFly__)
// #warning "mimalloc is not working correctly on DragonFly yet."
// #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
#elif defined(__ANDROID__)
// See issue #381
#define MI_TLS_PTHREAD
#endif
#endif
#if defined(MI_TLS_SLOT)
# if !defined(MI_HAS_TLS_SLOT)
# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined"
# endif
static inline mi_heap_t* mi_prim_get_default_heap(void) {
mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
if mi_unlikely(heap == NULL) {
#ifdef __GNUC__
__asm(""); // prevent conditional load of the address of _mi_heap_empty
#endif
heap = (mi_heap_t*)&_mi_heap_empty;
}
return heap;
}
#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) {
pthread_t self = pthread_self();
#if defined(__DragonFly__)
if (self==NULL) return NULL;
#endif
return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
}
static inline mi_heap_t* mi_prim_get_default_heap(void) {
mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot();
if mi_unlikely(pheap == NULL) return _mi_heap_main_get();
mi_heap_t* heap = *pheap;
if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty;
return heap;
}
#elif defined(MI_TLS_PTHREAD)
extern pthread_key_t _mi_heap_default_key;
static inline mi_heap_t* mi_prim_get_default_heap(void) {
mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
}
#else // default using a thread local variable; used on most platforms.
static inline mi_heap_t* mi_prim_get_default_heap(void) {
#if defined(MI_TLS_RECURSE_GUARD)
if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
#endif
return _mi_heap_default;
}
#endif // mi_prim_get_default_heap()
#endif // MIMALLOC_PRIM_H


@@ -0,0 +1,145 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_TRACK_H
#define MIMALLOC_TRACK_H
/* ------------------------------------------------------------------------------------------------------
Track memory ranges with macros for tools like Valgrind, address sanitizer, or other memory checkers.
These can be defined for tracking allocation:
#define mi_track_malloc_size(p,reqsize,size,zero)
#define mi_track_free_size(p,_size)
The macros are set up such that the size passed to `mi_track_free_size`
always matches the size passed to `mi_track_malloc_size` (currently, `size == mi_usable_size(p)`).
The `reqsize` is what the user requested, and `size >= reqsize`.
The `size` is either byte precise (and `size==reqsize`) if `MI_PADDING` is enabled,
or otherwise it is the usable block size which may be larger than the original request.
Use `_mi_block_size_of(void* p)` to get the full block size that was allocated (including padding etc).
The `zero` parameter is `true` if the allocated block is zero initialized.
Optional:
#define mi_track_align(p,alignedp,offset,size)
#define mi_track_resize(p,oldsize,newsize)
#define mi_track_init()
The `mi_track_align` is called right after a `mi_track_malloc` for aligned pointers in a block.
The corresponding `mi_track_free` still uses the block start pointer and original size (corresponding to the `mi_track_malloc`).
The `mi_track_resize` is currently unused but could be called on reallocations within a block.
`mi_track_init` is called at program start.
The following macros are for tools like asan and valgrind to track whether memory is
defined, undefined, or not accessible at all:
#define mi_track_mem_defined(p,size)
#define mi_track_mem_undefined(p,size)
#define mi_track_mem_noaccess(p,size)
-------------------------------------------------------------------------------------------------------*/
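// Illustrative usage sketch (not part of the original header): an allocation path is expected
// to pair the macros as described above, roughly like this (the variables are hypothetical;
// inside mimalloc the calls are made from `alloc.c` and `free.c`):
//
//   void* p = ...;                                  // block produced by the allocator
//   mi_track_malloc(p, reqsize, /*zero*/ false);    // announce it to the active tool
//   ...
//   mi_track_free_size(p, mi_usable_size(p));       // report the free with the matching size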
#if MI_TRACK_VALGRIND
// valgrind tool
#define MI_TRACK_ENABLED 1
#define MI_TRACK_HEAP_DESTROY 1 // track free of individual blocks on heap_destroy
#define MI_TRACK_TOOL "valgrind"
#include <valgrind/valgrind.h>
#include <valgrind/memcheck.h>
#define mi_track_malloc_size(p,reqsize,size,zero) VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero)
#define mi_track_free_size(p,_size) VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/)
#define mi_track_resize(p,oldsize,newsize) VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/)
#define mi_track_mem_defined(p,size) VALGRIND_MAKE_MEM_DEFINED(p,size)
#define mi_track_mem_undefined(p,size) VALGRIND_MAKE_MEM_UNDEFINED(p,size)
#define mi_track_mem_noaccess(p,size) VALGRIND_MAKE_MEM_NOACCESS(p,size)
#elif MI_TRACK_ASAN
// address sanitizer
#define MI_TRACK_ENABLED 1
#define MI_TRACK_HEAP_DESTROY 0
#define MI_TRACK_TOOL "asan"
#include <sanitizer/asan_interface.h>
#define mi_track_malloc_size(p,reqsize,size,zero) ASAN_UNPOISON_MEMORY_REGION(p,size)
#define mi_track_free_size(p,size) ASAN_POISON_MEMORY_REGION(p,size)
#define mi_track_mem_defined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size)
#define mi_track_mem_undefined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size)
#define mi_track_mem_noaccess(p,size) ASAN_POISON_MEMORY_REGION(p,size)
#elif MI_TRACK_ETW
// windows event tracing
#define MI_TRACK_ENABLED 1
#define MI_TRACK_HEAP_DESTROY 1
#define MI_TRACK_TOOL "ETW"
#include "../src/prim/windows/etw.h"
#define mi_track_init() EventRegistermicrosoft_windows_mimalloc();
#define mi_track_malloc_size(p,reqsize,size,zero) EventWriteETW_MI_ALLOC((UINT64)(p), size)
#define mi_track_free_size(p,size) EventWriteETW_MI_FREE((UINT64)(p), size)
#else
// no tracking
#define MI_TRACK_ENABLED 0
#define MI_TRACK_HEAP_DESTROY 0
#define MI_TRACK_TOOL "none"
#define mi_track_malloc_size(p,reqsize,size,zero)
#define mi_track_free_size(p,_size)
#endif
// -------------------
// Utility definitions
#ifndef mi_track_resize
#define mi_track_resize(p,oldsize,newsize) mi_track_free_size(p,oldsize); mi_track_malloc(p,newsize,false)
#endif
#ifndef mi_track_align
#define mi_track_align(p,alignedp,offset,size) mi_track_mem_noaccess(p,offset)
#endif
#ifndef mi_track_init
#define mi_track_init()
#endif
#ifndef mi_track_mem_defined
#define mi_track_mem_defined(p,size)
#endif
#ifndef mi_track_mem_undefined
#define mi_track_mem_undefined(p,size)
#endif
#ifndef mi_track_mem_noaccess
#define mi_track_mem_noaccess(p,size)
#endif
#if MI_PADDING
#define mi_track_malloc(p,reqsize,zero) \
if ((p)!=NULL) { \
mi_assert_internal(mi_usable_size(p)==(reqsize)); \
mi_track_malloc_size(p,reqsize,reqsize,zero); \
}
#else
#define mi_track_malloc(p,reqsize,zero) \
if ((p)!=NULL) { \
mi_assert_internal(mi_usable_size(p)>=(reqsize)); \
mi_track_malloc_size(p,reqsize,mi_usable_size(p),zero); \
}
#endif
#endif


@@ -0,0 +1,683 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_TYPES_H
#define MIMALLOC_TYPES_H
// --------------------------------------------------------------------------
// This file contains the main type definitions for mimalloc:
// mi_heap_t : all data for a thread-local heap, contains
// lists of all managed heap pages.
// mi_segment_t : a larger chunk of memory (usually 4MiB) from where pages
// are allocated.
// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from
// where objects are allocated.
// Note: we write "OS page" for OS memory pages while
// using plain "page" for mimalloc pages (`mi_page_t`).
// --------------------------------------------------------------------------
#include <stddef.h> // ptrdiff_t
#include <stdint.h> // uintptr_t, uint16_t, etc
#include "atomic.h" // _Atomic
#ifdef _MSC_VER
#pragma warning(disable:4214) // bitfield is not int
#endif
// Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `sizeof(void*)`
#ifndef MI_MAX_ALIGN_SIZE
#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t)
#endif
// ------------------------------------------------------
// Variants
// ------------------------------------------------------
// Define NDEBUG in the release version to disable assertions.
// #define NDEBUG
// Define MI_TRACK_<tool> to enable tracking support
// #define MI_TRACK_VALGRIND 1
// #define MI_TRACK_ASAN 1
// #define MI_TRACK_ETW 1
// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
// #define MI_STAT 1
// Define MI_SECURE to enable security mitigations
// #define MI_SECURE 1 // guard page around metadata
// #define MI_SECURE 2 // guard page around each mimalloc page
// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
#if !defined(MI_SECURE)
#define MI_SECURE 0
#endif
// Define MI_DEBUG for debug mode
// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
// #define MI_DEBUG 2 // + internal assertion checks
// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
#if !defined(MI_DEBUG)
#if !defined(NDEBUG) || defined(_DEBUG)
#define MI_DEBUG 2
#else
#define MI_DEBUG 0
#endif
#endif
// Use guard pages behind objects of a certain size (set by the MIMALLOC_DEBUG_GUARDED_MIN/MAX options)
// Padding should be disabled when using guard pages
// #define MI_GUARDED 1
#if defined(MI_GUARDED)
#define MI_PADDING 0
#endif
// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
// The padding can detect buffer overflow on free.
#if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW))
#define MI_PADDING 1
#endif
// Check padding bytes; allows byte-precise buffer overflow detection
#if !defined(MI_PADDING_CHECK) && MI_PADDING && (MI_SECURE>=3 || MI_DEBUG>=1)
#define MI_PADDING_CHECK 1
#endif
// Encoded free lists allow detection of corrupted free lists
// and can detect buffer overflows, modify after free, and double `free`s.
#if (MI_SECURE>=3 || MI_DEBUG>=1)
#define MI_ENCODE_FREELIST 1
#endif
// We used to abandon huge pages in order to eagerly deallocate them when freed from another thread.
// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a
// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from
// another thread so the memory becomes "virtually" available (and eventually gets properly freed by
// the owning thread).
// #define MI_HUGE_PAGE_ABANDON 1
// ------------------------------------------------------
// Platform specific values
// ------------------------------------------------------
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p))
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
#else
#error platform pointers must be 32, 64, or 128 bits
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits
#endif
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
# define MI_ZI(x) x##LL
#else
# define MI_ZU(x) x##UL
# define MI_ZI(x) x##L
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
// ------------------------------------------------------
// Main internal data-structures
// ------------------------------------------------------
// Main tuning parameters for segment and page sizes
// Sizes for 64-bit, divide by two for 32-bit
#ifndef MI_SMALL_PAGE_SHIFT
#define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB
#endif
#ifndef MI_MEDIUM_PAGE_SHIFT
#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
#endif
#ifndef MI_LARGE_PAGE_SHIFT
#define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB
#endif
#ifndef MI_SEGMENT_SHIFT
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT`
#endif
// Derived constants
#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_ALIGN (MI_SEGMENT_SIZE)
#define MI_SEGMENT_MASK ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (MI_ZU(1)<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
#define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
// The max object sizes are checked to not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4) // 16KiB
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB
#define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/2) // 2MiB
#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
// Maximum number of size classes. (spaced exponentially in 12.5% increments)
#define MI_BIN_HUGE (73U)
#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360)
#error "mimalloc internal: define more bins"
#endif
// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
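// For example, on a 64-bit build (`MI_INTPTR_SHIFT==3`) the defaults above give
// MI_SMALL_PAGE_SIZE == 64KiB, MI_MEDIUM_PAGE_SIZE == 512KiB, and
// MI_LARGE_PAGE_SIZE == MI_SEGMENT_SIZE == 4MiB, so a segment holds 64 small pages,
// 8 medium pages, or a single large page, and the object size limits evaluate to
// 16KiB (small), 128KiB (medium), and 2MiB (large).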
// ------------------------------------------------------
// Mimalloc pages contain allocated blocks
// ------------------------------------------------------
// The free lists use encoded next fields
// (Only actually encodes when MI_ENCODE_FREELIST is defined.)
typedef uintptr_t mi_encoded_t;
// thread id's
typedef size_t mi_threadid_t;
// free lists contain blocks
typedef struct mi_block_s {
mi_encoded_t next;
} mi_block_t;
#if MI_GUARDED
// we always align guarded pointers in a block at an offset
// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones
#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0))
#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED)
#endif
// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list
MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap
MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
MI_NEVER_DELAYED_FREE = 3 // sticky: used for abandoned pages without an owning heap; this only resets on page reclaim
} mi_delayed_t;
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
uint8_t in_full : 1;
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
uint32_t full_aligned;
struct {
uint8_t in_full;
uint8_t has_aligned;
} x;
} mi_page_flags_t;
#endif
// Thread free list.
// We use the bottom 2 bits of the pointer for mi_delayed_t flags
typedef uintptr_t mi_thread_free_t;
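// Illustrative sketch (not part of the original header): the encoding can be read and written
// with small helpers like these (mimalloc defines similar accessors in its internal headers).
// The `mi_example_tf_*` names are hypothetical.
static inline mi_block_t* mi_example_tf_block(mi_thread_free_t tf) {
  return (mi_block_t*)(tf & ~(uintptr_t)0x03);       // strip the two flag bits
}
static inline mi_delayed_t mi_example_tf_delayed(mi_thread_free_t tf) {
  return (mi_delayed_t)(tf & 0x03);                  // the bottom two bits hold the mi_delayed_t
}
static inline mi_thread_free_t mi_example_tf_make(mi_block_t* block, mi_delayed_t delayed) {
  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);  // block must be at least 4-byte aligned
}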
// A page contains blocks of one specific size (`block_size`).
// Each page has three lists of free blocks:
// `free` for blocks that can be allocated,
// `local_free` for freed blocks that are not yet available to `mi_malloc`
// `thread_free` for blocks freed by other threads
// The `local_free` and `thread_free` lists are migrated to the `free` list
// when it is exhausted. The separate `local_free` list is necessary to
// implement a monotonic heartbeat. The `thread_free` list is needed for
// avoiding atomic operations in the common case.
//
// `used - |thread_free|` == actual blocks that are in use (alive)
// `used - |thread_free| + |free| + |local_free| == capacity`
//
// We don't count `freed` (as |free|) but use `used` to reduce
// the number of memory accesses in the `mi_page_all_free` function(s).
//
// Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down
// - The size is 10 words on 64-bit which helps the page index calculations
// (and 12 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
// The invariant is that no-delayed-free is only set if there is
// at least one block that will be added, or has already been added, to
// the owning heap `thread_delayed_free` list. This guarantees that pages
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s {
// "owned" by the segment
uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
uint8_t segment_in_use:1; // `true` if the segment allocated this page
uint8_t is_committed:1; // `true` if the page virtual memory is committed
uint8_t is_zero_init:1; // `true` if the page was initially zero initialized
uint8_t is_huge:1; // `true` if the page is in a huge segment
// layout like this to optimize access in `mi_malloc` and `mi_free`
uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
uint16_t reserved; // number of blocks reserved in memory
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized
uint8_t retire_expire:7; // expiration count for retired blocks
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
// padding
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the page area containing the blocks
#if (MI_ENCODE_FREELIST || MI_PADDING)
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
#endif
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
_Atomic(uintptr_t) xheap;
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
#if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit
void* padding[1];
#endif
} mi_page_t;
// ------------------------------------------------------
// Mimalloc segments contain mimalloc pages
// ------------------------------------------------------
typedef enum mi_page_kind_e {
MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment
MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment
MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment
MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still `MI_SEGMENT_ALIGN` aligned)
// used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an alignment `> MI_BLOCK_ALIGNMENT_MAX`.
} mi_page_kind_t;
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in an arena, be allocated directly from the OS, or be statically allocated. The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remappable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // this arena can only be used for specific arena allocations
} mi_memid_arena_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
} mem;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
mi_memkind_t memkind;
} mi_memid_t;
// ---------------------------------------------------------------
// Segments contain mimalloc pages
// ---------------------------------------------------------------
typedef struct mi_subproc_s mi_subproc_t;
// Segments are large blocks of memory (2MiB on 64-bit) allocated from the OS.
// Inside segments we allocate fixed-size _pages_ that contain blocks.
typedef struct mi_segment_s {
// constant fields
mi_memid_t memid; // memory id to track provenance
  bool allow_decommit;     // can memory in this segment be decommitted?
  bool allow_purge;        // can memory in this segment be purged (reset or decommitted)?
size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
mi_subproc_t* subproc; // segment belongs to sub process
// segment fields
struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init`
struct mi_segment_s* prev;
bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation)
bool dont_free; // can be temporarily true to ensure the segment is not freed
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
  size_t abandoned_visits; // count how often this segment has been visited for reclaiming (to force a reclaim if it takes too long)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
  struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arenas, and only if `mi_option_visit_abandoned` is enabled
struct mi_segment_s* abandoned_os_prev;
// layout like this to optimize access in `mi_free`
_Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
  size_t page_shift;       // `1 << page_shift` == the page size == `page->block_size * page->reserved` (except for the first page, which is `segment_info_size` smaller)
mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
} mi_segment_t;
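// Illustrative sketch (assumptions: segments are allocated at MI_SEGMENT_SIZE
// aligned addresses and the helper names are hypothetical): `mi_free` can find
// the owning segment of a pointer by masking off the low bits, and the page
// inside that segment by shifting the offset with `page_shift`.
static inline mi_segment_t* mi_example_segment_of(const void* p) {
  return (mi_segment_t*)((uintptr_t)p & ~((uintptr_t)MI_SEGMENT_SIZE - 1));
}
static inline mi_page_t* mi_example_page_of(mi_segment_t* segment, const void* p) {
  const size_t diff = (size_t)((const uint8_t*)p - (const uint8_t*)segment);
  const size_t idx  = diff >> segment->page_shift;   // which fixed-size page slot
  return &segment->pages[idx];
}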
// ------------------------------------------------------
// Heaps
// Provide first-class heaps to allocate from.
// A heap just owns a set of pages for allocation and
// can only allocate/reallocate from the thread that created it.
// Freeing blocks can be done from any thread though.
// Per thread, the segments are shared among its heaps.
// Per thread, there is always a default heap that is
// used for allocation; it is initialized to statically
// point to an empty heap to avoid initialization checks
// in the fast path.
// ------------------------------------------------------
// Thread local data
typedef struct mi_tld_s mi_tld_t;
// Pages of a certain block size are held in a queue.
typedef struct mi_page_queue_s {
mi_page_t* first;
mi_page_t* last;
size_t block_size;
} mi_page_queue_t;
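// Illustrative sketch (hypothetical helper): pages of one size class are linked
// through their `next`/`prev` fields into a doubly-linked queue; appending at
// the tail keeps `first`/`last` consistent.
static inline void mi_example_page_queue_push_back(mi_page_queue_t* queue, mi_page_t* page) {
  page->next = NULL;
  page->prev = queue->last;
  if (queue->last != NULL) { queue->last->next = page; }
  else { queue->first = page; }
  queue->last = page;
}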
#define MI_BIN_FULL (MI_BIN_HUGE+1)
// Random context
typedef struct mi_random_cxt_s {
uint32_t input[16];
uint32_t output[16];
  int output_available;    // number of 32-bit words still available in `output`
  bool weak;               // `true` if the context was initialized from a weak source of randomness
} mi_random_ctx_t;
// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
#if (MI_PADDING)
typedef struct mi_padding_s {
uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
} mi_padding_t;
#define MI_PADDING_SIZE (sizeof(mi_padding_t))
#define MI_PADDING_WSIZE ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE)
#else
#define MI_PADDING_SIZE 0
#define MI_PADDING_WSIZE 0
#endif
#define MI_PAGES_DIRECT (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1)
// A heap owns a set of pages.
struct mi_heap_s {
mi_tld_t* tld;
_Atomic(mi_block_t*) thread_delayed_free;
  mi_threadid_t thread_id;                  // thread this heap belongs to
mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0)
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list
mi_random_ctx_t random; // random number context used for secure allocation
size_t page_count; // total number of pages in the `pages` queues.
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues)
size_t page_retired_max; // largest retired index into the `pages` array.
mi_heap_t* next; // list of heaps per thread
bool no_reclaim; // `true` if this heap should not reclaim abandoned pages
uint8_t tag; // custom tag, can be used for separating heaps based on the object types
#if MI_GUARDED
size_t guarded_size_min; // minimal size for guarded objects
size_t guarded_size_max; // maximal size for guarded objects
size_t guarded_sample_rate; // sample rate (set to 0 to disable guarded pages)
size_t guarded_sample_seed; // starting sample count
size_t guarded_sample_count; // current sample count (counting down to 0)
#endif
  mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimization: every entry points to a page with possibly free blocks in the corresponding queue for that size.
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin")
};
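// Illustrative sketch (the index computation is an assumption for illustration,
// and the helper name is hypothetical): `pages_free_direct` lets small
// allocations pick a candidate page without searching the bins by rounding the
// requested size up to machine words and using that as a direct index.
static inline mi_page_t* mi_example_get_free_small_page(mi_heap_t* heap, size_t size) {
  const size_t wsize = (size + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE;  // size in words, rounded up
  // caller is assumed to ensure `wsize < MI_PAGES_DIRECT` (i.e. a small allocation)
  return heap->pages_free_direct[wsize];
}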
// ------------------------------------------------------
// Debug
// ------------------------------------------------------
#if !defined(MI_DEBUG_UNINIT)
#define MI_DEBUG_UNINIT (0xD0)
#endif
#if !defined(MI_DEBUG_FREED)
#define MI_DEBUG_FREED (0xDF)
#endif
#if !defined(MI_DEBUG_PADDING)
#define MI_DEBUG_PADDING (0xDE)
#endif
#if (MI_DEBUG)
// use our own assertion to print without memory allocation
void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
#else
#define mi_assert(x)
#endif
#if (MI_DEBUG>1)
#define mi_assert_internal mi_assert
#else
#define mi_assert_internal(x)
#endif
#if (MI_DEBUG>2)
#define mi_assert_expensive mi_assert
#else
#define mi_assert_expensive(x)
#endif
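// Illustrative usage (hypothetical function): cheap checks use `mi_assert` and
// are enabled in any debug build, internal invariants use `mi_assert_internal`
// (MI_DEBUG > 1), and costly checks use `mi_assert_expensive` (MI_DEBUG > 2).
// The invariants below come from the field comments of `mi_segment_t`.
static inline void mi_example_check_segment(const mi_segment_t* segment) {
  (void)segment;  // avoid an unused-parameter warning when assertions are compiled out
  mi_assert(segment != NULL);
  mi_assert_internal(segment->used <= segment->capacity);      // `used <= capacity`
  mi_assert_expensive(segment->abandoned <= segment->used);    // `abandoned <= used`
}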
// ------------------------------------------------------
// Statistics
// ------------------------------------------------------
#ifndef MI_STAT
#if (MI_DEBUG>0)
#define MI_STAT 2
#else
#define MI_STAT 0
#endif
#endif
typedef struct mi_stat_count_s {
int64_t allocated;
int64_t freed;
int64_t peak;
int64_t current;
} mi_stat_count_t;
typedef struct mi_stat_counter_s {
int64_t total;
int64_t count;
} mi_stat_counter_t;
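// Illustrative sketch (hypothetical helpers, not the actual `_mi_stat_*`
// implementations): a `mi_stat_count_t` tracks the current value together with
// its peak and the cumulative allocated/freed totals, so updates touch all of them.
static inline void mi_example_stat_increase(mi_stat_count_t* stat, size_t amount) {
  stat->allocated += (int64_t)amount;
  stat->current   += (int64_t)amount;
  if (stat->current > stat->peak) { stat->peak = stat->current; }  // track the high-water mark
}
static inline void mi_example_stat_decrease(mi_stat_count_t* stat, size_t amount) {
  stat->freed   += (int64_t)amount;
  stat->current -= (int64_t)amount;
}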
typedef struct mi_stats_s {
mi_stat_count_t segments;
mi_stat_count_t pages;
mi_stat_count_t reserved;
mi_stat_count_t committed;
mi_stat_count_t reset;
mi_stat_count_t purged;
mi_stat_count_t page_committed;
mi_stat_count_t segments_abandoned;
mi_stat_count_t pages_abandoned;
mi_stat_count_t threads;
mi_stat_count_t normal;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
mi_stat_count_t segments_cache;
mi_stat_counter_t pages_extended;
mi_stat_counter_t mmap_calls;
mi_stat_counter_t commit_calls;
mi_stat_counter_t reset_calls;
mi_stat_counter_t purge_calls;
mi_stat_counter_t page_no_retire;
mi_stat_counter_t searches;
mi_stat_counter_t normal_count;
mi_stat_counter_t huge_count;
mi_stat_counter_t arena_count;
mi_stat_counter_t arena_crossover_count;
mi_stat_counter_t arena_rollback_count;
mi_stat_counter_t guarded_alloc_count;
#if MI_STAT>1
mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
#endif
} mi_stats_t;
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
#if (MI_STAT)
#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount)
#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount)
#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
#else
#define mi_stat_increase(stat,amount) (void)0
#define mi_stat_decrease(stat,amount) (void)0
#define mi_stat_counter_increase(stat,amount) (void)0
#endif
#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
// ------------------------------------------------------
// Sub processes do not reclaim or visit segments
// from other sub processes
// ------------------------------------------------------
struct mi_subproc_s {
_Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process
_Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list
  mi_lock_t abandoned_os_lock;              // lock for the abandoned os segment list (outside of arenas) (this lock protects list operations)
mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list
  mi_segment_t* abandoned_os_list;          // doubly-linked list of abandoned segments outside of arenas (in OS allocated memory)
mi_segment_t* abandoned_os_list_tail; // the tail-end of the list
mi_memid_t memid; // provenance of this memory block
};
// ------------------------------------------------------
// Thread Local data
// ------------------------------------------------------
// Milliseconds, stored as an `int64_t` to avoid overflows
typedef int64_t mi_msecs_t;
// Queue of segments
typedef struct mi_segment_queue_s {
mi_segment_t* first;
mi_segment_t* last;
} mi_segment_queue_t;
// Segments thread local data
typedef struct mi_segments_tld_s {
mi_segment_queue_t small_free; // queue of segments with free small pages
mi_segment_queue_t medium_free; // queue of segments with free medium pages
mi_page_queue_t pages_purge; // queue of freed pages that are delay purged
  size_t count;        // current number of segments
size_t peak_count; // peak number of segments
size_t current_size; // current size of all segments
size_t peak_size; // peak size of all segments
size_t reclaim_count;// number of reclaimed (abandoned) segments
mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_stats_t* stats; // points to tld stats
} mi_segments_tld_t;
// Thread local data
struct mi_tld_s {
unsigned long long heartbeat; // monotonic heartbeat count
  bool recurse;                        // true if the deferred free callback was called; used to prevent infinite recursion.
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
mi_segments_tld_t segments; // segment tld
mi_stats_t stats; // statistics
};
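// Illustrative usage (hypothetical function): the `stat` argument of the
// `mi_heap_stat_*` macros above is a field name of `mi_stats_t`, so the calls
// below expand to updates of `heap->tld->stats.normal` and `heap->tld->stats.searches`.
static inline void mi_example_record_alloc(mi_heap_t* heap, size_t size) {
  (void)heap; (void)size;                             // unused when MI_STAT is disabled
  mi_heap_stat_increase(heap, normal, size);          // bytes allocated from the normal bins
  mi_heap_stat_counter_increase(heap, searches, 1);   // one page-queue search performed
}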
#endif