Merge branch 'dev-exp-tls' into dev-exp

2024-12-27 13:33:18 +08:00 · 2020-02-09 18:34:23 -08:00 · 2020-02-09 18:34:23 -08:00 · 609703a7f3
commit 609703a7f3
parent 4f3ad24480 afe434463a
11 changed files with 338 additions and 184 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -5,11 +5,12 @@ set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 17)

 option(MI_OVERRIDE          "Override the standard malloc interface" ON)
-option(MI_INTERPOSE         "Use interpose to override standard malloc on macOS" ON)
 option(MI_DEBUG_FULL        "Use full internal heap invariant checking in DEBUG mode" OFF)
 option(MI_SECURE            "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF)
 option(MI_USE_CXX           "Use the C++ compiler to compile the library" OFF)
 option(MI_SEE_ASM           "Generate assembly files" OFF)
+option(MI_INTERPOSE         "Use interpose to override standard malloc on macOS" ON)
+option(MI_OSX_ZONE          "Use malloc zone to override standard malloc on macOS" OFF) # enables interpose as well
 option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
 option(MI_BUILD_TESTS       "Build test executables" ON)
 option(MI_CHECK_FULL        "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
@ -61,14 +62,19 @@ endif()
 if(MI_OVERRIDE MATCHES "ON")
  message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
  if(APPLE)
+    if(MI_OSX_ZONE MATCHES "ON")
+      # use zone's on macOS
+      message(STATUS "  Use malloc zone to override malloc (MI_OSX_ZONE=ON)")
+      list(APPEND mi_sources src/alloc-override-osx.c)
+      if(NOT MI_INTERPOSE MATCHES "ON")
+        message(STATUS "  (enabling INTERPOSE as well since zone's require this)")
+        set(MI_INTERPOSE "ON")
+      endif()
+    endif()
    if(MI_INTERPOSE MATCHES "ON")
      # use interpose on macOS
      message(STATUS "  Use interpose to override malloc (MI_INTERPOSE=ON)")
      list(APPEND mi_defines MI_INTERPOSE)
-    else()
-      # use zone's on macOS
-      message(STATUS "  Use zone's to override malloc (MI_INTERPOSE=OFF)")
-      list(APPEND mi_sources src/alloc-override-osx.c)
    endif()
  endif()
 endif()
@ -247,7 +253,7 @@ if (MI_BUILD_TESTS MATCHES "ON")
  target_compile_definitions(mimalloc-test-stress PRIVATE ${mi_defines})
  target_compile_options(mimalloc-test-stress PRIVATE ${mi_cflags})
  target_include_directories(mimalloc-test-stress PRIVATE include)
-  target_link_libraries(mimalloc-test-stress PRIVATE mimalloc-static ${mi_libraries})
+  target_link_libraries(mimalloc-test-stress PRIVATE mimalloc ${mi_libraries})

  enable_testing()
  add_test(test_api, mimalloc-test-api)
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@ -10,10 +10,6 @@ terms of the MIT license. A copy of the license can be found in the file

 #include "mimalloc-types.h"

-#if defined(MI_MALLOC_OVERRIDE) && (defined(__APPLE__) || defined(__OpenBSD__) || defined(__DragonFly__))
-#define MI_TLS_RECURSE_GUARD
-#endif
-
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
@ -33,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #else
 #define mi_decl_noinline
 #define mi_decl_thread          __thread        // hope for the best :-)
-#define mi_decl_cache_align     
+#define mi_decl_cache_align
 #endif


@ -51,6 +47,7 @@ void       _mi_random_init(mi_random_ctx_t* ctx);
 void       _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
 uintptr_t  _mi_random_next(mi_random_ctx_t* ctx);
 uintptr_t  _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t  _os_random_weak(uintptr_t extra_seed);
 static inline uintptr_t _mi_random_shuffle(uintptr_t x);

 // init.c
@ -237,7 +234,7 @@ static inline size_t _mi_wsize_from_size(size_t size) {


 // Overflow detecting multiply
-static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {  
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
 #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
 #include <limits.h>   // UINT_MAX, ULONG_MAX
 #if (SIZE_MAX == UINT_MAX)
@ -270,26 +267,76 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
 }


-/* -----------------------------------------------------------
-  The thread local default heap
----------------------------------------------------------- */
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will itself call `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+           loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: not yet working.
+------------------------------------------------------------------------------------------- */

 extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
-extern mi_heap_t _mi_heap_main;         // statically allocated main backing heap
 extern bool _mi_process_is_initialized;
+mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap

+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__MACH__) // OSX
+#define MI_TLS_SLOT               89  // seems unused? 
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
+// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)  
+#elif defined(__DragonFly__)
+#warning "mimalloc is not working correctly on DragonFly yet."
+#define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#endif
+#endif
+
+#if defined(MI_TLS_SLOT)
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+#include <pthread.h>
+static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
+  // Returns the address where the default-heap pointer of the calling thread is
+  // stored: `MI_TLS_PTHREAD_SLOT_OFS` bytes into the object that `pthread_self()` returns.
+  pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) {
+    // nothing to index into: hand out a static slot that holds the main heap instead.
+    // (in C a static cannot take a function call as its initializer, so fill it on first use)
+    static mi_heap_t* pheap_main = NULL;
+    if (pheap_main == NULL) { pheap_main = _mi_heap_main_get(); }
+    return &pheap_main;
+  }
+  #endif
+  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+#elif defined(MI_TLS_PTHREAD)
+#include <pthread.h>
+extern pthread_key_t _mi_heap_default_key;
+#else
 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+#endif

 static inline mi_heap_t* mi_get_default_heap(void) {
-#ifdef MI_TLS_RECURSE_GUARD
-  // on some BSD platforms, like macOS, the dynamic loader calls `malloc`
-  // to initialize thread local data. To avoid recursion, we need to avoid
-  // accessing the thread local `_mi_default_heap` until our module is loaded
-  // and use the statically allocated main heap until that time.
-  // TODO: patch ourselves dynamically to avoid this check every time?
-  if (!_mi_process_is_initialized) return &_mi_heap_main;
-#endif
+#if defined(MI_TLS_SLOT)
+  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD)
+  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#else
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+  #endif
  return _mi_heap_default;
+#endif
 }

 static inline bool mi_heap_is_default(const mi_heap_t* heap) {
@ -306,6 +353,8 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
 }

 static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
+  mi_assert_internal(_mi_heap_main.cookie != 0);
  return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }

@ -351,7 +400,7 @@ static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, con

 // Get the page containing the pointer
 static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
-  uintptr_t idx = _mi_segment_page_idx_of(segment, p);  
+  uintptr_t idx = _mi_segment_page_idx_of(segment, p);
  return &((mi_segment_t*)segment)->pages[idx];
 }

@ -424,14 +473,14 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t*
  return mi_tf_make(block, mi_tf_delayed(tf));
 }

-// are all blocks in a page freed? 
+// are all blocks in a page freed?
 // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
 static inline bool mi_page_all_free(const mi_page_t* page) {
  mi_assert_internal(page != NULL);
  return (page->used == 0);
 }

-// are there any available blocks? 
+// are there any available blocks?
 static inline bool mi_page_has_any_available(const mi_page_t* page) {
  mi_assert_internal(page != NULL && page->reserved > 0);
  return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
@ -479,11 +528,11 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
 /* -------------------------------------------------------------------
 Encoding/Decoding the free list next pointers

-This is to protect against buffer overflow exploits where the 
-free list is mutated. Many hardened allocators xor the next pointer `p` 
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
 with a secret key `k1`, as `p^k1`. This prevents overwriting with known
-values but might be still too weak: if the attacker can guess 
-the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`). 
+values but might be still too weak: if the attacker can guess
+the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`).
 Moreover, if multiple blocks can be read as well, the attacker can
 xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
 about the pointers (and subsequently `k1`).
@ -491,9 +540,9 @@ about the pointers (and subsequently `k1`).
 Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
 Since these operations are not associative, the above approaches do not
 work so well any more even if the `p` can be guesstimated. For example,
-for the read case we can subtract two entries to discard the `+k1` term, 
+for the read case we can subtract two entries to discard the `+k1` term,
 but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
-We include the left-rotation since xor and addition are otherwise linear 
+We include the left-rotation since xor and addition are otherwise linear
 in the lowest bit. Finally, both keys are unique per page which reduces
 the re-use of keys by a large factor.

@ -619,9 +668,8 @@ static inline size_t _mi_os_numa_node_count(void) {


 // -------------------------------------------------------------------
-// Getting the thread id should be performant
-// as it is called in the fast path of `_mi_free`,
-// so we specialize for various platforms.
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms.
 // -------------------------------------------------------------------
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@ -630,24 +678,55 @@ static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
  // Windows: works on Intel and ARM in both 32- and 64-bit
  return (uintptr_t)NtCurrentTeb();
 }
-#elif (defined(__GNUC__) || defined(__clang__)) && \
+
+#elif defined(__GNUC__) && \
      (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))
-// TLS register on x86 is in the FS or GS register
-// see: https://akkadia.org/drepper/tls.pdf
+
+// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  res = tcb[slot];
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  res = tcb[slot];
+#endif
+  return res;
+}
+
+// Store `value` in slot `slot` of the current thread's TLS block (the write counterpart of `mi_tls_slot`); setting is only used on macOSX for now
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));  // byte offset of the slot relative to the segment register
+#if defined(__i386__)
+  __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS (destination is output operand %0)
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  tcb[slot] = value;
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  tcb[slot] = value;
+#endif
+}
+
 static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
-  uintptr_t tid;
-  #if defined(__i386__)
-  __asm__("movl %%gs:0, %0" : "=r" (tid) : : );  // 32-bit always uses GS
-  #elif defined(__MACH__)
-  __asm__("movq %%gs:0, %0" : "=r" (tid) : : );  // x86_64 macOS uses GS
-  #elif defined(__x86_64__)
-  __asm__("movq %%fs:0, %0" : "=r" (tid) : : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-  asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid));
-  #elif defined(__aarch64__)
-  asm volatile ("mrs %0, tpidr_el0" : "=r" (tid));
-  #endif
-  return tid;
+  // in all our targets, slot 0 is the pointer to the thread control block
+  return (uintptr_t)mi_tls_slot(0);
 }
 #else
 // otherwise use standard C
--- a/include/mimalloc-new-delete.h
+++ b/include/mimalloc-new-delete.h
@ -32,8 +32,8 @@ terms of the MIT license. A copy of the license can be found in the file
  void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }

  #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
-  void operator delete  (void* p, std::size_t n) { mi_free_size(p,n); };
-  void operator delete[](void* p, std::size_t n) { mi_free_size(p,n); };
+  void operator delete  (void* p, std::size_t n) noexcept { mi_free_size(p,n); };
+  void operator delete[](void* p, std::size_t n) noexcept { mi_free_size(p,n); };
  #endif

  #if (__cplusplus > 201402L || defined(__cpp_aligned_new))
--- a/src/alloc-override-osx.c
+++ b/src/alloc-override-osx.c
@ -17,6 +17,12 @@ terms of the MIT license. A copy of the license can be found in the file
 /* ------------------------------------------------------
   Override system malloc on macOS
   This is done through the malloc zone interface.
+   It seems we also need to interpose (see `alloc-override.c`)
+   or otherwise we get zone errors as there are usually 
+   already allocations done by the time we take over the 
+   zone. Unfortunately, that means we need to replace
+   the `free` with a checked free (`cfree`) impacting 
+   performance.
 ------------------------------------------------------ */

 #include <AvailabilityMacros.h>
@ -35,34 +41,42 @@ extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_im
 ------------------------------------------------------ */

 static size_t zone_size(malloc_zone_t* zone, const void* p) {
+  UNUSED(zone); UNUSED(p);
  return 0; // as we cannot guarantee that `p` comes from us, just return 0
 }

 static void* zone_malloc(malloc_zone_t* zone, size_t size) {
+  UNUSED(zone);
  return mi_malloc(size);
 }

 static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
+  UNUSED(zone);
  return mi_calloc(count, size);
 }

 static void* zone_valloc(malloc_zone_t* zone, size_t size) {
+  UNUSED(zone);
  return mi_malloc_aligned(size, _mi_os_page_size());
 }

 static void zone_free(malloc_zone_t* zone, void* p) {
+  UNUSED(zone);
  return mi_free(p);
 }

 static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
+  UNUSED(zone);
  return mi_realloc(p, newsize);
 }

 static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
+  UNUSED(zone);
  return mi_malloc_aligned(size,alignment);
 }

 static void zone_destroy(malloc_zone_t* zone) {
+  UNUSED(zone);
  // todo: ignore for now?
 }

@ -83,11 +97,13 @@ static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) {
 }

 static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
+  UNUSED(zone); UNUSED(size);
  mi_collect(false);
  return 0;
 }

 static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
+  UNUSED(size);
  zone_free(zone,p);
 }

@ -102,34 +118,43 @@ static kern_return_t intro_enumerator(task_t task, void* p,
                            vm_range_recorder_t recorder)
 {
  // todo: enumerate all memory
+  UNUSED(task); UNUSED(p); UNUSED(type_mask); UNUSED(zone_address);
+  UNUSED(reader); UNUSED(recorder);
  return KERN_SUCCESS;
 }

 static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
+  UNUSED(zone);
  return mi_good_size(size);
 }

 static boolean_t intro_check(malloc_zone_t* zone) {
+  UNUSED(zone);
  return true;
 }

 static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
+  UNUSED(zone); UNUSED(verbose);
  mi_stats_print(NULL);
 }

 static void intro_log(malloc_zone_t* zone, void* p) {
+  UNUSED(zone); UNUSED(p);
  // todo?
 }

 static void intro_force_lock(malloc_zone_t* zone) {
+  UNUSED(zone);
  // todo?
 }

 static void intro_force_unlock(malloc_zone_t* zone) {
+  UNUSED(zone);
  // todo?
 }

 static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
+  UNUSED(zone);
  // todo...
  stats->blocks_in_use = 0;
  stats->size_in_use = 0;
@ -138,6 +163,7 @@ static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
 }

 static boolean_t intro_zone_locked(malloc_zone_t* zone) {
+  UNUSED(zone);
  return false;
 }

@ -161,7 +187,6 @@ static malloc_zone_t* mi_get_default_zone()
  }
 }

-
 static void __attribute__((constructor)) _mi_macos_override_malloc()
 {
  static malloc_introspection_t intro;
@ -201,6 +226,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc()
  zone.free_definite_size = &zone_free_definite_size;
  zone.pressure_relief = &zone_pressure_relief;
  intro.zone_locked = &intro_zone_locked;
+  intro.statistics = &intro_statistics;

  // force the purgeable zone to exist to avoid strange bugs
  if (malloc_default_purgeable_zone) {
@ -225,6 +251,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc()
    malloc_zone_unregister(purgeable_zone);
    malloc_zone_register(purgeable_zone);
  }
+
 }

 #endif // MI_MALLOC_OVERRIDE
--- a/src/alloc-override.c
+++ b/src/alloc-override.c
@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)"
 #endif

-#if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32)
+#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__MACH__) && !defined(MI_INTERPOSE)))

 // ------------------------------------------------------
 // Override system malloc
@ -47,26 +47,31 @@ terms of the MIT license. A copy of the license can be found in the file
    const void* replacement;
    const void* target;
  };
-  #define MI_INTERPOSEX(oldfun,newfun)  { (const void*)&newfun, (const void*)&oldfun }
-  #define MI_INTERPOSE_MI(fun)         MI_INTERPOSEX(fun,mi_##fun)
+  #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
+  #define MI_INTERPOSE_MI(fun)            MI_INTERPOSE_FUN(fun,mi_##fun)
  __attribute__((used)) static struct mi_interpose_s _mi_interposes[]  __attribute__((section("__DATA, __interpose"))) =
  {
    MI_INTERPOSE_MI(malloc),
    MI_INTERPOSE_MI(calloc),
    MI_INTERPOSE_MI(realloc),
-    MI_INTERPOSE_MI(free),
    MI_INTERPOSE_MI(strdup),
-    MI_INTERPOSE_MI(strndup)
+    MI_INTERPOSE_MI(strndup),
+    MI_INTERPOSE_MI(realpath),
+    MI_INTERPOSE_MI(posix_memalign),
+    MI_INTERPOSE_MI(reallocf),
+    MI_INTERPOSE_MI(valloc),
+    // some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
+    MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
  };
 #elif defined(_MSC_VER)
  // cannot override malloc unless using a dll.
  // we just override new/delete which does work in a static library.
 #else
  // On all other systems forward to our API
-  void* malloc(size_t size)              mi_attr_noexcept  MI_FORWARD1(mi_malloc, size);
-  void* calloc(size_t size, size_t n)    mi_attr_noexcept  MI_FORWARD2(mi_calloc, size, n);
-  void* realloc(void* p, size_t newsize) mi_attr_noexcept  MI_FORWARD2(mi_realloc, p, newsize);
-  void  free(void* p)                    mi_attr_noexcept  MI_FORWARD0(mi_free, p);
+  void* malloc(size_t size)              MI_FORWARD1(mi_malloc, size);
+  void* calloc(size_t size, size_t n)    MI_FORWARD2(mi_calloc, size, n);
+  void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize);
+  void  free(void* p)                    MI_FORWARD0(mi_free, p);
 #endif

 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__)
@ -94,8 +99,8 @@ terms of the MIT license. A copy of the license can be found in the file
  void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); }

  #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
-  void operator delete  (void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n);
-  void operator delete[](void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n);
+  void operator delete  (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n);
+  void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n);
  #endif

  #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
@ -194,4 +199,3 @@ int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_me
 #endif

 #endif // MI_MALLOC_OVERRIDE && !_WIN32
-
--- a/src/alloc.c
+++ b/src/alloc.c
@ -212,7 +212,7 @@ static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* bl
  size_t delta;
  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
  mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
-  return (ok ? bsize - delta : 0); 
+  return (ok ? bsize - delta : 0);
 }

 static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
@ -259,7 +259,7 @@ static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, co
  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
  padding->delta = (uint32_t)new_delta;
 }
-#else 
+#else
 static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
  UNUSED(page);
  UNUSED(block);
@ -359,7 +359,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block
    }
    else if (mi_unlikely(mi_page_is_in_full(page))) {
      _mi_page_unfull(page);
-    }    
+    }
  }
  else {
    _mi_free_block_mt(page,block);
@ -401,7 +401,7 @@ void mi_free(void* p) mi_attr_noexcept
      "(this may still be a valid very large allocation (over 64MiB))\n", p);
    if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
      _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
-    }
+    } 
  }
 #endif
 #if (MI_DEBUG!=0 || MI_SECURE>=4)
@ -421,11 +421,11 @@ void mi_free(void* p) mi_attr_noexcept
  mi_heap_stat_decrease(heap, malloc, bsize);
  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // huge page stats are accounted for in `_mi_page_retire`
    mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], 1);
-  }  
+  }
 #endif

  if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) {  // the thread id matches and it is not a full page, nor has aligned blocks
-    // local, and not full or aligned    
+    // local, and not full or aligned
    if (mi_unlikely(mi_check_is_double_free(page,block))) return;
    mi_check_padding(page, block);
    #if (MI_DEBUG!=0)
@ -436,7 +436,7 @@ void mi_free(void* p) mi_attr_noexcept
    page->used--;
    if (mi_unlikely(mi_page_all_free(page))) {
      _mi_page_retire(page);
-    }    
+    }
  }
  else {
    // non-local, aligned blocks, or a full page; use the more generic path
@ -473,7 +473,7 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept {
  const mi_segment_t* const segment = _mi_ptr_segment(p);
  const mi_page_t* const page = _mi_segment_page_of(segment, p);
  const mi_block_t* const block = (const mi_block_t*)p;
-  const size_t size = mi_page_usable_size_of(page, block);  
+  const size_t size = mi_page_usable_size_of(page, block);
  if (mi_unlikely(mi_page_has_aligned(page))) {
    ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
    mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
--- a/src/init.c
+++ b/src/init.c
@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = {

 #if defined(MI_PADDING) && (MI_INTPTR_SIZE >= 8)
 #define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
-#elif defined(MI_PADDING) 
+#elif defined(MI_PADDING)
 #define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
 #else
 #define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
@ -107,32 +107,28 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
 #define tld_main_stats  ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
 #define tld_main_os     ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os)))

+extern mi_heap_t _mi_heap_main;
+
 static mi_tld_t tld_main = {
  0, false,
  &_mi_heap_main,
-  { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, 
-    0, 0, 0, 0, 0, 0, NULL, 
-    tld_main_stats, tld_main_os 
+  { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
+    0, 0, 0, 0, 0, 0, NULL,
+    tld_main_stats, tld_main_os
  }, // segments
  { 0, tld_main_stats },  // os
  { MI_STATS_NULL }       // stats
 };

-#if MI_INTPTR_SIZE==8
-#define MI_INIT_COOKIE  (0xCDCDCDCDCDCDCDCDUL)
-#else
-#define MI_INIT_COOKIE  (0xCDCDCDCDUL)
-#endif
-
 mi_heap_t _mi_heap_main = {
  &tld_main,
  MI_SMALL_PAGES_EMPTY,
  MI_PAGE_QUEUES_EMPTY,
  ATOMIC_VAR_INIT(NULL),
  0,                // thread id
-  MI_INIT_COOKIE,   // initial cookie
-  { MI_INIT_COOKIE, MI_INIT_COOKIE }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
-  { {0}, {0}, 0 },  // random
+  0,                // initial cookie (0 means "not yet initialized", see `mi_heap_main_init`)
+  { 0, 0 },         // keys: placeholders, replaced by random values in `mi_heap_main_init`
+  { {0x846ca68b}, {0}, 0 },  // random
  0,                // page count
  false             // can reclaim
 };
@ -142,6 +138,22 @@ bool _mi_process_is_initialized = false;  // set to `true` in `mi_process_init`.
 mi_stats_t _mi_stats_main = { MI_STATS_NULL };


+static void mi_heap_main_init(void) {
+  if (_mi_heap_main.cookie == 0) {
+    _mi_heap_main.thread_id = _mi_thread_id();
+    _mi_heap_main.cookie = _os_random_weak((uintptr_t)&mi_heap_main_init);
+    _mi_random_init(&_mi_heap_main.random);
+    _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
+    _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
+  }
+}
+
+mi_heap_t* _mi_heap_main_get(void) {
+  mi_heap_main_init();
+  return &_mi_heap_main;
+}
+
+
 /* -----------------------------------------------------------
  Initialization and freeing of the thread local heaps
 ----------------------------------------------------------- */
@ -154,14 +166,16 @@ typedef struct mi_thread_data_s {

 // Initialize the thread local default heap, called from `mi_thread_init`
 static bool _mi_heap_init(void) {
-  if (mi_heap_is_initialized(_mi_heap_default)) return true;
+  if (mi_heap_is_initialized(mi_get_default_heap())) return true;
  if (_mi_is_main_thread()) {
+    // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
    // the main heap is statically allocated
+    mi_heap_main_init();
    _mi_heap_set_default_direct(&_mi_heap_main);
-    mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
+    //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
  }
  else {
-    // use `_mi_os_alloc` to allocate directly from the OS    
+    // use `_mi_os_alloc` to allocate directly from the OS
    mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation?
    if (td == NULL) {
      _mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n");
@ -176,7 +190,7 @@ static bool _mi_heap_init(void) {
    heap->cookie  = _mi_heap_random_next(heap) | 1;
    heap->keys[0] = _mi_heap_random_next(heap);
    heap->keys[1] = _mi_heap_random_next(heap);
-    heap->tld = tld;    
+    heap->tld = tld;
    tld->heap_backing = heap;
    tld->segments.stats = &tld->stats;
    tld->segments.os = &tld->os;
@ -253,14 +267,15 @@ static void _mi_thread_done(mi_heap_t* default_heap);
  // use thread local storage keys to detect thread ending
  #include <windows.h>
  #include <fibersapi.h>
-  static DWORD mi_fls_key;
+  static DWORD mi_fls_key = (DWORD)(-1);
  static void NTAPI mi_fls_done(PVOID value) {
    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
  }
 #elif defined(MI_USE_PTHREADS)
-  // use pthread locol storage keys to detect thread ending
+  // use pthread local storage keys to detect thread ending
+  // (and used with MI_TLS_PTHREAD for the default heap)
  #include <pthread.h>
-  static pthread_key_t mi_pthread_key;
+  pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
  static void mi_pthread_done(void* value) {
    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
  }
@ -280,8 +295,10 @@ static void mi_process_setup_auto_thread_done(void) {
  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
    mi_fls_key = FlsAlloc(&mi_fls_done);
  #elif defined(MI_USE_PTHREADS)
-    pthread_key_create(&mi_pthread_key, &mi_pthread_done);
+    mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+    pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
  #endif
+  _mi_heap_set_default_direct(&_mi_heap_main);
 }


@ -323,21 +340,31 @@ static void _mi_thread_done(mi_heap_t* heap) {

 void _mi_heap_set_default_direct(mi_heap_t* heap)  {
  mi_assert_internal(heap != NULL);
+  #if defined(MI_TLS_SLOT)
+  mi_tls_slot_set(MI_TLS_SLOT,heap);
+  #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  *mi_tls_pthread_heap_slot() = heap;
+  #elif defined(MI_TLS_PTHREAD)
+  // we use _mi_heap_default_key
+  #else
  _mi_heap_default = heap;
+  #endif

  // ensure the default heap is passed to `_mi_thread_done`
  // setting to a non-NULL value also ensures `mi_thread_done` is called.
  #if defined(_WIN32) && defined(MI_SHARED_LIB)
    // nothing to do as it is done in DllMain
  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+    mi_assert_internal(mi_fls_key != 0);
    FlsSetValue(mi_fls_key, heap);
  #elif defined(MI_USE_PTHREADS)
-    pthread_setspecific(mi_pthread_key, heap);
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // the key can still be unset during recursive invocation on FreeBSD
+    pthread_setspecific(_mi_heap_default_key, heap);
+  }
  #endif
 }


-
 // --------------------------------------------------------
 // Run functions on process init/done, and thread init/done
 // --------------------------------------------------------
@ -389,11 +416,16 @@ static void mi_allocator_done() {

 // Called once by the process loader
 static void mi_process_load(void) {
+  mi_heap_main_init();
+  #if defined(MI_TLS_RECURSE_GUARD)
+  volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
+  UNUSED(dummy);
+  #endif
  os_preloading = false;
  atexit(&mi_process_done);
  _mi_options_init();
  mi_process_init();
-  //mi_stats_reset();
+  //mi_stats_reset();
  if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");

  // show message from the redirector (if present)
@ -408,22 +440,12 @@ static void mi_process_load(void) {
 void mi_process_init(void) mi_attr_noexcept {
  // ensure we are called once
  if (_mi_process_is_initialized) return;
-  // access _mi_heap_default before setting _mi_process_is_initialized to ensure
-  // that the TLS slot is allocated without getting into recursion on macOS
-  // when using dynamic linking with interpose.
-  mi_get_default_heap();
  _mi_process_is_initialized = true;
-
-  _mi_heap_main.thread_id = _mi_thread_id();
-  _mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id);
-  _mi_random_init(&_mi_heap_main.random);
-  #ifndef __APPLE__  // TODO: fix this? cannot update cookie if allocation already happened..
-  _mi_heap_main.cookie  = _mi_heap_random_next(&_mi_heap_main);
-  _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
-  _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
-  #endif
  mi_process_setup_auto_thread_done();
+
+  _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
  _mi_os_init();
+  mi_heap_main_init();
  #if (MI_DEBUG)
  _mi_verbose_message("debug level : %d\n", MI_DEBUG);
  #endif
--- a/src/options.c
+++ b/src/options.c
@ -70,7 +70,11 @@ static mi_option_desc_t options[_mi_option_last] =
  { 1, UNINIT, MI_OPTION(page_reset) },          // reset page memory on free
  { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates
  { 0, UNINIT, MI_OPTION(segment_reset) },       // reset segment memory on free (needs eager commit)
+#if defined(__NetBSD__)
+  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+#else
  { 1, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+#endif
  { 100, UNINIT, MI_OPTION(reset_delay) },       // reset delay in milli-seconds
  { 0,   UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes.
  { 100, UNINIT, MI_OPTION(os_tag) },            // only apple specific for now but might serve more or less related purpose
@ -239,16 +243,30 @@ static volatile _Atomic(uintptr_t) error_count; // = 0;  // when MAX_ERROR_COUNT
 // inside the C runtime causes another message.
 static mi_decl_thread bool recurse = false;

+static bool mi_recurse_enter(void) {
+  #ifdef MI_TLS_RECURSE_GUARD
+  if (_mi_preloading()) return true;
+  #endif
+  if (recurse) return false;
+  recurse = true;
+  return true;
+}
+
+static void mi_recurse_exit(void) {
+  #ifdef MI_TLS_RECURSE_GUARD
+  if (_mi_preloading()) return;
+  #endif
+  recurse = false;
+}
+
 void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
-  if (recurse) return;
+  if (!mi_recurse_enter()) return;
  if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
    out = mi_out_get_default(&arg);
  }
-  recurse = true;
  if (prefix != NULL) out(prefix,arg);
  out(message,arg);
-  recurse = false;
-  return;
+  mi_recurse_exit();
 }

 // Define our own limited `fprintf` that avoids memory allocation.
@ -256,14 +274,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
 static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
  char buf[512];
  if (fmt==NULL) return;
-  if (recurse) return;
-  recurse = true;
+  if (!mi_recurse_enter()) return;
  vsnprintf(buf,sizeof(buf)-1,fmt,args);
-  recurse = false;
+  mi_recurse_exit();
  _mi_fputs(out,arg,prefix,buf);
 }

-
 void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
  va_list args;
  va_start(args,fmt);
@ -290,7 +306,7 @@ void _mi_verbose_message(const char* fmt, ...) {
 static void mi_show_error_message(const char* fmt, va_list args) {
  if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
  if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
-  mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);  
+  mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);
 }

 void _mi_warning_message(const char* fmt, ...) {
--- a/src/random.c
+++ b/src/random.c
@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file

 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
-and to avoid implementations that use a lock. We only use the OS provided 
+and to avoid implementations that use a lock. We only use the OS provided
 random source to initialize the initial seeds. Since we do not need ultimate
 performance but we do rely on the security (for secret cookies in secure mode)
 we use a cryptographically secure generator (chacha20).
@ -21,11 +21,11 @@ we use a cryptographically secure generator (chacha20).


 /* ----------------------------------------------------------------------------
-Chacha20 implementation as the original algorithm with a 64-bit nonce 
+Chacha20 implementation as the original algorithm with a 64-bit nonce
 and counter: https://en.wikipedia.org/wiki/Salsa20
 The input matrix has sixteen 32-bit values:
 Position  0 to  3: constant key
-Position  4 to 11: the key 
+Position  4 to 11: the key
 Position 12 to 13: the counter.
 Position 14 to 15: the nonce.

@ -44,8 +44,8 @@ static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d
  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
 }

-static void chacha_block(mi_random_ctx_t* ctx) 
-{  
+static void chacha_block(mi_random_ctx_t* ctx)
+{
  // scramble into `x`
  uint32_t x[16];
  for (size_t i = 0; i < 16; i++) {
@ -72,8 +72,8 @@ static void chacha_block(mi_random_ctx_t* ctx)
  ctx->input[12] += 1;
  if (ctx->input[12] == 0) {
    ctx->input[13] += 1;
-    if (ctx->input[13] == 0) {  // and keep increasing into the nonce 
-      ctx->input[14] += 1;  
+    if (ctx->input[13] == 0) {  // and keep increasing into the nonce
+      ctx->input[14] += 1;
    }
  }
 }
@ -83,7 +83,7 @@ static uint32_t chacha_next32(mi_random_ctx_t* ctx) {
    chacha_block(ctx);
    ctx->output_available = 16; // (assign again to suppress static analysis warning)
  }
-  const uint32_t x = ctx->output[16 - ctx->output_available];  
+  const uint32_t x = ctx->output[16 - ctx->output_available];
  ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out
  ctx->output_available--;
  return x;
@ -94,9 +94,9 @@ static inline uint32_t read32(const uint8_t* p, size_t idx32) {
  return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24);
 }

-static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce) 
+static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
 {
-  // since we only use chacha for randomness (and not encryption) we 
+  // since we only use chacha for randomness (and not encryption) we
  // do not _need_ to read 32-bit values as little endian but we do anyways
  // just for being compatible :-)
  memset(ctx, 0, sizeof(*ctx));
@ -110,7 +110,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
  ctx->input[12] = 0;
  ctx->input[13] = 0;
  ctx->input[14] = (uint32_t)nonce;
-  ctx->input[15] = (uint32_t)(nonce >> 32);  
+  ctx->input[15] = (uint32_t)(nonce >> 32);
 }

 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
@ -184,7 +184,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
  arc4random_buf(buf, buf_len);
  return true;
 }
-#elif defined(__linux__) 
+#elif defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #include <sys/types.h>
@ -241,8 +241,8 @@ static bool os_random_buf(void* buf, size_t buf_len) {
 #include <time.h>
 #endif

-static uintptr_t os_random_weak(uintptr_t extra_seed) {
-  uintptr_t x = (uintptr_t)&os_random_weak ^ extra_seed; // ASLR makes the address random
+uintptr_t _os_random_weak(uintptr_t extra_seed) {
+  uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random
  #if defined(_WIN32)
    LARGE_INTEGER pcount;
    QueryPerformanceCounter(&pcount);
@ -267,10 +267,10 @@ static uintptr_t os_random_weak(uintptr_t extra_seed) {
 void _mi_random_init(mi_random_ctx_t* ctx) {
  uint8_t key[32];
  if (!os_random_buf(key, sizeof(key))) {
-    // if we fail to get random data from the OS, we fall back to a 
+    // if we fail to get random data from the OS, we fall back to a
    // weak random source based on the current time
    _mi_warning_message("unable to use secure randomness\n");
-    uintptr_t x = os_random_weak(0);
+    uintptr_t x = _os_random_weak(0);
    for (size_t i = 0; i < 8; i++) {  // key is eight 32-bit words.
      x = _mi_random_shuffle(x);
      ((uint32_t*)key)[i] = (uint32_t)x;
@ -280,7 +280,7 @@ void _mi_random_init(mi_random_ctx_t* ctx) {
 }

 /* --------------------------------------------------------
-test vectors from <https://tools.ietf.org/html/rfc8439> 
+test vectors from <https://tools.ietf.org/html/rfc8439>
 ----------------------------------------------------------- */
 /*
 static bool array_equals(uint32_t* x, uint32_t* y, size_t n) {
--- a/src/segment.c
+++ b/src/segment.c
@ -17,9 +17,9 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_

 /* --------------------------------------------------------------------------------
  Segment allocation
-  We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid 
-  splitting VMA's on Linux and reduce fragmentation on other OS's. 
-  Each thread owns its own segments. 
+  We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid
+  splitting VMA's on Linux and reduce fragmentation on other OS's.
+  Each thread owns its own segments.

  Currently we have:
  - small pages (64kb), 64 in one segment
@ -154,14 +154,14 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t*
  for (size_t i = 0; i < segment->capacity; i++) {
    const mi_page_t* const page = &segment->pages[i];
    if (!page->segment_in_use) {
-      nfree++;      
+      nfree++;
    }
    if (page->segment_in_use || page->is_reset) {
      mi_assert_expensive(!mi_pages_reset_contains(page, tld));
    }
  }
  mi_assert_internal(nfree + segment->used == segment->capacity);
-  mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0
+  // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0
  mi_assert_internal(segment->page_kind == MI_PAGE_HUGE ||
                     (mi_segment_page_size(segment) * segment->capacity == segment->segment_size));
  return true;
@ -287,7 +287,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen
  mi_assert_expensive(!mi_pages_reset_contains(page, tld));
  mi_assert_internal(_mi_page_segment(page)==segment);
  if (!mi_option_is_enabled(mi_option_page_reset)) return;
-  if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return;  
+  if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return;

  if (mi_option_get(mi_option_reset_delay) == 0) {
    // reset immediately?
@ -296,7 +296,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen
  else {
    // otherwise push on the delayed page reset queue
    mi_page_queue_t* pq = &tld->pages_reset;
-    // push on top 
+    // push on top
    mi_page_reset_set_expire(page);
    page->next = pq->first;
    page->prev = NULL;
@ -317,7 +317,7 @@ static void mi_pages_reset_remove(mi_page_t* page, mi_segments_tld_t* tld) {
  mi_page_queue_t* pq = &tld->pages_reset;
  mi_assert_internal(pq!=NULL);
  mi_assert_internal(!page->segment_in_use);
-  mi_assert_internal(mi_pages_reset_contains(page, tld));  
+  mi_assert_internal(mi_pages_reset_contains(page, tld));
  if (page->prev != NULL) page->prev->next = page->next;
  if (page->next != NULL) page->next->prev = page->prev;
  if (page == pq->last)  pq->last = page->prev;
@ -333,19 +333,19 @@ static void mi_pages_reset_remove_all_in_segment(mi_segment_t* segment, bool for
    if (!page->segment_in_use && page->is_committed && !page->is_reset) {
      mi_pages_reset_remove(page, tld);
      if (force_reset) {
-        mi_page_reset(segment, page, 0, tld); 
+        mi_page_reset(segment, page, 0, tld);
      }
    }
    else {
      mi_assert_internal(mi_page_not_in_queue(page,tld));
-    }    
+    }
  }
 }

 static void mi_reset_delayed(mi_segments_tld_t* tld) {
  if (!mi_option_is_enabled(mi_option_page_reset)) return;
  mi_msecs_t now = _mi_clock_now();
-  mi_page_queue_t* pq = &tld->pages_reset;  
+  mi_page_queue_t* pq = &tld->pages_reset;
  // from oldest up to the first that has not expired yet
  mi_page_t* page = pq->last;
  while (page != NULL && mi_page_reset_is_expired(page,now)) {
@ -359,7 +359,7 @@ static void mi_reset_delayed(mi_segments_tld_t* tld) {
  pq->last = page;
  if (page != NULL){
    page->next = NULL;
-  } 
+  }
  else {
    pq->first = NULL;
  }
@ -540,7 +540,7 @@ void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
  }
  mi_assert_internal(tld->cache_count == 0);
  mi_assert_internal(tld->cache == NULL);
-#if MI_DEBUG>=2 
+#if MI_DEBUG>=2
  if (!_mi_is_main_thread()) {
    mi_assert_internal(tld->pages_reset.first == NULL);
    mi_assert_internal(tld->pages_reset.last == NULL);
@ -684,7 +684,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,

 static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
  UNUSED(force);
-  mi_assert(segment != NULL);  
+  mi_assert(segment != NULL);
  // note: don't reset pages even on abandon as the whole segment is freed? (and ready for reuse)
  bool force_reset = (force && mi_option_is_enabled(mi_option_abandoned_page_reset));
  mi_pages_reset_remove_all_in_segment(segment, force_reset, tld);
@ -716,7 +716,7 @@ static bool mi_segment_has_free(const mi_segment_t* segment) {

 static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) {
  mi_assert_internal(_mi_page_segment(page) == segment);
-  mi_assert_internal(!page->segment_in_use);    
+  mi_assert_internal(!page->segment_in_use);
  // set in-use before doing unreset to prevent delayed reset
  mi_pages_reset_remove(page, tld);
  page->segment_in_use = true;
@ -756,7 +756,7 @@ static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg
 static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);

 // clear page data; can be called on abandoned segments
-static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld) 
+static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld)
 {
  mi_assert_internal(page->segment_in_use);
  mi_assert_internal(mi_page_all_free(page));
@ -791,7 +791,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a
  segment->used--;

  // add to the free page list for reuse/reset
-  if (allow_reset) {  
+  if (allow_reset) {
    mi_pages_reset_add(segment, page, tld);
  }

@ -848,12 +848,12 @@ Note: the current implementation is one possible design;
 another way might be to keep track of abandoned segments
 in the regions. This would have the advantage of keeping
 all concurrent code in one place and not needing to deal
-with ABA issues. The drawback is that it is unclear how to 
-scan abandoned segments efficiently in that case as they 
+with ABA issues. The drawback is that it is unclear how to
+scan abandoned segments efficiently in that case as they
 would be spread among all other segments in the regions.
 ----------------------------------------------------------- */

-// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers 
+// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
 // to put in a tag that increments on update to avoid the A-B-A problem.
 #define MI_TAGGED_MASK   MI_SEGMENT_MASK
 typedef uintptr_t        mi_tagged_segment_t;
@ -869,7 +869,7 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se
 }

 // This is a list of visited abandoned pages that were full at the time.
-// this list migrates to `abandoned` when that becomes NULL. The use of 
+// this list migrates to `abandoned` when that becomes NULL. The use of
 // this list reduces contention and the rate at which segments are visited.
 static mi_decl_cache_align volatile _Atomic(mi_segment_t*)       abandoned_visited; // = NULL

@ -895,7 +895,7 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) {
 }

 // Move the visited list to the abandoned list.
-static bool mi_abandoned_visited_revisit(void) 
+static bool mi_abandoned_visited_revisit(void)
 {
  // quick check if the visited list is empty
  if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false;
@ -961,12 +961,12 @@ static mi_segment_t* mi_abandoned_pop(void) {
  segment = mi_tagged_segment_ptr(ts);
  if (mi_likely(segment == NULL)) {
    if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
-      return NULL;  
+      return NULL;
    }
  }

  // Do a pop. We use a reader count to prevent
-  // a segment to be decommitted while a read is still pending, 
+  // a segment from being decommitted while a read is still pending,
  // and a tagged pointer to prevent A-B-A link corruption.
  // (this is called from `memory.c:_mi_mem_free` for example)
  mi_atomic_increment(&abandoned_readers);  // ensure no segment gets decommitted
@ -1031,7 +1031,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
 ----------------------------------------------------------- */

 // Possibly clear pages and check if free space is available
-static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) 
+static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free)
 {
  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
  bool has_page = false;
@ -1039,17 +1039,17 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool
  size_t pages_used_empty = 0;
  for (size_t i = 0; i < segment->capacity; i++) {
    mi_page_t* page = &segment->pages[i];
-    if (page->segment_in_use) {      
+    if (page->segment_in_use) {
      pages_used++;
      // ensure used count is up to date and collect potential concurrent frees
-      _mi_page_free_collect(page, false); 
+      _mi_page_free_collect(page, false);
      if (mi_page_all_free(page)) {
        // if everything free already, page can be reused for some block size
        // note: don't clear the page yet as we can only OS reset it once it is reclaimed
        pages_used_empty++;
        has_page = true;
      }
-      else if (page->xblock_size == block_size && mi_page_has_any_available(page)) {  
+      else if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
        // a page has available free blocks of the right size
        has_page = true;
      }
@ -1058,7 +1058,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool
      // whole empty page
      has_page = true;
    }
-  }  
+  }
  mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty);
  if (all_pages_free != NULL) {
    *all_pages_free = ((pages_used - pages_used_empty) == 0);
@ -1107,7 +1107,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
          if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
        }
      }
-    }   
+    }
    else if (page->is_committed && !page->is_reset) {  // not in-use, and not reset yet
      // note: do not reset as this includes pages that were not touched before
      // mi_pages_reset_add(segment, page, tld);
@ -1148,17 +1148,17 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size,
      // free the segment (by forced reclaim) to make it available to other threads.
      // note1: we prefer to free a segment as that might lead to reclaiming another
      // segment that is still partially used.
-      // note2: we could in principle optimize this by skipping reclaim and directly 
+      // note2: we could in principle optimize this by skipping reclaim and directly
      // freeing but that would violate some invariants temporarily)
      mi_segment_reclaim(segment, heap, 0, NULL, tld);
    }
    else if (has_page && segment->page_kind == page_kind) {
-      // found a free page of the right kind, or page of the right block_size with free space 
+      // found a free page of the right kind, or page of the right block_size with free space
      // we return the result of reclaim (which is usually `segment`) as it might free
      // the segment due to concurrent frees (in which case `NULL` is returned).
      return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
    }
-    else if (segment->abandoned_visits >= 3) {  
+    else if (segment->abandoned_visits >= 3) {
      // always reclaim on 3rd visit to limit the list length.
      mi_segment_reclaim(segment, heap, 0, NULL, tld);
    }
@ -1172,12 +1172,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size,


 /* -----------------------------------------------------------
-   Reclaim or allocate  
+   Reclaim or allocate
 ----------------------------------------------------------- */

-static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) 
+static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
 {
-  mi_assert_internal(page_kind <= MI_PAGE_LARGE);  
+  mi_assert_internal(page_kind <= MI_PAGE_LARGE);
  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
  // 1. try to get a segment from our cache
  mi_segment_t* segment = mi_segment_cache_pop(MI_SEGMENT_SIZE, tld);
@ -1198,7 +1198,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s
    return segment;
  }
  // 3. otherwise allocate a fresh segment
-  return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld);  
+  return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld);
 }


@ -1223,11 +1223,11 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t*
 // Allocate a page inside a segment. Requires that the page has free pages
 static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) {
  mi_assert_internal(mi_segment_has_free(segment));
-  return mi_segment_find_free(segment, tld);  
+  return mi_segment_find_free(segment, tld);
 }

 static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
-  // find an available segment the segment free queue 
+  // find an available segment in the segment free queue
  mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld);
  if (mi_segment_queue_is_empty(free_queue)) {
    // possibly allocate or reclaim a fresh segment
@ -1312,7 +1312,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block
 }

 /* -----------------------------------------------------------
-   Page allocation 
+   Page allocation
 ----------------------------------------------------------- */

 mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
--- a/test/test-stress.c
+++ b/test/test-stress.c
@ -188,7 +188,7 @@ static void test_stress(void) {
        free_items(p);
      }
    }
-    mi_collect(false);
+    // mi_collect(false);
 #ifndef NDEBUG
    if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
 #endif
@ -206,7 +206,7 @@ static void leak(intptr_t tid) {
  }
 }

-static void test_leak(void) {  
+static void test_leak(void) {
  for (int n = 0; n < ITER; n++) {
    run_os_threads(THREADS, &leak);
    mi_collect(false);
@ -242,14 +242,14 @@ int main(int argc, char** argv) {

  // Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
  srand(0x7feb352d);
-  mi_stats_reset();
+  // mi_stats_reset();
 #ifdef STRESS
    test_stress();
 #else
    test_leak();
-#endif  
+#endif

-  mi_collect(true);
+  // mi_collect(true);
  mi_stats_print(NULL);
  //bench_end_program();
  return 0;
@ -262,7 +262,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress;

 #include <windows.h>

-static DWORD WINAPI thread_entry(LPVOID param) {  
+static DWORD WINAPI thread_entry(LPVOID param) {
  thread_entry_fun((intptr_t)param);
  return 0;
 }