merge from dev-exp

daan 2019-11-21 17:03:30 -08:00
commit 1066be1594
30 changed files with 1167 additions and 356 deletions

View File

@ -6,18 +6,15 @@ set(CMAKE_CXX_STANDARD 17)
option(MI_OVERRIDE "Override the standard malloc interface" ON)
option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF)
option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode" OFF)
option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF)
option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF)
option(MI_SECURE_FULL "Use full security mitigations, may be more expensive (includes double-free mitigation)" OFF)
option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
option(MI_BUILD_TESTS "Build test executables" ON)
option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
include("cmake/mimalloc-config-version.cmake")
include("CheckIncludeFile")
set(mi_install_dir "lib/mimalloc-${mi_version}")
set(mi_sources
src/stats.c
@ -32,29 +29,33 @@ set(mi_sources
src/options.c
src/init.c)
# Set default build type
# -----------------------------------------------------------------------------
# Convenience: set default build type depending on the build directory
# -----------------------------------------------------------------------------
if (NOT CMAKE_BUILD_TYPE)
if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$")
message(STATUS "No build type selected, default to *** Debug ***")
if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL MATCHES "ON")
message(STATUS "No build type selected, default to: Debug")
set(CMAKE_BUILD_TYPE "Debug")
else()
message(STATUS "No build type selected, default to *** Release ***")
message(STATUS "No build type selected, default to: Release")
set(CMAKE_BUILD_TYPE "Release")
endif()
else()
message(STATUS "Build type specified as *** ${CMAKE_BUILD_TYPE} ***")
endif()
if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
message(STATUS "Default to secure build")
set(MI_SECURE "ON")
endif()
# -----------------------------------------------------------------------------
# Process options
# -----------------------------------------------------------------------------
if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
set(MI_USE_CXX "ON")
endif()
# Options
if(MI_OVERRIDE MATCHES "ON")
message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
if(APPLE)
@ -70,15 +71,9 @@ if(MI_OVERRIDE MATCHES "ON")
endif()
endif()
if(MI_SECURE_FULL MATCHES "ON")
message(STATUS "Set full secure build (may be more expensive) (MI_SECURE_FULL=ON)")
if(MI_SECURE MATCHES "ON")
message(STATUS "Set full secure build (MI_SECURE=ON)")
list(APPEND mi_defines MI_SECURE=4)
set(MI_SECURE "ON")
else()
if(MI_SECURE MATCHES "ON")
message(STATUS "Set secure build (MI_SECURE=ON)")
list(APPEND mi_defines MI_SECURE=3)
endif()
endif()
if(MI_SEE_ASM MATCHES "ON")
@ -87,7 +82,12 @@ if(MI_SEE_ASM MATCHES "ON")
endif()
if(MI_CHECK_FULL MATCHES "ON")
message(STATUS "Set debug level to full invariant checking (MI_CHECK_FULL=ON)")
message(STATUS "The MI_CHECK_FULL option is deprecated, use MI_DEBUG_FULL instead")
set(MI_DEBUG_FULL "ON")
endif()
if(MI_DEBUG_FULL MATCHES "ON")
message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)")
list(APPEND mi_defines MI_DEBUG=3) # full invariant checking
endif()
@ -97,16 +97,6 @@ if(MI_USE_CXX MATCHES "ON")
set_source_files_properties(src/static.c test/test-api.c PROPERTIES LANGUAGE CXX )
endif()
CHECK_INCLUDE_FILE("numaif.h" MI_HAVE_NUMA_H)
if(MI_HAVE_NUMA_H)
list(APPEND mi_defines MI_HAS_NUMA)
list(APPEND mi_libraries numa)
else()
if (NOT(WIN32))
message(WARNING "Compiling without using NUMA optimized allocation (on Linux, install libnuma-dev?)")
endif()
endif()
# Compiler flags
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas)
@ -122,19 +112,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
endif()
endif()
if(NOT(CMAKE_BUILD_TYPE MATCHES "Release|release|RelWithDebInfo|relwithdebinfo"))
string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type)
set(mi_basename "mimalloc-${build_type}")
else()
if(MI_SECURE MATCHES "ON")
set(mi_basename "mimalloc-secure")
else()
set(mi_basename "mimalloc")
endif()
endif()
message(STATUS "Output library name : ${mi_basename}")
message(STATUS "Installation directory: ${mi_install_dir}")
# extra needed libraries
if(WIN32)
list(APPEND mi_libraries psapi shell32 user32)
@ -147,9 +124,28 @@ else()
endif()
# -----------------------------------------------------------------------------
# Main targets
# Install and output names
# -----------------------------------------------------------------------------
set(mi_install_dir "${CMAKE_INSTALL_PREFIX}/lib/mimalloc-${mi_version}")
if(MI_SECURE MATCHES "ON")
set(mi_basename "mimalloc-secure")
else()
set(mi_basename "mimalloc")
endif()
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel)$"))
set(mi_basename "${mi_basename}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
endif()
message(STATUS "")
message(STATUS "Library base name: ${mi_basename}")
message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}")
message(STATUS "Install directory: ${mi_install_dir}")
message(STATUS "")
# -----------------------------------------------------------------------------
# Main targets
# -----------------------------------------------------------------------------
# shared library
add_library(mimalloc SHARED ${mi_sources})
@ -251,7 +247,7 @@ endif()
if (MI_OVERRIDE MATCHES "ON")
target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE)
if(NOT WIN32)
# It is only possible to override malloc on Windows when building as a DLL. (src/alloc-override.c)
# It is only possible to override malloc on Windows when building as a DLL.
target_compile_definitions(mimalloc-static PRIVATE MI_MALLOC_OVERRIDE)
target_compile_definitions(mimalloc-obj PRIVATE MI_MALLOC_OVERRIDE)
endif()
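
For reference, a minimal out-of-source configuration using the renamed options might look as follows (directory names and flags are illustrative; `MI_CHECK_FULL` still works but now only forwards to `MI_DEBUG_FULL`, and a build directory ending in `secure` enables `MI_SECURE` by default):
```
> mkdir -p out/debug
> cd out/debug
> cmake -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON ../..
> cmake --build .

> mkdir -p out/secure
> cd out/secure
> cmake -DMI_SECURE=ON ../..
> cmake --build .
```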

View File

@ -35,22 +35,32 @@ jobs:
CC: gcc
CXX: g++
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_CHECK_FULL=ON
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release:
CC: gcc
CXX: g++
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
Secure:
CC: gcc
CXX: g++
BuildType: secure
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
Debug Clang:
CC: clang
CXX: clang++
BuildType: debug-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_CHECK_FULL=ON
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release Clang:
CC: clang
CXX: clang++
BuildType: release-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
Secure Clang:
CC: clang
CXX: clang++
BuildType: secure-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
steps:
- task: CMake@1

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,5 +1,5 @@
set(mi_version_major 1)
set(mi_version_minor 1)
set(mi_version_minor 2)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})

View File

@ -0,0 +1,75 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\src\options.c">
<Filter>Header Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-aligned.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-override.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\heap.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\init.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\os.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\page.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\page-queue.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.inc.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-atomic.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-internal.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-override.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-types.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Header Files">
<UniqueIdentifier>{f1fccf27-17b9-42dd-ba51-6070baff85c6}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files">
<UniqueIdentifier>{39cb7e38-69d0-43fb-8406-6a0f7cefc3b4}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View File

@ -149,8 +149,8 @@
</ClCompile>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="mimalloc-override.vcxproj">
<Project>{abb5eae7-b3e6-432e-b636-333449892ea7}</Project>
<ProjectReference Include="mimalloc.vcxproj">
<Project>{abb5eae7-b3e6-432e-b636-333449892ea6}</Project>
</ProjectReference>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View File

@ -0,0 +1,78 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\src\alloc.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-aligned.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-override.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-override-osx.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\heap.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\init.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\options.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\os.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\page.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\page-queue.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.inc.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-atomic.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-internal.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-override.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-types.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Header Files">
<UniqueIdentifier>{2b556b10-f559-4b2d-896e-142652adbf0c}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files">
<UniqueIdentifier>{852a14ae-6dde-4e95-8077-ca705e97e5af}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View File

@ -241,7 +241,7 @@ static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x)
#endif
#elif defined(__wasi__)
#include <sched.h>
static inline void mi_atomic_yield() {
static inline void mi_atomic_yield(void) {
sched_yield();
}
#else

View File

@ -22,13 +22,13 @@ terms of the MIT license. A copy of the license can be found in the file
#if defined(_MSC_VER)
#define mi_decl_noinline __declspec(noinline)
#define mi_attr_noreturn
#define mi_attr_noreturn
#elif defined(__GNUC__) || defined(__clang__)
#define mi_decl_noinline __attribute__((noinline))
#define mi_attr_noreturn __attribute__((noreturn))
#else
#define mi_decl_noinline
#define mi_attr_noreturn
#define mi_attr_noreturn
#endif
@ -55,8 +55,6 @@ size_t _mi_os_page_size(void);
void _mi_os_init(void); // called from process init
void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data
void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data
int _mi_os_numa_node(mi_os_tld_t* tld);
int _mi_os_numa_node_count(void);
bool _mi_os_protect(void* addr, size_t size);
bool _mi_os_unprotect(void* addr, size_t size);
@ -77,6 +75,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t*
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
void _mi_segment_thread_collect(mi_segments_tld_t* tld);
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
// "page.c"
@ -103,11 +102,11 @@ uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD i
void _mi_heap_destroy_pages(mi_heap_t* heap);
void _mi_heap_collect_abandon(mi_heap_t* heap);
uintptr_t _mi_heap_random(mi_heap_t* heap);
void _mi_heap_set_default_direct(mi_heap_t* heap);
// "stats.c"
void _mi_stats_done(mi_stats_t* stats);
typedef int64_t mi_msecs_t;
mi_msecs_t _mi_clock_now(void);
mi_msecs_t _mi_clock_end(mi_msecs_t start);
mi_msecs_t _mi_clock_start(void);
@ -409,56 +408,86 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
// -------------------------------------------------------------------
// Encoding/Decoding the free list next pointers
// Note: we pass a `null` value to be used as the `NULL` value for the
// end of a free list. This is to prevent the cookie itself from ever
// being present among user blocks (as `cookie^0==cookie`).
// -------------------------------------------------------------------
static inline bool mi_is_in_same_segment(const void* p, const void* q) {
return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
}
static inline mi_block_t* mi_block_nextx( uintptr_t cookie, const mi_block_t* block ) {
static inline bool mi_is_in_same_page(const void* p, const void* q) {
mi_segment_t* segment = _mi_ptr_segment(p);
if (_mi_ptr_segment(q) != segment) return false;
return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q));
}
static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t cookie ) {
#ifdef MI_ENCODE_FREELIST
return (mi_block_t*)(block->next ^ cookie);
mi_block_t* b = (mi_block_t*)(block->next ^ cookie);
if (mi_unlikely((void*)b==null)) { b = NULL; }
return b;
#else
UNUSED(cookie);
UNUSED(cookie); UNUSED(null);
return (mi_block_t*)block->next;
#endif
}
static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, const mi_block_t* next) {
static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t cookie) {
#ifdef MI_ENCODE_FREELIST
if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; }
block->next = (mi_encoded_t)next ^ cookie;
#else
UNUSED(cookie);
UNUSED(cookie); UNUSED(null);
block->next = (mi_encoded_t)next;
#endif
}
static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
#ifdef MI_ENCODE_FREELIST
mi_block_t* next = mi_block_nextx(page->cookie,block);
mi_block_t* next = mi_block_nextx(page,block,page->cookie);
// check for free list corruption: is `next` at least in our segment range?
// TODO: it is better to check if it is actually inside our page but that is more expensive
// to calculate. Perhaps with a relative free list this becomes feasible?
if (next!=NULL && !mi_is_in_same_segment(block, next)) {
// TODO: check if `next` is `page->block_size` aligned?
if (next!=NULL && !mi_is_in_same_page(block, next)) {
_mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next);
next = NULL;
}
}
return next;
#else
UNUSED(page);
return mi_block_nextx(0, block);
return mi_block_nextx(page,block,0);
#endif
}
static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
#ifdef MI_ENCODE_FREELIST
mi_block_set_nextx(page->cookie,block,next);
mi_block_set_nextx(page,block,next, page->cookie);
#else
UNUSED(page);
mi_block_set_nextx(0, block, next);
mi_block_set_nextx(page,block, next,0);
#endif
}
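
To make the encoding above easier to follow in isolation, here is a small standalone sketch (not mimalloc code; the struct, cookie value, and sentinel are invented for the example). It shows how a per-page cookie XOR-encodes the `next` field and how a non-zero `null` sentinel (mimalloc passes the page pointer) keeps the raw cookie from ever appearing in a freed block:
```
#include <stdint.h>
#include <stdio.h>

typedef struct block_s { uintptr_t next; } block_t;

static block_t* block_nextx(const void* null, const block_t* block, uintptr_t cookie) {
  block_t* b = (block_t*)(block->next ^ cookie);       // decode
  return ((const void*)b == null ? NULL : b);          // map sentinel back to NULL
}

static void block_set_nextx(const void* null, block_t* block, const block_t* next, uintptr_t cookie) {
  if (next == NULL) next = (const block_t*)null;       // encode NULL as the sentinel
  block->next = (uintptr_t)next ^ cookie;
}

int main(void) {
  uintptr_t cookie = 0xA5A5A5A5u;   // made-up per-page cookie
  block_t sentinel;                 // stands in for the page address mimalloc passes as `null`
  block_t a, b;
  block_set_nextx(&sentinel, &a, &b, cookie);    // a -> b
  block_set_nextx(&sentinel, &b, NULL, cookie);  // b -> end of list
  printf("a->next decodes to b   : %d\n", block_nextx(&sentinel, &a, cookie) == &b);
  printf("b->next decodes to NULL: %d\n", block_nextx(&sentinel, &b, cookie) == NULL);
  printf("stored end marker != cookie: %d\n", b.next != cookie);  // with a plain 0, the stored value would equal the cookie
  return 0;
}
```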
// -------------------------------------------------------------------
// Optimize numa node access for the common case (= one node)
// -------------------------------------------------------------------
int _mi_os_numa_node_get(mi_os_tld_t* tld);
size_t _mi_os_numa_node_count_get(void);
extern size_t _mi_numa_node_count;
static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
if (mi_likely(_mi_numa_node_count == 1)) return 0;
else return _mi_os_numa_node_get(tld);
}
static inline size_t _mi_os_numa_node_count(void) {
if (mi_likely(_mi_numa_node_count>0)) return _mi_numa_node_count;
else return _mi_os_numa_node_count_get();
}
// -------------------------------------------------------------------
// Getting the thread id should be performant
// as it is called in the fast path of `_mi_free`,

View File

@ -26,7 +26,7 @@ terms of the MIT license. A copy of the license can be found in the file
// #define MI_SECURE 1 // guard page around metadata
// #define MI_SECURE 2 // guard page around each mimalloc page
// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. (cmake -DMI_SECURE_FULL=ON)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
#if !defined(MI_SECURE)
#define MI_SECURE 0
@ -35,7 +35,7 @@ terms of the MIT license. A copy of the license can be found in the file
// Define MI_DEBUG for debug mode
// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
// #define MI_DEBUG 2 // + internal assertion checks
// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_CHECK_FULL=ON)
// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
#if !defined(MI_DEBUG)
#if !defined(NDEBUG) || defined(_DEBUG)
#define MI_DEBUG 2
@ -401,7 +401,6 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
// ------------------------------------------------------
// Thread Local data
// ------------------------------------------------------
@ -416,6 +415,16 @@ typedef struct mi_span_queue_s {
#define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
typedef int64_t mi_msecs_t;
// OS thread local data
typedef struct mi_os_tld_s {
size_t region_idx; // start point for next allocation
mi_stats_t* stats; // points to tld stats
} mi_os_tld_t;
// Segments thread local data
typedef struct mi_segments_tld_s {
mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments
@ -427,14 +436,9 @@ typedef struct mi_segments_tld_s {
size_t cache_size; // total size of all segments in the cache
mi_segment_t* cache; // (small) cache of segments
mi_stats_t* stats; // points to tld stats
mi_os_tld_t* os; // points to os stats
} mi_segments_tld_t;
// OS thread local data
typedef struct mi_os_tld_s {
size_t region_idx; // start point for next allocation
mi_stats_t* stats; // points to tld stats
} mi_os_tld_t;
// Thread local data
struct mi_tld_s {
unsigned long long heartbeat; // monotonic heartbeat count

View File

@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 110 // major + 2 digits minor
#define MI_MALLOC_VERSION 120 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@ -230,7 +230,7 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b
mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
mi_decl_export bool mi_is_redirected() mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
// deprecated
@ -270,13 +270,13 @@ typedef enum mi_option_e {
mi_option_reserve_huge_os_pages,
mi_option_segment_cache,
mi_option_page_reset,
mi_option_cache_reset,
mi_option_segment_reset,
mi_option_reset_decommits,
mi_option_eager_commit_delay,
mi_option_allow_decommit,
mi_option_segment_reset,
mi_option_reset_delay,
mi_option_use_numa_nodes,
mi_option_os_tag,
mi_option_max_numa_node,
mi_option_max_errors,
_mi_option_last
} mi_option_t;
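
As a usage sketch of the changed `mi_reserve_huge_os_pages_interleave` signature (the page count here is arbitrary): passing `0` for `numa_nodes` falls back to the detected node count, and the timeout mirrors the `pages*500` milliseconds used internally when `MIMALLOC_RESERVE_HUGE_OS_PAGES` is set:
```
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  size_t pages = 4;  // arbitrary: reserve 4 x 1GiB huge OS pages
  // 0 = spread over all detected NUMA nodes; timeout of 500ms per page
  int err = mi_reserve_huge_os_pages_interleave(pages, 0, pages * 500);
  if (err != 0) fprintf(stderr, "huge page reservation failed: %d\n", err);
  printf("mimalloc version: %d\n", mi_version());
  return 0;
}
```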

View File

@ -1,7 +1,7 @@
<img align="left" width="100" height="100" src="doc/mimalloc-logo.png"/>
[<img align="right" src="https://dev.azure.com/Daan0324/mimalloc/_apis/build/status/microsoft.mimalloc?branchName=master"/>](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
[<img align="right" src="https://dev.azure.com/Daan0324/mimalloc/_apis/build/status/microsoft.mimalloc?branchName=dev"/>](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
# mimalloc
@ -37,7 +37,7 @@ Notable aspects of the design include:
programs.
- __secure__: _mimalloc_ can be built in secure mode, adding guard pages,
randomized allocation, encrypted free lists, etc. to protect against various
heap vulnerabilities. The performance penalty is only around 3% on average
heap vulnerabilities. The performance penalty is usually around 10% on average
over our benchmarks.
- __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions.
A heap can be destroyed at once instead of deallocating each object separately.
@ -56,6 +56,7 @@ Enjoy!
### Releases
* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
* 2019-10-07, `v1.1.0`: stable release 1.1.
* 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support.
* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements.
@ -64,7 +65,7 @@ Enjoy!
## Windows
Open `ide/vs2017/mimalloc.sln` in Visual Studio 2017 and build.
Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build (or `ide/vs2017/mimalloc.sln`).
The `mimalloc` project builds a static library (in `out/msvc-x64`), while the
`mimalloc-override` project builds a DLL for overriding malloc
in the entire program.
@ -97,7 +98,7 @@ maintains detailed statistics as:
This will name the shared library as `libmimalloc-debug.so`.
Finally, you can build a _secure_ version that uses guard pages, encrypted
free lists, etc, as:
free lists, etc., as:
```
> mkdir -p out/secure
> cd out/secure
@ -138,6 +139,9 @@ target_link_libraries(myapp PUBLIC mimalloc-static)
```
to link with the static library. See `test\CMakeLists.txt` for an example.
For best performance in C++ programs, it is also recommended to override the
global `new` and `delete` operators. For convenience, mimalloc provides
[mimalloc-new-delete.h](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
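
For instance, a minimal C++ program using it might look like this (the file name is illustrative; link against the mimalloc library as shown above):
```
// main.cpp -- include the header in exactly one source file of the program
#include <mimalloc-new-delete.h>   // defines the global new/delete operators in terms of mimalloc
#include <mimalloc.h>
#include <iostream>

int main() {
  int* p = new int(42);            // now served by mimalloc
  std::cout << "mimalloc version: " << mi_version() << "\n";
  delete p;
  return 0;
}
```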
You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`)
and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version):
@ -188,18 +192,18 @@ or via environment variables.
- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
- `MIMALLOC_VERBOSE=1`: show verbose messages.
- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages when available; for some workloads this can significantly
improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
can have fragmented memory.
- `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
show in the working set even though usually just a small part is committed to physical memory. This is why it is
turned off by default on Windows, as it does not look good in the task manager. However, in reality it is always better
to turn it on as it improves performance and has no other drawbacks.
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB huge OS pages. This reserves the huge pages at
startup and can give quite a performance improvement on long running workloads. Usually it is better to not use
`MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
contiguous physical memory can take a long time when memory is fragmented. Still experimental.
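
For example, these variables can simply be set on the command line when launching a program (`./myprogram` is a placeholder):
```
> MIMALLOC_SHOW_STATS=1 MIMALLOC_VERBOSE=1 MIMALLOC_RESERVE_HUGE_OS_PAGES=4 ./myprogram
```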
[linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5
@ -211,7 +215,7 @@ Overriding the standard `malloc` can be done either _dynamically_ or _statically
## Dynamic override
This is the recommended way to override the standard malloc interface.
### Linux, BSD
@ -244,29 +248,31 @@ resolved to the _mimalloc_ library.
Note that certain security restrictions may apply when doing this from
the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
Note: unfortunately, at this time, dynamic overriding on macOS seems broken but is actively being worked on
(see issue [`#50`](https://github.com/microsoft/mimalloc/issues/50)).
### Windows
On Windows you need to link your program explicitly with the mimalloc
DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
Moreover, you need to ensure the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) is available
in the same folder as the mimalloc DLL at runtime (as it is referred to by the mimalloc DLL).
The redirection DLLs ensure all calls to the C runtime malloc API get redirected to mimalloc.
DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
Moreover, you need to ensure the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) is available
in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency).
The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
mimalloc (in `mimalloc-override.dll`).
To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some
call to the mimalloc API in the `main` function, like `mi_version()`
(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
for an example on how to use this.
for an example on how to use this. For best performance on Windows with C++, it
is highly recommended to also override the `new`/`delete` operations (as described
in the introduction).
The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc successfully redirected.
overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
(Note: in principle, it should be possible to patch existing executables
that are linked with the dynamic C runtime (`ucrtbase.dll`) by just putting the mimalloc DLL into
the import table (and putting `mimalloc-redirect.dll` in the same folder)
Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
(Note: in principle, it is possible to patch existing executables
that are linked with the dynamic C runtime (`ucrtbase.dll`) by just putting the `mimalloc-override.dll` into the import table (and putting `mimalloc-redirect.dll` in the same folder)
Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
## Static override
@ -282,6 +288,12 @@ object file. For example:
> gcc -o myprogram mimalloc-override.o myfile1.c ...
```
Another way to override statically, which works on all platforms, is to
link statically to mimalloc (as shown in the introduction) and include a
header file in each source file that re-defines `malloc` etc. to `mi_malloc`.
This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). Note that this only works reliably if all sources are
under your control; otherwise pointers from different heaps may get mixed.
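
A minimal sketch of this approach (file contents are illustrative) is to include the header at the top of each source file and link against mimalloc as before:
```
// myfile.c -- hypothetical example source file
#include <mimalloc-override.h>  // re-defines malloc, free, etc. to the mi_ equivalents
#include <stdio.h>

int main(void) {
  void* p = malloc(64);   // routed to mi_malloc by the override header
  printf("allocated at %p\n", p);
  free(p);                // routed to mi_free
  return 0;
}
```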
# Performance

View File

@ -157,7 +157,7 @@ static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, con
}
static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
mi_block_t* n = mi_block_nextx(page->cookie, block); // pretend it is freed, and get the decoded first field
mi_block_t* n = mi_block_nextx(page, block, page->cookie); // pretend it is freed, and get the decoded first field
if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer?
(n==NULL || mi_is_in_same_segment(block, n))) // quick check: in same segment or NULL?
{
@ -230,14 +230,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
}
else {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* heap = page->heap;
mi_heap_t* heap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
mi_block_set_nextx(heap->cookie,block,dfree);
mi_block_set_nextx(heap,block,dfree, heap->cookie);
} while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
}

View File

@ -33,6 +33,7 @@ of 256MiB in practice.
#include "bitmap.inc.c" // atomic bitmap
// os.c
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
void _mi_os_free(void* p, size_t size, mi_stats_t* stats);
@ -40,7 +41,7 @@ void _mi_os_free(void* p, size_t size, mi_stats_t* stats);
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
int _mi_os_numa_node_count(void);
bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
/* -----------------------------------------------------------
@ -61,13 +62,15 @@ typedef uintptr_t mi_block_info_t;
typedef struct mi_arena_s {
uint8_t* start; // the start of the memory area
size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
size_t field_count; // number of bitmap fields
size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
int numa_node; // associated NUMA node
bool is_zero_init; // is the arena zero initialized?
bool is_committed; // is the memory committed
bool is_large; // large OS page allocated
volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks
mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed?
mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
} mi_arena_t;
@ -109,7 +112,7 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t*
size_t idx = mi_atomic_read(&arena->search_idx); // start from last search
for (size_t visited = 0; visited < fcount; visited++, idx++) {
if (idx >= fcount) idx = 0; // wrap around
if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) {
if (mi_bitmap_try_find_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) {
mi_atomic_write(&arena->search_idx, idx); // start search from here next time
return true;
}
@ -121,8 +124,8 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t*
/* -----------------------------------------------------------
Arena cache
----------------------------------------------------------- */
#define MI_CACHE_MAX (8)
#define MI_MAX_NUMA (64)
#define MI_CACHE_MAX (64)
#define MI_MAX_NUMA (16)
#define MI_SLOT_IN_USE ((void*)1)
@ -215,25 +218,42 @@ static bool mi_cache_push(void* start, size_t size, size_t memid, bool is_commit
----------------------------------------------------------- */
static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
bool* commit, bool* large, bool* is_zero, size_t* memid)
bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
mi_bitmap_index_t bitmap_index;
if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) {
// claimed it! set the dirty bits (todo: no need for an atomic op here?)
*is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
*memid = mi_memid_create(arena_index, bitmap_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE));
if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
// claimed it! set the dirty bits (todo: no need for an atomic op here?)
void* p = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
*memid = mi_memid_create(arena_index, bitmap_index);
*is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
*large = arena->is_large;
if (arena->is_committed) {
// always committed
*commit = true;
}
return NULL;
else if (commit) {
// ensure commit now
bool any_uncommitted;
mi_bitmap_claim(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
bool commit_zero;
_mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
if (commit_zero) *is_zero = true;
}
}
else {
// no need to commit, but check if already fully committed
*commit = mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
}
return p;
}
void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
bool* commit, bool* large, bool* is_zero,
size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(memid != NULL && tld != NULL);
mi_assert_internal(commit != NULL && large != NULL && is_zero != NULL && memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = MI_MEMID_OS;
*is_zero = false;
@ -258,7 +278,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local?
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
{
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid);
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld);
mi_assert_internal((uintptr_t)p % alignment == 0);
if (p != NULL) return p;
}
@ -270,7 +290,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local!
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
{
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid);
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld);
mi_assert_internal((uintptr_t)p % alignment == 0);
if (p != NULL) return p;
}
@ -285,9 +305,6 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
// finally, fall back to the OS
*is_zero = true;
*memid = MI_MEMID_OS;
if (*large) {
*large = mi_option_is_enabled(mi_option_large_os_pages); // try large OS pages only if enabled and allowed
}
return _mi_os_alloc_aligned(size, alignment, *commit, large, tld);
}
@ -329,7 +346,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, bool
return;
}
const size_t blocks = mi_block_count_of_size(size);
bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx);
bool ones = mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
if (!ones) {
_mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size);
return;
@ -389,15 +406,17 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
arena->is_large = true;
arena->is_zero_init = true;
arena->is_committed = true;
arena->search_idx = 0;
arena->blocks_dirty = &arena->blocks_map[bcount];
arena->blocks_dirty = &arena->blocks_inuse[bcount];
arena->blocks_committed = NULL;
// the bitmaps are already zero initialized due to os_alloc
// just claim leftover blocks if needed
size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
if (post > 0) {
// don't use leftover bits at the end
mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
mi_bitmap_claim(arena->blocks_map, fields, post, postidx, NULL);
mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
}
mi_arena_add(arena);
@ -405,22 +424,22 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
}
// reserve huge pages evenly among all numa nodes.
int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept {
// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
if (pages == 0) return 0;
// pages per numa node
int numa_count = _mi_os_numa_node_count();
size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
if (numa_count <= 0) numa_count = 1;
const size_t pages_per = pages / numa_count;
const size_t pages_mod = pages % numa_count;
const size_t timeout_per = (timeout_msecs / numa_count) + 50;
// reserve evenly among numa nodes
for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
size_t node_pages = pages_per; // can be 0
if ((size_t)numa_node < pages_mod) node_pages++;
int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per);
if (numa_node < pages_mod) node_pages++;
int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
if (err) return err;
if (pages < node_pages) {
pages = 0;
@ -437,7 +456,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
UNUSED(max_secs);
_mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
if (pages_reserved != NULL) *pages_reserved = 0;
int err = mi_reserve_huge_os_pages_interleave(pages, (size_t)(max_secs * 1000.0));
int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
return err;
}

View File

@ -8,11 +8,11 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
This file is meant to be included in other files for efficiency.
It implements a bitmap that can set/reset sequences of bits atomically
and is used to concurrently claim memory ranges.
A bitmap is an array of fields where each field is a machine word (`uintptr_t`)
A current limitation is that the bit sequences cannot cross fields
and that the sequence must be smaller or equal to the bits in a field.
---------------------------------------------------------------------------- */
#pragma once
@ -59,7 +59,7 @@ static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
// The bit mask for a given number of blocks at a specified bit index.
static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
if (count == MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
return ((((uintptr_t)1 << count) - 1) << bitidx);
@ -104,10 +104,30 @@ static inline size_t mi_bsr(uintptr_t x) {
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits in a single
// Try to atomically claim a sequence of `count` bits at `idx`
// in the bitmap field. Returns `true` on success.
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_fields, const size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS);
mi_bitmap_field_t field = mi_atomic_read_relaxed(&bitmap[idx]);
if ((field & mask) == 0) { // free?
if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) {
// claimed!
return true;
}
}
return false;
}
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
volatile _Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_read(field);
@ -136,7 +156,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con
continue;
}
else {
// success, we claimed the bits!
// success, we claimed the bits!
*bitmap_idx = mi_bitmap_index_create(idx, bitidx);
return true;
}
@ -160,9 +180,9 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con
// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) {
static inline bool mi_bitmap_try_find_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) {
for (size_t idx = 0; idx < bitmap_fields; idx++) {
if (mi_bitmap_try_claim_field(bitmap, idx, count, bitmap_idx)) {
if (mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
return true;
}
}
@ -170,39 +190,51 @@ static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields,
}
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously
// Returns `true` if all `count` bits were 1 previously.
static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
mi_assert_internal((bitmap[idx] & mask) == mask);
// mi_assert_internal((bitmap[idx] & mask) == mask);
uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask);
return ((prev & mask) == mask);
}
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == 0);
//mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
uintptr_t prev = mi_atomic_or(&bitmap[idx], mask);
if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
return ((prev & mask) == 0);
}
// Returns `true` if all `count` bits were 1
static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == 0);
return ((mi_atomic_read(&bitmap[idx]) & mask) == mask);
mi_bitmap_field_t field = mi_atomic_read_relaxed(&bitmap[idx]);
if (any_ones != NULL) *any_ones = ((field & mask) != 0);
return ((field & mask) == mask);
}
#endif
static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
}
static inline bool mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
bool any_ones;
mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
return any_ones;
}
#endif

View File

@ -223,7 +223,7 @@ static void mi_heap_free(mi_heap_t* heap) {
// reset default
if (mi_heap_is_default(heap)) {
_mi_heap_default = heap->tld->heap_backing;
_mi_heap_set_default_direct(heap->tld->heap_backing);
}
// and free the used memory
mi_free(heap);
@ -354,8 +354,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
if (!mi_heap_is_initialized(heap)) return NULL;
mi_assert_expensive(mi_heap_is_valid(heap));
mi_heap_t* old = _mi_heap_default;
_mi_heap_default = heap;
mi_heap_t* old = mi_get_default_heap();
_mi_heap_set_default_direct(heap);
return old;
}

View File

@ -19,7 +19,7 @@ const mi_page_t _mi_page_empty = {
0,
#endif
0, // used
NULL,
NULL,
ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0),
0, NULL, NULL, NULL
#ifndef MI_ENCODE_FREELIST
@ -103,28 +103,31 @@ const mi_heap_t _mi_heap_empty = {
};
#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
#define tld_empty_os ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
static const mi_tld_t tld_empty = {
0,
false,
NULL,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_empty_stats }, // segments
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_empty_stats, tld_empty_os }, // segments
{ 0, tld_empty_stats }, // os
{ MI_STATS_NULL } // stats
};
// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
#define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
#define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os)))
static mi_tld_t tld_main = {
0, false,
&_mi_heap_main,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
{ 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments
{ 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
};
mi_heap_t _mi_heap_main = {
@ -214,7 +217,7 @@ uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) {
typedef struct mi_thread_data_s {
mi_heap_t heap; // must come first due to cast in `_mi_heap_done`
mi_tld_t tld;
mi_tld_t tld;
} mi_thread_data_t;
// Initialize the thread local default heap, called from `mi_thread_init`
@ -222,8 +225,8 @@ static bool _mi_heap_init(void) {
if (mi_heap_is_initialized(_mi_heap_default)) return true;
if (_mi_is_main_thread()) {
// the main heap is statically allocated
_mi_heap_default = &_mi_heap_main;
mi_assert_internal(_mi_heap_default->tld->heap_backing == _mi_heap_default);
_mi_heap_set_default_direct(&_mi_heap_main);
mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
@ -242,26 +245,26 @@ static bool _mi_heap_init(void) {
heap->tld = tld;
tld->heap_backing = heap;
tld->segments.stats = &tld->stats;
tld->segments.os = &tld->os;
tld->os.stats = &tld->stats;
_mi_heap_default = heap;
_mi_heap_set_default_direct(heap);
}
return false;
}
// Free the thread local default heap (called from `mi_thread_done`)
static bool _mi_heap_done(void) {
mi_heap_t* heap = _mi_heap_default;
static bool _mi_heap_done(mi_heap_t* heap) {
if (!mi_heap_is_initialized(heap)) return true;
// reset default heap
_mi_heap_default = (_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
_mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
// todo: delete all non-backing heaps?
// switch to backing heap and free it
heap = heap->tld->heap_backing;
if (!mi_heap_is_initialized(heap)) return false;
// collect if not the main thread
if (heap != &_mi_heap_main) {
_mi_heap_collect_abandon(heap);
@ -301,6 +304,8 @@ static bool _mi_heap_done(void) {
// to set up the thread local keys.
// --------------------------------------------------------
static void _mi_thread_done(mi_heap_t* default_heap);
#ifdef __wasi__
// no pthreads in the WebAssembly Standard Interface
#elif !defined(_WIN32)
@ -315,14 +320,14 @@ static bool _mi_heap_done(void) {
#include <fibersapi.h>
static DWORD mi_fls_key;
static void NTAPI mi_fls_done(PVOID value) {
if (value!=NULL) mi_thread_done();
if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(MI_USE_PTHREADS)
// use pthread local storage keys to detect thread ending
#include <pthread.h>
static pthread_key_t mi_pthread_key;
static void mi_pthread_done(void* value) {
if (value!=NULL) mi_thread_done();
if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(__wasi__)
// no pthreads in the WebAssembly Standard Interface
@ -356,6 +361,8 @@ void mi_thread_init(void) mi_attr_noexcept
mi_process_init();
// initialize the thread local default heap
// (this will call `_mi_heap_set_default_direct` and thus set the
// fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_heap_init()) return; // returns true if already initialized
// don't further initialize for the main thread
@ -363,33 +370,38 @@ void mi_thread_init(void) mi_attr_noexcept
_mi_stat_increase(&mi_get_default_heap()->tld->stats.threads, 1);
// set hooks so our mi_thread_done() will be called
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
FlsSetValue(mi_fls_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_fls_done` is called
#elif defined(MI_USE_PTHREADS)
pthread_setspecific(mi_pthread_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_pthread_done` is called
#endif
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
void mi_thread_done(void) mi_attr_noexcept {
_mi_thread_done(mi_get_default_heap());
}
static void _mi_thread_done(mi_heap_t* heap) {
// stats
mi_heap_t* heap = mi_get_default_heap();
if (!_mi_is_main_thread() && mi_heap_is_initialized(heap)) {
_mi_stat_decrease(&heap->tld->stats.threads, 1);
}
// abandon the thread local heap
if (_mi_heap_done()) return; // returns true if already ran
//if (!_mi_is_main_thread()) {
// _mi_verbose_message("thread done: 0x%zx\n", _mi_thread_id());
//}
if (_mi_heap_done(heap)) return; // returns true if already ran
}
void _mi_heap_set_default_direct(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
_mi_heap_default = heap;
// ensure the default heap is passed to `_mi_thread_done`
// setting to a non-NULL value also ensures `mi_thread_done` is called.
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
FlsSetValue(mi_fls_key, heap);
#elif defined(MI_USE_PTHREADS)
pthread_setspecific(mi_pthread_key, heap);
#endif
}
// --------------------------------------------------------
// Run functions on process init/done, and thread init/done
@ -409,7 +421,7 @@ bool mi_is_redirected() mi_attr_noexcept {
}
// Communicate with the redirection module on Windows
#if defined(_WIN32) && defined(MI_SHARED_LIB)
#if defined(_WIN32) && defined(MI_SHARED_LIB)
#ifdef __cplusplus
extern "C" {
#endif
@ -455,11 +467,6 @@ static void mi_process_load(void) {
if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
_mi_fputs(NULL,NULL,msg);
}
if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
mi_reserve_huge_os_pages_interleave(pages, pages*500);
}
}
// Initialize the process; called by thread_init or the process loader
@ -469,7 +476,7 @@ void mi_process_init(void) mi_attr_noexcept {
// access _mi_heap_default before setting _mi_process_is_initialized to ensure
// that the TLS slot is allocated without getting into recursion on macOS
// when using dynamic linking with interpose.
mi_heap_t* h = _mi_heap_default;
mi_heap_t* h = mi_get_default_heap();
_mi_process_is_initialized = true;
_mi_heap_main.thread_id = _mi_thread_id();
@ -484,8 +491,14 @@ void mi_process_init(void) mi_attr_noexcept {
#if (MI_DEBUG)
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
_mi_verbose_message("secure level: %d\n", MI_SECURE);
mi_thread_init();
mi_stats_reset(); // only call stat reset *after* thread init (otherwise the heap tld == NULL)
if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
}
}
// Called when the process is done (through `at_exit`)
@ -512,7 +525,7 @@ static void mi_process_done(void) {
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// Windows DLL: easy to hook into process_init and thread_done
// Windows DLL: easy to hook into process_init and thread_done
__declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
UNUSED(reserved);
UNUSED(inst);

src/memory.c (new file)

@ -0,0 +1,485 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
and the segment and huge object allocation by mimalloc. There may be multiple
implementations of this (one could be the identity going directly to the OS,
another could be a simple cache etc), but the current one uses large "regions".
In contrast to the rest of mimalloc, the "regions" are shared between threads and
need to be accessed using atomic operations.
We need this memory layer between the raw OS calls because of:
1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
to reuse memory effectively.
2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
an OS allocation/free is still (much) too expensive relative to the accesses
in that object :-( (`malloc-large` tests this). This means we need a cheaper
way to reuse memory.
3. This layer allows for NUMA aware allocation.
Possible issues:
- (2) can potentially be addressed too with a small cache per thread which is much
simpler. Generally though that requires shrinking of huge pages, and may overuse
memory per thread. (and is not compatible with `sbrk`).
- Since the current regions are per-process, we need atomic operations to
claim blocks which may be contended
- In the worst case, we need to search the whole region map (16KiB for 256GiB)
linearly. At what point will direct OS calls be faster? Is there a way to
do this better without adding too much complexity?
-----------------------------------------------------------------------------*/
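/* A rough layering sketch of what is described above (a sketch; names as used in this file):

     segment.c / huge object allocation
        |   _mi_mem_alloc_aligned / _mi_mem_free / _mi_mem_commit / ...   <- this file (regions)
        v
     arena.c   _mi_arena_alloc_aligned / _mi_arena_free    (pre-reserved (huge page) arenas)
        v
     os.c      VirtualAlloc / mmap / sbrk                  (raw OS memory)
*/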
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset
#include "bitmap.inc.c"
// Internal raw OS interface
size_t _mi_os_large_page_size();
bool _mi_os_protect(void* addr, size_t size);
bool _mi_os_unprotect(void* addr, size_t size);
bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
// arena.c
void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats);
void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
// Constants
#if (MI_INTPTR_SIZE==8)
#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 48KiB for the region map
#elif (MI_INTPTR_SIZE==4)
#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map
#else
#error "define the maximum heap space allowed for regions on this platform"
#endif
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_REGION_MAX_BLOCKS MI_BITMAP_FIELD_BITS
#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits)
#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits)
#define MI_REGION_MAX_OBJ_BLOCKS (MI_REGION_MAX_BLOCKS/4) // 64MiB
#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE)
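// Worked example of the constants above on a 64-bit build (a sketch, assuming
// MI_SEGMENT_SIZE is 4MiB and MI_BITMAP_FIELD_BITS is 64):
//   MI_REGION_SIZE          = 4MiB * 64        = 256MiB   (one bitmap field per region)
//   MI_REGION_MAX           = 256GiB / 256MiB  = 1024     (entries in the region map)
//   MI_REGION_MAX_OBJ_SIZE  = (64/4) * 4MiB    = 64MiB    (largest request served from a region;
//                                                          larger requests fall back to the arena/OS)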
// Region info packs a validity flag, an `is_large` flag, and the associated
// numa node into a single word so it can be read and written atomically.
typedef union mi_region_info_u {
uintptr_t value;
struct {
bool valid;
bool is_large;
int numa_node;
};
} mi_region_info_t;
// A region owns a chunk of MI_REGION_SIZE (256MiB) virtual memory, with
// a bitmap that tracks one bit per MI_SEGMENT_SIZE (4MiB) block.
typedef struct mem_region_s {
volatile _Atomic(uintptr_t) info; // is_large, and associated numa node + 1 (so 0 is no association)
volatile _Atomic(void*) start; // start of the memory area (and flags)
mi_bitmap_field_t in_use; // bit per in-use block
mi_bitmap_field_t dirty; // track if non-zero per block
mi_bitmap_field_t commit; // track if committed per block (if `!info.is_committed))
mi_bitmap_field_t reset; // track reset per block
volatile _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena
} mem_region_t;
// The region map
static mem_region_t regions[MI_REGION_MAX];
// Allocated regions
static volatile _Atomic(uintptr_t) regions_count; // = 0;
/* ----------------------------------------------------------------------------
Utility functions
-----------------------------------------------------------------------------*/
// Blocks (of 4MiB) needed for the given size.
static size_t mi_region_block_count(size_t size) {
return _mi_divide_up(size, MI_SEGMENT_SIZE);
}
/*
// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
static size_t mi_good_commit_size(size_t size) {
if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
return _mi_align_up(size, _mi_os_large_page_size());
}
*/
// Return if a pointer points into a region reserved by us.
bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
if (p==NULL) return false;
size_t count = mi_atomic_read_relaxed(&regions_count);
for (size_t i = 0; i < count; i++) {
uint8_t* start = (uint8_t*)mi_atomic_read_ptr_relaxed(&regions[i].start);
if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
}
return false;
}
static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
void* start = mi_atomic_read_ptr(&region->start);
mi_assert_internal(start != NULL);
return ((uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE));
}
static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS);
size_t idx = region - regions;
mi_assert_internal(&regions[idx] == region);
return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1;
}
static size_t mi_memid_create_from_arena(size_t arena_memid) {
return (arena_memid << 1) | 1;
}
static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) {
if ((id&1)==1) {
if (arena_memid != NULL) *arena_memid = (id>>1);
return true;
}
else {
size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS;
*bit_idx = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS;
*region = &regions[idx];
return false;
}
}
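// The `memid` encoding used by the helpers above, by example (a sketch):
//   region block:  id = (region_idx*MI_BITMAP_FIELD_BITS + bit_idx) << 1   (low bit 0)
//   arena memory:  id = (arena_memid << 1) | 1                             (low bit 1)
// so `mi_memid_is_arena` only needs to test the low bit, and `_mi_mem_free`
// can route a pointer back to either the region bitmaps or `_mi_arena_free`.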
/* ----------------------------------------------------------------------------
Allocate a fresh region from the OS (or an arena)
-----------------------------------------------------------------------------*/
static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
{
// not out of regions yet?
if (mi_atomic_read_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
// try to allocate a fresh region from the OS
bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
bool region_large = (commit && allow_large);
bool is_zero = false;
size_t arena_memid = 0;
void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
if (start == NULL) return false;
mi_assert_internal(!(region_large && !allow_large));
mi_assert_internal(!region_large || region_commit);
// claim a fresh slot
const uintptr_t idx = mi_atomic_increment(&regions_count);
if (idx >= MI_REGION_MAX) {
mi_atomic_decrement(&regions_count);
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
return false;
}
// allocated, initialize and claim the initial blocks
mem_region_t* r = &regions[idx];
r->arena_memid = arena_memid;
mi_atomic_write(&r->in_use, 0);
mi_atomic_write(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
mi_atomic_write(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
mi_atomic_write(&r->reset, 0);
*bit_idx = 0;
mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
mi_atomic_write_ptr(&r->start, start);
// and share it
mi_region_info_t info;
info.valid = true;
info.is_large = region_large;
info.numa_node = _mi_os_numa_node(tld);
mi_atomic_write(&r->info, info.value); // now make it available to others
*region = r;
return true;
}
/* ----------------------------------------------------------------------------
Try to claim blocks in suitable regions
-----------------------------------------------------------------------------*/
static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
// initialized at all?
mi_region_info_t info;
info.value = mi_atomic_read_relaxed(&region->info);
if (info.value==0) return false;
// numa correct
if (numa_node >= 0) { // use negative numa node to always succeed
int rnode = info.numa_node;
if (rnode >= 0 && rnode != numa_node) return false;
}
// check allow-large
if (!allow_large && info.is_large) return false;
return true;
}
static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
{
// try all regions for a free slot
const size_t count = mi_atomic_read(&regions_count);
size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses?
for (size_t visited = 0; visited < count; visited++, idx++) {
if (idx >= count) idx = 0; // wrap around
mem_region_t* r = &regions[idx];
if (mi_region_is_suitable(r, numa_node, allow_large)) {
if (mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
tld->region_idx = idx; // remember the last found position
*region = r;
return true;
}
}
}
return false;
}
static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
mem_region_t* region;
mi_bitmap_index_t bit_idx;
const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
// try to claim in existing regions
if (!mi_region_try_claim(numa_node, blocks, *is_large, &region, &bit_idx, tld)) {
// otherwise try to allocate a fresh region
if (!mi_region_try_alloc_os(blocks, *commit, *is_large, &region, &bit_idx, tld)) {
// out of regions or memory
return NULL;
}
}
// found a region and claimed `blocks` at `bit_idx`
mi_assert_internal(region != NULL);
mi_assert_internal(mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
mi_region_info_t info;
info.value = mi_atomic_read(&region->info);
void* start = mi_atomic_read_ptr(&region->start);
mi_assert_internal(!(info.is_large && !*is_large));
mi_assert_internal(start != NULL);
*is_zero = mi_bitmap_unclaim(&region->dirty, 1, blocks, bit_idx);
*is_large = info.is_large;
*memid = mi_memid_create(region, bit_idx);
void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
// commit
if (*commit) {
// ensure commit
bool any_uncommitted;
mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
if (any_uncommitted) {
mi_assert_internal(!info.is_large);
bool commit_zero;
_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld);
if (commit_zero) *is_zero = true;
}
}
else {
// no need to commit, but check if already fully committed
*commit = mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
}
mi_assert_internal(mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
// unreset reset blocks
if (mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
mi_assert_internal(!info.is_large);
mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit);
mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
bool reset_zero;
_mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld);
if (reset_zero) *is_zero = true;
}
mi_assert_internal(!mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
#if (MI_DEBUG>=2)
if (*commit) { ((uint8_t*)p)[0] = 0; }
#endif
// and return the allocation
mi_assert_internal(p != NULL);
return p;
}
/* ----------------------------------------------------------------------------
Allocation
-----------------------------------------------------------------------------*/
// Allocate `size` memory aligned at `alignment`. Return non-NULL on success, with a given memory `id`.
// (`id` is abstract: it encodes either a region block (see `mi_memid_create`) or an arena allocation)
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = 0;
*is_zero = false;
bool default_large = false;
if (large==NULL) large = &default_large; // ensure `large != NULL`
if (size == 0) return NULL;
size = _mi_align_up(size, _mi_os_page_size());
// allocate from regions if possible
size_t arena_memid;
const size_t blocks = mi_region_block_count(size);
if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) {
void* p = mi_region_try_alloc(blocks, commit, large, is_zero, memid, tld);
mi_assert_internal(p == NULL || (uintptr_t)p % alignment == 0);
if (p != NULL) {
#if (MI_DEBUG>=2)
if (*commit) { ((uint8_t*)p)[0] = 0; }
#endif
return p;
}
_mi_warning_message("unable to allocate from region: size %zu\n", size);
}
// and otherwise fall back to the OS
void* p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld);
*memid = mi_memid_create_from_arena(arena_memid);
mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0);
if (p != NULL && *commit) { ((uint8_t*)p)[0] = 0; }
return p;
}
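// Usage sketch for this pair of entry points (hedged; `tld` stands for the caller's
// `mi_os_tld_t`, as the segment allocator would pass it):
//
//   bool   commit = true, large = false, is_zero = false;
//   size_t memid  = 0;
//   void*  p = _mi_mem_alloc_aligned(MI_SEGMENT_SIZE, MI_SEGMENT_ALIGN,
//                                    &commit, &large, &is_zero, &memid, tld);
//   if (p != NULL) {
//     // ... use the memory ...
//     _mi_mem_free(p, MI_SEGMENT_SIZE, memid, commit /* full_commit */,
//                  false /* any_reset */, tld);
//   }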
/* ----------------------------------------------------------------------------
Free
-----------------------------------------------------------------------------*/
// Free previously allocated memory with a given id.
void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) {
mi_assert_internal(size > 0 && tld != NULL);
if (p==NULL) return;
if (size==0) return;
size = _mi_align_up(size, _mi_os_page_size());
size_t arena_memid = 0;
mi_bitmap_index_t bit_idx;
mem_region_t* region;
if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) {
// was a direct arena allocation, pass through
_mi_arena_free(p, size, arena_memid, tld->stats);
}
else {
// allocated in a region
mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
const size_t blocks = mi_region_block_count(size);
mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
mi_region_info_t info;
info.value = mi_atomic_read(&region->info);
mi_assert_internal(info.value != 0);
void* blocks_start = mi_region_blocks_start(region, bit_idx);
mi_assert_internal(blocks_start == p); // not a pointer in our area?
mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
// committed?
if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {
mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
}
if (any_reset) {
// set the is_reset bits if any pages were reset
mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
}
// reset the blocks to reduce the working set.
if (!info.is_large && mi_option_is_enabled(mi_option_segment_reset) &&
mi_option_is_enabled(mi_option_eager_commit)) // cannot reset halfway committed segments, use only `option_page_reset` instead
{
bool any_unreset;
mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
if (any_unreset) {
_mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
}
}
// and unclaim
bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
mi_assert_internal(all_unclaimed); UNUSED(all_unclaimed);
}
}
/* ----------------------------------------------------------------------------
collection
-----------------------------------------------------------------------------*/
void _mi_mem_collect(mi_os_tld_t* tld) {
// free every region that has no segments in use.
uintptr_t rcount = mi_atomic_read_relaxed(&regions_count);
for (size_t i = 0; i < rcount; i++) {
mem_region_t* region = &regions[i];
if (mi_atomic_read_relaxed(&region->info) != 0) {
// if no segments used, try to claim the whole region
uintptr_t m;
do {
m = mi_atomic_read_relaxed(&region->in_use);
} while(m == 0 && !mi_atomic_cas_weak(&region->in_use, MI_BITMAP_FIELD_FULL, 0 ));
if (m == 0) {
// on success, free the whole region
void* start = mi_atomic_read_ptr(&regions[i].start);
size_t arena_memid = mi_atomic_read_relaxed(&regions[i].arena_memid);
memset(&regions[i], 0, sizeof(mem_region_t));
// and release the whole region
mi_atomic_write(&region->info, 0);
if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
}
}
}
}
}
/* ----------------------------------------------------------------------------
Other
-----------------------------------------------------------------------------*/
bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) {
return _mi_os_reset(p, size, tld->stats);
}
bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
return _mi_os_unreset(p, size, is_zero, tld->stats);
}
bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
return _mi_os_commit(p, size, is_zero, tld->stats);
}
bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) {
return _mi_os_decommit(p, size, tld->stats);
}
bool _mi_mem_protect(void* p, size_t size) {
return _mi_os_protect(p, size);
}
bool _mi_mem_unprotect(void* p, size_t size) {
return _mi_os_unprotect(p, size);
}


@ -60,15 +60,15 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
{ 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },
{ 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread
{ 0, UNINIT, MI_OPTION(page_reset) },
{ 0, UNINIT, MI_OPTION(cache_reset) },
{ 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
{ 1, UNINIT, MI_OPTION(allow_decommit) }, // decommit pages when not eager committed
{ 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free
{ 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit)
{ 1, UNINIT, MI_OPTION(reset_decommits) }, // reset decommits memory
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
{ 0, UNINIT, MI_OPTION(allow_decommit) }, // decommit pages when not eager committed
{ 500,UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds
{ 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes.
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
{ 256, UNINIT, MI_OPTION(max_numa_node) }, // maximum allowed numa node
{ 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output
{ 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output
};
static void mi_option_init(mi_option_desc_t* desc);
@ -84,7 +84,7 @@ void _mi_options_init(void) {
mi_option_desc_t* desc = &options[option];
_mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
}
}
}
mi_max_error_count = mi_option_get(mi_option_max_errors);
}

src/os.c

@ -299,7 +299,10 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
#if !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS MAP_ANON
#endif
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#if !defined(MAP_NORESERVE)
#define MAP_NORESERVE 0
#endif
int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
int fd = -1;
#if defined(MAP_ALIGNED) // BSD
if (try_alignment > 0) {
@ -625,31 +628,41 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
}
#elif defined(__wasi__)
// WebAssembly guests can't control memory protection
#elif defined(MAP_FIXED)
if (!commit) {
// use mmap with MAP_FIXED to discard the existing memory (and reduce commit charge)
void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), -1, 0);
if (p != start) { err = errno; }
}
else {
// for commit, just change the protection
err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
if (err != 0) { err = errno; }
}
#else
err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE));
if (err != 0) { err = errno; }
#endif
if (err != 0) {
_mi_warning_message("commit/decommit error: start: 0x%p, csize: 0x%x, err: %i\n", start, csize, err);
_mi_warning_message("%s error: start: 0x%p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
}
mi_assert_internal(err == 0);
return (err == 0);
}
bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
return mi_os_commitx(addr, size, true, false /* conservative? */, is_zero, stats);
return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
}
bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) {
bool is_zero;
return mi_os_commitx(addr, size, false, true /* conservative? */, &is_zero, stats);
return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
}
bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
return mi_os_commitx(addr, size, true, true /* conservative? */, is_zero, stats);
return mi_os_commitx(addr, size, true, true /* conservative */, is_zero, stats);
}
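// Note on the conservative/liberal flag (inferred from the call sites above, not
// spelled out here): a liberal commit may round the range outward to whole OS pages
// so the full request is committed, while a conservative decommit/unreset rounds
// inward so nothing outside [addr, addr+size) is ever decommitted or reset.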
// Signal to the OS that the address range is no longer in use
// but may be used later again. This will release physical memory
// pages and reduce swapping while keeping the memory committed.
@ -708,7 +721,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
// We page align to a conservative area inside the range to reset.
bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
if (mi_option_is_enabled(mi_option_reset_decommits)) {
return _mi_os_decommit(addr,size,stats);
return _mi_os_decommit(addr, size, stats);
}
else {
return mi_os_resetx(addr, size, true, stats);
@ -799,9 +812,9 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
mi_win_enable_large_os_pages();
#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} };
MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} };
// on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
static bool mi_huge_pages_available = true;
if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
@ -831,7 +844,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
// on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
if (pVirtualAlloc2 != NULL && numa_node >= 0) {
params[0].Type = MemExtendedParameterNumaNode;
params[0].ULong = (unsigned)numa_node;
params[0].ULong = (unsigned)numa_node;
return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
}
#endif
@ -840,28 +853,35 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
}
#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8)
#ifdef MI_HAS_NUMA
#include <numaif.h> // mbind, and use -lnuma
#include <sys/syscall.h>
#ifndef MPOL_PREFERRED
#define MPOL_PREFERRED 1
#endif
#if defined(SYS_mbind)
static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
}
#else
static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
UNUSED(start); UNUSED(len); UNUSED(mode); UNUSED(nmask); UNUSED(maxnode); UNUSED(flags);
return 0;
}
#endif
static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
mi_assert_internal(size%GiB == 0);
bool is_large = true;
void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
if (p == NULL) return NULL;
#ifdef MI_HAS_NUMA
if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
uintptr_t numa_mask = (1UL << numa_node);
// TODO: does `mbind` work correctly for huge OS pages? should we
// TODO: does `mbind` work correctly for huge OS pages? should we
// use `set_mempolicy` before calling mmap instead?
// see: <https://lkml.org/lkml/2017/2/9/875>
long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
if (err != 0) {
_mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno));
}
}
#else
UNUSED(numa_node);
#endif
return p;
}
#else
@ -870,7 +890,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
}
#endif
#if (MI_INTPTR_SIZE >= 8)
#if (MI_INTPTR_SIZE >= 8)
// To ensure proper alignment, use our own area for huge OS pages
static _Atomic(uintptr_t) mi_huge_start; // = 0
@ -913,7 +933,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
size_t size = 0;
uint8_t* start = mi_os_claim_huge_pages(pages, &size);
if (start == NULL) return NULL; // or 32-bit systems
// Allocate one page at a time but try to place them contiguously
// We allocate one page at a time to be able to abort if it takes too long
// or to at least allocate as many as available on the system.
@ -933,11 +953,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
}
break;
}
// success, record it
_mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
_mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
// check for timeout
if (max_msecs > 0) {
mi_msecs_t elapsed = _mi_clock_end(start_t);
@ -971,88 +991,76 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
}
/* ----------------------------------------------------------------------------
Support NUMA aware allocation
Support NUMA aware allocation
-----------------------------------------------------------------------------*/
#ifdef WIN32
static int mi_os_numa_nodex() {
static size_t mi_os_numa_nodex() {
PROCESSOR_NUMBER pnum;
USHORT numa_node = 0;
GetCurrentProcessorNumberEx(&pnum);
GetNumaProcessorNodeEx(&pnum,&numa_node);
return (int)numa_node;
return numa_node;
}
static int mi_os_numa_node_countx(void) {
static size_t mi_os_numa_node_countx(void) {
ULONG numa_max = 0;
GetNumaHighestNodeNumber(&numa_max);
return (int)(numa_max + 1);
return (numa_max + 1);
}
#elif defined(__linux__)
#include <dirent.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/syscall.h> // getcpu
#include <stdio.h> // access
static int mi_os_numa_nodex(void) {
static size_t mi_os_numa_nodex(void) {
#ifdef SYS_getcpu
unsigned node = 0;
unsigned ncpu = 0;
int err = syscall(SYS_getcpu, &ncpu, &node, NULL);
unsigned long node = 0;
unsigned long ncpu = 0;
long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
if (err != 0) return 0;
return (int)node;
return node;
#else
return 0;
#endif
}
static int mi_os_numa_node_countx(void) {
DIR* d = opendir("/sys/devices/system/node");
if (d==NULL) return 1;
struct dirent* de;
int max_node_num = 0;
while ((de = readdir(d)) != NULL) {
int node_num;
if (strncmp(de->d_name, "node", 4) == 0) {
node_num = (int)strtol(de->d_name+4, NULL, 0);
if (max_node_num < node_num) max_node_num = node_num;
}
static size_t mi_os_numa_node_countx(void) {
char buf[128];
unsigned node = 0;
for(node = 0; node < 256; node++) {
// enumerate node entries -- todo: is there a more efficient way to do this? (but ensure there is no allocation)
snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
if (access(buf,R_OK) != 0) break;
}
closedir(d);
return (max_node_num + 1);
return (node+1);
}
#else
static int mi_os_numa_nodex(void) {
static size_t mi_os_numa_nodex(void) {
return 0;
}
static int mi_os_numa_node_countx(void) {
static size_t mi_os_numa_node_countx(void) {
return 1;
}
#endif
int _mi_os_numa_node_count(void) {
static int numa_node_count = 0; // cache the node count
if (mi_unlikely(numa_node_count <= 0)) {
int ncount = mi_os_numa_node_countx();
int ncount0 = ncount;
// never more than max numa node and at least 1
int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node);
if (ncount > nmax) ncount = nmax;
if (ncount <= 0) ncount = 1;
numa_node_count = ncount;
_mi_verbose_message("using %i numa regions (%i nodes detected)\n", numa_node_count, ncount0);
size_t _mi_numa_node_count = 0; // cache the node count
size_t _mi_os_numa_node_count_get(void) {
if (mi_unlikely(_mi_numa_node_count <= 0)) {
long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
if (ncount <= 0) ncount = (long)mi_os_numa_node_countx(); // or detect dynamically
_mi_numa_node_count = (size_t)(ncount <= 0 ? 1 : ncount);
_mi_verbose_message("using %zd numa regions\n", _mi_numa_node_count);
}
mi_assert_internal(numa_node_count >= 1);
return numa_node_count;
mi_assert_internal(_mi_numa_node_count >= 1);
return _mi_numa_node_count;
}
int _mi_os_numa_node(mi_os_tld_t* tld) {
int _mi_os_numa_node_get(mi_os_tld_t* tld) {
UNUSED(tld);
int numa_count = _mi_os_numa_node_count();
size_t numa_count = _mi_os_numa_node_count();
if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
// never more than the node count and >= 0
int numa_node = mi_os_numa_nodex();
size_t numa_node = mi_os_numa_nodex();
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
if (numa_node < 0) numa_node = 0;
return numa_node;
return (int)numa_node;
}
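// Example (a sketch): with mi_option_use_numa_nodes=2 on a machine where getcpu
// reports node 3, the cached _mi_numa_node_count is 2 and the node is mapped to
// 3 % 2 == 1, so allocations prefer region/arena memory associated with node 1.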


@ -260,7 +260,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
page->heap->page_count--;
page->next = NULL;
page->prev = NULL;
page->heap = NULL;
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
@ -274,7 +274,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
page->heap = heap;
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@ -338,7 +338,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
page->heap = heap;
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
count++;
}


@ -75,6 +75,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = _mi_page_start(segment,page,NULL);
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
mi_assert_internal(mi_page_list_is_valid(page,page->free));
@ -227,7 +228,10 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(page->heap == NULL);
mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
mi_assert_internal(!page->is_reset);
_mi_page_free_collect(page,false);
mi_page_queue_t* pq = mi_page_queue(heap, page->block_size);
mi_page_queue_push(heap, pq, page);
@ -282,7 +286,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
// and free them all
while(block != NULL) {
mi_block_t* next = mi_block_nextx(heap->cookie,block);
mi_block_t* next = mi_block_nextx(heap,block, heap->cookie);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
@ -290,7 +294,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
mi_block_set_nextx(heap->cookie, block, dfree);
mi_block_set_nextx(heap, block, dfree, heap->cookie);
} while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
}
@ -341,19 +345,25 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(page->heap != NULL);
#if MI_DEBUG > 1
mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
#endif
// remove from our page list
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
mi_page_queue_remove(pq, page);
// page is no longer associated with our heap
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
_mi_page_use_delayed_free(page,MI_NEVER_DELAYED_FREE);
#if MI_DEBUG>1
// check there are no references left..
for (mi_block_t* block = (mi_block_t*)page->heap->thread_delayed_free; block != NULL; block = mi_block_nextx(page->heap->cookie,block)) {
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->cookie)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
// and then remove from our page list
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
mi_page_queue_remove(pq, page);
// and abandon it
mi_assert_internal(page->heap == NULL);
_mi_segment_page_abandon(page,segments_tld);
@ -588,7 +598,9 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(block_size > 0);
// set fields
size_t page_size;
_mi_segment_page_start(segment, page, &page_size);
page->block_size = block_size;
mi_assert_internal(page->block_size <= page_size);
mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
@ -755,6 +767,7 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
if (page != NULL) {
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(page->block_size == block_size);
if (pq == NULL) {
// huge pages are directly abandoned
mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);


@ -17,8 +17,6 @@ static void mi_segment_map_allocated_at(const mi_segment_t* segment);
static void mi_segment_map_freed_at(const mi_segment_t* segment);
/* -----------------------------------------------------------
Segment allocation
@ -191,10 +189,12 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
}
#endif
/* -----------------------------------------------------------
Segment size calculations
----------------------------------------------------------- */
static size_t mi_segment_size(mi_segment_t* segment) {
return segment->segment_slices * MI_SEGMENT_SLICE_SIZE;
}
@ -212,8 +212,9 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa
/*
if (idx == 0) {
// the first page starts after the segment info (and possible guard page)
p += segment->segment_info_size;
p += segment->segment_info_size;
psize -= segment->segment_info_size;
// for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore)
// to ensure this, we over-estimate and align with the OS page size
const size_t asize = _mi_os_page_size();
@ -234,11 +235,12 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa
*/
if (page_size != NULL) *page_size = psize;
mi_assert_internal(_mi_ptr_page(p) == page);
mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page);
mi_assert_internal(_mi_ptr_segment(p) == segment);
return p;
}
static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) {
size_t page_size = _mi_os_page_size();
size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size);
@ -283,6 +285,7 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
if (MI_SECURE>0) {
_mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set
}
// _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats);
_mi_arena_free(segment, mi_segment_size(segment), segment->memid, segment->mem_is_committed || (~segment->commit_mask == 0), segment->mem_is_fixed, tld->stats);
}
@ -330,9 +333,7 @@ static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld)
}
mi_assert_internal(segment->segment_slices == MI_SLICES_PER_SEGMENT);
if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_cache_reset)) {
_mi_os_reset((uint8_t*)segment + mi_segment_info_size(segment), mi_segment_size(segment) - mi_segment_info_size(segment), tld->stats);
}
mi_assert_internal(segment->next == NULL);
segment->next = tld->cache;
tld->cache = segment;
tld->cache_count++;
@ -706,7 +707,6 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
Page allocation
----------------------------------------------------------- */
static mi_page_t* mi_segments_page_alloc(mi_page_kind_t page_kind, size_t required, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
{
mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE);
@ -896,7 +896,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
slice = mi_segment_page_clear(page, tld); // set slice again due to coalesceing
}
else {
// otherwise reclaim it
// otherwise reclaim it
_mi_page_reclaim(heap,page);
}
}


@ -13,7 +13,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
# Import mimalloc (if installed)
find_package(mimalloc 1.0 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
find_package(mimalloc 1.2 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
message(STATUS "Found mimalloc installed at: ${MIMALLOC_TARGET_DIR}")
# overriding with a dynamic library


@ -181,9 +181,9 @@ int main() {
// mi_bins();
// detect double frees and heap corruption
//double_free1();
//double_free2();
//corrupt_free();
// double_free1();
// double_free2();
// corrupt_free();
void* p1 = malloc(78);
void* p2 = malloc(24);


@ -1,7 +1,7 @@
Testing allocators is difficult as bugs may only surface after particular
allocation patterns. The main approach to testing _mimalloc_ is therefore
to have extensive internal invariant checking (see `page_is_valid` in `page.c`
for example), which is enabled in debug mode with `-DMI_CHECK_FULL=ON`.
for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`.
The main testing strategy is then to run [`mimalloc-bench`][bench] using full
invariant checking to catch any potential problems over a wide range of intensive
allocation benchmarks and programs.


@ -6,7 +6,8 @@ terms of the MIT license.
/* This is a stress test for the allocator, using multiple threads and
transferring objects between threads. This is not a typical workload
but uses a random linear size distribution. Do not use this test as a benchmark!
but uses a random linear size distribution. Timing can also depend on
(random) thread scheduling. Do not use this test as a benchmark!
*/
#include <stdio.h>
@ -16,17 +17,35 @@ terms of the MIT license.
#include <string.h>
#include <mimalloc.h>
// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
//
// argument defaults
static int THREADS = 8; // more repeatable if THREADS <= #processors
static int N = 200; // scaling factor
static int THREADS = 32; // more repeatable if THREADS <= #processors
static int SCALE = 50; // scaling factor
static int ITER = 10; // N full iterations re-creating all threads
// static int THREADS = 8; // more repeatable if THREADS <= #processors
// static int N = 100; // scaling factor
// static int SCALE = 100; // scaling factor
static bool allow_large_objects = true; // allow very large objects?
static size_t use_one_size = 0; // use single object size of N uintptr_t?
#ifdef USE_STD_MALLOC
#define custom_malloc(s) malloc(s)
#define custom_realloc(p,s) realloc(p,s)
#define custom_free(p) free(p)
#else
#define custom_malloc(s) mi_malloc(s)
#define custom_realloc(p,s) mi_realloc(p,s)
#define custom_free(p) mi_free(p)
#endif
// transfer pointer between threads
#define TRANSFERS (1000)
static volatile void* transfer[TRANSFERS];
#if (UINTPTR_MAX != UINT32_MAX)
const uintptr_t cookie = 0xbf58476d1ce4e5b9UL;
#else
@ -39,21 +58,21 @@ typedef uintptr_t* random_t;
static uintptr_t pick(random_t r) {
uintptr_t x = *r;
#if (UINTPTR_MAX > UINT32_MAX)
// by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
#if (UINTPTR_MAX > UINT32_MAX)
// by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9UL;
x ^= x >> 27;
x *= 0x94d049bb133111ebUL;
x ^= x >> 31;
#else
// by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
#else
// by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
x ^= x >> 16;
x *= 0x7feb352dUL;
x ^= x >> 15;
x *= 0x846ca68bUL;
x ^= x >> 16;
#endif
#endif
*r = x;
return x;
}
@ -64,12 +83,13 @@ static bool chance(size_t perc, random_t r) {
static void* alloc_items(size_t items, random_t r) {
if (chance(1, r)) {
if (chance(1, r)) items *= 1000; // 0.01% giant
else if (chance(10, r)) items *= 100; // 0.1% huge
else items *= 10; // 1% large objects;
if (chance(1, r) && allow_large_objects) items *= 10000; // 0.01% giant
else if (chance(10, r) && allow_large_objects) items *= 1000; // 0.1% huge
else items *= 100; // 1% large objects;
}
if (items==40) items++; // pthreads uses that size for stack increases
uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t));
if (items == 40) items++; // pthreads uses that size for stack increases
if (use_one_size > 0) items = (use_one_size / sizeof(uintptr_t));
uintptr_t* p = (uintptr_t*)custom_malloc(items * sizeof(uintptr_t));
if (p != NULL) {
for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
}
@ -81,42 +101,42 @@ static void free_items(void* p) {
uintptr_t* q = (uintptr_t*)p;
uintptr_t items = (q[0] ^ cookie);
for (uintptr_t i = 0; i < items; i++) {
if ((q[i]^cookie) != items - i) {
if ((q[i] ^ cookie) != items - i) {
fprintf(stderr, "memory corruption at block %p at %zu\n", p, i);
abort();
}
}
}
mi_free(p);
custom_free(p);
}
static void stress(intptr_t tid) {
//bench_start_thread();
uintptr_t r = tid ^ 42;
const size_t max_item = 128; // in words
const size_t max_item_retained = 10*max_item;
size_t allocs = 25*N*(tid%8 + 1); // some threads do more
size_t retain = allocs/2;
uintptr_t r = tid * 43;
const size_t max_item_shift = 5; // 128
const size_t max_item_retained_shift = max_item_shift + 2;
size_t allocs = 100 * ((size_t)SCALE) * (tid % 8 + 1); // some threads do more
size_t retain = allocs / 2;
void** data = NULL;
size_t data_size = 0;
size_t data_top = 0;
void** retained = (void**)mi_malloc(retain*sizeof(void*));
void** retained = (void**)custom_malloc(retain * sizeof(void*));
size_t retain_top = 0;
while (allocs>0 || retain>0) {
while (allocs > 0 || retain > 0) {
if (retain == 0 || (chance(50, &r) && allocs > 0)) {
// 50%+ alloc
allocs--;
if (data_top >= data_size) {
data_size += 100000;
data = (void**)mi_realloc(data, data_size*sizeof(void*));
data = (void**)custom_realloc(data, data_size * sizeof(void*));
}
data[data_top++] = alloc_items((pick(&r) % max_item) + 1, &r);
data[data_top++] = alloc_items( 1ULL << (pick(&r) % max_item_shift), &r);
}
else {
// 25% retain
retained[retain_top++] = alloc_items(10*((pick(&r) % max_item_retained) + 1), &r);
retained[retain_top++] = alloc_items( 1ULL << (pick(&r) % max_item_retained_shift), &r);
retain--;
}
if (chance(66, &r) && data_top > 0) {
@ -126,7 +146,7 @@ static void stress(intptr_t tid) {
data[idx] = NULL;
}
if (chance(25, &r) && data_top > 0) {
// 25% transfer-swap
// 25% exchange a local pointer with the (shared) transfer buffer.
size_t data_idx = pick(&r) % data_top;
size_t transfer_idx = pick(&r) % TRANSFERS;
void* p = data[data_idx];
@ -141,38 +161,54 @@ static void stress(intptr_t tid) {
for (size_t i = 0; i < data_top; i++) {
free_items(data[i]);
}
mi_free(retained);
mi_free(data);
custom_free(retained);
custom_free(data);
//bench_end_thread();
}
static void run_os_threads(size_t nthreads);
int main(int argc, char** argv) {
if (argc>=2) {
// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
if (argc >= 2) {
char* end;
long n = strtol(argv[1], &end, 10);
if (n > 0) THREADS = n;
}
if (argc>=3) {
if (argc >= 3) {
char* end;
long n = (strtol(argv[2], &end, 10));
if (n > 0) N = n;
if (n > 0) SCALE = n;
}
printf("start with %i threads with a %i%% load-per-thread\n", THREADS, N);
if (argc >= 4) {
char* end;
long n = (strtol(argv[3], &end, 10));
if (n > 0) ITER = n;
}
printf("start with %d threads with a %d%% load-per-thread and %d iterations\n", THREADS, SCALE, ITER);
//int res = mi_reserve_huge_os_pages(4,1);
//printf("(reserve huge: %i\n)", res);
//bench_start_program();
// Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
mi_stats_reset();
memset((void*)transfer, 0, TRANSFERS*sizeof(void*));
run_os_threads(THREADS);
for (int i = 0; i < TRANSFERS; i++) {
free_items((void*)transfer[i]);
uintptr_t r = 43 * 43;
for (int n = 0; n < ITER; n++) {
run_os_threads(THREADS);
for (int i = 0; i < TRANSFERS; i++) {
if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers
void* p = atomic_exchange_ptr(&transfer[i], NULL);
free_items(p);
}
}
mi_collect(false);
#ifndef NDEBUG
if ((n + 1) % 10 == 0) { printf("- iterations: %3d\n", n + 1); }
#endif
}
#ifndef NDEBUG
mi_collect(false);
#endif
mi_collect(true);
mi_stats_print(NULL);
//bench_end_program();
return 0;
@ -189,8 +225,8 @@ static DWORD WINAPI thread_entry(LPVOID param) {
}
static void run_os_threads(size_t nthreads) {
DWORD* tids = (DWORD*)malloc(nthreads * sizeof(DWORD));
HANDLE* thandles = (HANDLE*)malloc(nthreads * sizeof(HANDLE));
DWORD* tids = (DWORD*)custom_malloc(nthreads * sizeof(DWORD));
HANDLE* thandles = (HANDLE*)custom_malloc(nthreads * sizeof(HANDLE));
for (uintptr_t i = 0; i < nthreads; i++) {
thandles[i] = CreateThread(0, 4096, &thread_entry, (void*)(i), 0, &tids[i]);
}
@ -200,16 +236,16 @@ static void run_os_threads(size_t nthreads) {
for (size_t i = 0; i < nthreads; i++) {
CloseHandle(thandles[i]);
}
free(tids);
free(thandles);
custom_free(tids);
custom_free(thandles);
}
static void* atomic_exchange_ptr(volatile void** p, void* newval) {
#if (INTPTR_MAX == UINT32_MAX)
#if (INTPTR_MAX == UINT32_MAX)
return (void*)InterlockedExchange((volatile LONG*)p, (LONG)newval);
#else
#else
return (void*)InterlockedExchange64((volatile LONG64*)p, (LONG64)newval);
#endif
#endif
}
#else
@ -222,8 +258,8 @@ static void* thread_entry(void* param) {
}
static void run_os_threads(size_t nthreads) {
pthread_t* threads = (pthread_t*)mi_malloc(nthreads*sizeof(pthread_t));
memset(threads, 0, sizeof(pthread_t)*nthreads);
pthread_t* threads = (pthread_t*)custom_malloc(nthreads * sizeof(pthread_t));
memset(threads, 0, sizeof(pthread_t) * nthreads);
//pthread_setconcurrency(nthreads);
for (uintptr_t i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, &thread_entry, (void*)i);
@ -231,6 +267,7 @@ static void run_os_threads(size_t nthreads) {
for (size_t i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
custom_free(threads);
}
static void* atomic_exchange_ptr(volatile void** p, void* newval) {