Initial commit of a separate memory region layer and improved large OS page support; see 'src/memory.c'

daan 2019-07-02 07:23:24 -07:00
parent d6901558cd
commit 06bcea1761
18 changed files with 693 additions and 297 deletions

View File

@ -15,6 +15,7 @@ set(mi_install_dir "lib/mimalloc-${mi_version}")
 set(mi_sources
   src/stats.c
   src/os.c
+  src/memory.c
   src/segment.c
   src/page.c
   src/alloc.c

View File

@ -225,6 +225,7 @@
 <ClCompile Include="..\..\src\alloc.c" />
 <ClCompile Include="..\..\src\heap.c" />
 <ClCompile Include="..\..\src\init.c" />
+<ClCompile Include="..\..\src\memory.c" />
 <ClCompile Include="..\..\src\options.c" />
 <ClCompile Include="..\..\src\os.c" />
 <ClCompile Include="..\..\src\page-queue.c">

View File

@ -58,5 +58,8 @@
 <ClCompile Include="..\..\src\init.c">
   <Filter>Source Files</Filter>
 </ClCompile>
+<ClCompile Include="..\..\src\memory.c">
+  <Filter>Source Files</Filter>
+</ClCompile>
 </ItemGroup>
 </Project>

View File

@ -224,6 +224,7 @@
 <ClCompile Include="..\..\src\alloc.c" />
 <ClCompile Include="..\..\src\heap.c" />
 <ClCompile Include="..\..\src\init.c" />
+<ClCompile Include="..\..\src\memory.c" />
 <ClCompile Include="..\..\src\options.c" />
 <ClCompile Include="..\..\src\page-queue.c">
   <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>

View File

@ -50,6 +50,9 @@
 <ClCompile Include="..\..\src\init.c">
   <Filter>Source Files</Filter>
 </ClCompile>
+<ClCompile Include="..\..\src\memory.c">
+  <Filter>Source Files</Filter>
+</ClCompile>
 </ItemGroup>
 <ItemGroup>
 <ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">

View File

@ -39,6 +39,15 @@ static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t e
 // Atomically exchange a value.
 static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange);
+// Atomically read a value
+static inline uintptr_t mi_atomic_read(volatile uintptr_t* p);
+// Atomically read a pointer
+static inline void* mi_atomic_read_ptr(volatile void** p);
+// Atomically write a value
+static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x);
 static inline void mi_atomic_yield(void);
 // Atomically compare and exchange a pointer; returns `true` if successful.
@ -85,6 +94,15 @@ static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t e
 static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) {
   return (uintptr_t)RC64(_InterlockedExchange)((volatile intptr_t*)p, (intptr_t)exchange);
 }
+static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
+  return *p;
+}
+static inline void* mi_atomic_read_ptr(volatile void** p) {
+  return (void*)(*p);
+}
+static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
+  *p = x;
+}
 static inline void mi_atomic_yield(void) {
   YieldProcessor();
 }
@ -147,6 +165,18 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
   MI_USING_STD
   return atomic_exchange_explicit((volatile atomic_uintptr_t*)p, exchange, memory_order_relaxed);
 }
+static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
+  MI_USING_STD
+  return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
+}
+static inline void* mi_atomic_read_ptr(volatile void** p) {
+  MI_USING_STD
+  return atomic_load_explicit((volatile _Atomic(void*)*)p, memory_order_relaxed);
+}
+static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
+  MI_USING_STD
+  return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
+}
 #if defined(__cplusplus)
 #include <thread>
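// Illustrative sketch (not part of this header): the relaxed read/write primitives above
// combine with `mi_atomic_compare_exchange` into the lock-free claim loop that `memory.c`
// below uses for its region bitmap. Written here with plain C11 atomics for clarity.
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool try_claim_bits(_Atomic(uintptr_t)* map, uintptr_t mask) {
  uintptr_t old = atomic_load_explicit(map, memory_order_relaxed);         // mi_atomic_read
  do {
    if ((old & mask) != 0) return false;                                   // some bits already claimed
  } while (!atomic_compare_exchange_weak_explicit(map, &old, old | mask,   // mi_atomic_compare_exchange
             memory_order_relaxed, memory_order_relaxed));
  return true;                                                             // caller now owns the bits in `mask`
}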

View File

@ -29,18 +29,21 @@ uintptr_t _mi_ptr_cookie(const void* p);
 uintptr_t _mi_random_shuffle(uintptr_t x);
 uintptr_t _mi_random_init(uintptr_t seed /* can be zero */);
-// "os.c"
-bool _mi_os_reset(void* p, size_t size);
-void* _mi_os_alloc(size_t size, mi_stats_t* stats);
-bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize);
-void _mi_os_free(void* p, size_t size, mi_stats_t* stats);
-bool _mi_os_protect(void* addr, size_t size);
-bool _mi_os_unprotect(void* addr, size_t size);
-void _mi_os_init(void); // called from process init
-void* _mi_os_alloc_aligned(size_t size, size_t alignment, mi_os_tld_t* tld);
+// os.c
 size_t _mi_os_page_size(void);
 uintptr_t _mi_align_up(uintptr_t sz, size_t alignment);
+void _mi_os_init(void);                              // called from process init
+void* _mi_os_alloc(size_t size, mi_stats_t* stats);  // to allocate thread local data
+void _mi_os_free(void* p, size_t size, mi_stats_t* stats);  // to free thread local data
+// memory.c
+void* _mi_mem_alloc_aligned(size_t size, size_t alignment, size_t* id, mi_os_tld_t* tld);
+void* _mi_mem_alloc(size_t size, size_t* id, mi_os_tld_t* tld);
+void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats);
+bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats);
+bool _mi_mem_protect(void* addr, size_t size);
+bool _mi_mem_unprotect(void* addr, size_t size);
-// "segment.c"
+// segment.c
 mi_page_t* _mi_segment_page_alloc(size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);

View File

@ -89,7 +89,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
 #define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
-#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/8) // 512kb on 64-bit
+#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1MiB on 64-bit
 #define MI_LARGE_WSIZE_MAX (MI_LARGE_SIZE_MAX>>MI_INTPTR_SHIFT)
@ -215,6 +215,7 @@ typedef struct mi_segment_s {
   size_t segment_size;       // for huge pages this may be different from `MI_SEGMENT_SIZE`
   size_t segment_info_size;  // space we are using from the first page for segment meta-data and possible guard pages.
   uintptr_t cookie;          // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
+  size_t memid;              // id for the os-level memory manager
   // layout like this to optimize access in `mi_free`
   size_t page_shift;         // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
@ -322,12 +323,14 @@ typedef struct mi_stats_s {
   mi_stat_count_t reserved;
   mi_stat_count_t committed;
   mi_stat_count_t reset;
+  mi_stat_count_t page_committed;
   mi_stat_count_t segments_abandoned;
   mi_stat_count_t pages_abandoned;
   mi_stat_count_t pages_extended;
   mi_stat_count_t mmap_calls;
   mi_stat_count_t mmap_right_align;
   mi_stat_count_t mmap_ensure_aligned;
+  mi_stat_count_t commit_calls;
   mi_stat_count_t threads;
   mi_stat_count_t huge;
   mi_stat_count_t malloc;
@ -370,11 +373,13 @@ typedef struct mi_segment_queue_s {
 // Segments thread local data
 typedef struct mi_segments_tld_s {
   mi_segment_queue_t small_free;  // queue of segments with free small pages
+  size_t count;                   // current number of segments
+  size_t peak_count;              // peak number of segments
   size_t current_size;            // current size of all segments
   size_t peak_size;               // peak size of all segments
   size_t cache_count;             // number of segments in the cache
   size_t cache_size;              // total size of all segments in the cache
-  mi_segment_queue_t cache;       // (small) cache of segments for small and large pages (to avoid repeated mmap calls)
+  mi_segment_t* cache;            // (small) cache of segments
   mi_stats_t* stats;              // points to tld stats
 } mi_segments_tld_t;

View File

@ -215,8 +215,8 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b
 typedef enum mi_option_e {
   mi_option_page_reset,
   mi_option_cache_reset,
-  mi_option_pool_commit,
-  mi_option_large_os_pages,
+  mi_option_eager_commit,
+  mi_option_large_os_pages, // implies eager commit
   mi_option_secure,
   mi_option_show_stats,
   mi_option_show_errors,

View File

@ -58,6 +58,7 @@ const mi_page_t _mi_page_empty = {
   MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
   MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
   MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
   { 0, 0 } \
   MI_STAT_COUNT_END_NULL()
@ -90,7 +91,7 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
 static mi_tld_t tld_main = {
   0,
   &_mi_heap_main,
-  { { NULL, NULL }, 0, 0, 0, 0, {NULL,NULL}, tld_main_stats }, // segments
+  { { NULL, NULL }, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
   { 0, NULL, NULL, 0, tld_main_stats }, // os
   { MI_STATS_NULL }                     // stats
 };

src/memory.c (new file, 349 lines)
View File

@ -0,0 +1,349 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
and the segment and huge object allocation by mimalloc. In contrast to the
rest of mimalloc, this layer uses thread-shared "regions" that are accessed
using atomic operations. We need this layer because:
1. On `sbrk`-like systems (like WebAssembly) we need our own memory maps in
   order to reuse memory.
2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
   an OS allocation/free is still too high relative to the accesses in that
   object :-( (the `malloc-large` benchmark tests this). This means we need a
   cheaper way to reuse memory.
3. This layer can help with NUMA-aware allocation in the future.
Possible issues:
- (2) can potentially be addressed with a small per-thread cache as well, which
  is much simpler. Generally though that requires shrinking of huge pages, may
  overuse memory per thread, and is not compatible with `sbrk`.
- Since the current regions are per-process, we need atomic operations to
  claim blocks, which may be contended.
- In the worst case, we need to search the whole region map (16KiB for 256GiB)
  linearly. At what point will direct OS calls be faster? Is there a way to
  do this better without adding too much complexity?
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset
// Internal OS interface
size_t _mi_os_large_page_size();
bool _mi_os_protect(void* addr, size_t size);
bool _mi_os_unprotect(void* addr, size_t size);
bool _mi_os_commit(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, mi_os_tld_t* tld);
// Constants
#if (MI_INTPTR_SIZE==8)
#define MI_HEAP_REGION_MAX_SIZE (256 * (1ULL << 30)) // 256GiB => 16KiB for the region map
#elif (MI_INTPTR_SIZE==4)
#define MI_HEAP_REGION_MAX_SIZE (3 * (1UL << 30)) // 3GiB => 192 bytes for the region map
#else
#error "define the maximum heap space allowed for regions on this platform"
#endif
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_REGION_MAP_BITS (MI_INTPTR_SIZE * 8)
#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_REGION_MAP_BITS)
#define MI_REGION_MAX_ALLOC_SIZE ((MI_REGION_MAP_BITS/4)*MI_SEGMENT_SIZE) // 64MiB
#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)
#define MI_REGION_MAP_FULL UINTPTR_MAX
// A region owns a chunk of MI_REGION_SIZE (256MiB) virtual memory, with
// a bitmap that tracks one in-use bit per MI_SEGMENT_SIZE (4MiB) block.
typedef struct mem_region_s {
volatile uintptr_t map; // in-use bit per MI_SEGMENT_SIZE block
volatile void* start; // start of virtual memory area
} mem_region_t;
// The region map; 16KiB for a 256GiB HEAP_REGION_MAX
// TODO: in the future, maintain a map per NUMA node for numa aware allocation
static mem_region_t regions[MI_REGION_MAX];
static volatile size_t regions_count = 0; // allocated regions
static volatile uintptr_t region_next_idx = 0;
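// Sanity sketch of the sizes these constants imply (illustrative only; this assumes a
// 64-bit build where MI_SEGMENT_SIZE is the usual 4MiB):
_Static_assert(MI_REGION_SIZE == (1ULL << 28), "64 map bits * 4MiB blocks = 256MiB covered per region");
_Static_assert(MI_REGION_MAX_ALLOC_SIZE == (1ULL << 26), "at most 64/4 = 16 blocks, i.e. 64MiB, per allocation");
_Static_assert(MI_REGION_MAX == 1024, "256GiB / 256MiB = 1024 region slots");
// 1024 slots * sizeof(mem_region_t) (two words, 16 bytes) = 16KiB for the whole region map.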
/* ----------------------------------------------------------------------------
Utility functions
-----------------------------------------------------------------------------*/
// Blocks (of 4MiB) needed for the given size.
static size_t mi_region_block_count(size_t size) {
mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE);
return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE;
}
// The bit mask for a given number of blocks at a specified bit index.
static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) {
mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS);
return ((((uintptr_t)1 << blocks) - 1) << bitidx);
}
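// Worked example (illustrative): three blocks starting at bit index 5 give
//   mi_region_block_mask(3, 5) == ((1 << 3) - 1) << 5 == 0b11100000 == 0xE0
// i.e. bits 5, 6 and 7 of a region map mark three contiguous 4MiB blocks as in use.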
// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
static size_t mi_good_commit_size(size_t size) {
if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
return _mi_align_up(size, _mi_os_large_page_size());
}
/* ----------------------------------------------------------------------------
Commit from a region
-----------------------------------------------------------------------------*/
// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
// Returns `false` on an error (OOM); in that case the claimed blocks are unclaimed again.
// On success, writes the start of the committed memory to `*p` and the block id to `*id`,
// and returns `true`.
static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, void** p, size_t* id, mi_os_tld_t* tld) {
size_t mask = mi_region_block_mask(blocks,bitidx);
mi_assert_internal(mask != 0);
mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask);
// ensure the region is reserved
void* start = mi_atomic_read_ptr(&region->start);
if (start == NULL) {
start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_commit), tld);
if (start == NULL) {
// failure to allocate from the OS! unclaim the blocks and fail
size_t map;
do {
map = mi_atomic_read(&region->map);
} while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map));
return false;
}
// set the newly allocated region
if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) {
// update the region count
mi_atomic_increment(&regions_count);
}
else {
// failed, another thread allocated just before us, free our allocated memory
// TODO: should we keep the allocated memory and assign it to some other region?
_mi_os_free(start, MI_REGION_SIZE, tld->stats);
start = mi_atomic_read_ptr(&region->start);
}
}
// Commit the blocks to memory
mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
mi_assert_internal(start != NULL);
void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
if (!mi_option_is_enabled(mi_option_eager_commit)) {
_mi_os_commit(blocks_start, mi_good_commit_size(size), tld->stats); // only commit needed size (unless using large OS pages)
}
// and return the allocation
mi_atomic_write(&region_next_idx,idx); // next search from here
*p = blocks_start;
*id = (idx*MI_REGION_MAP_BITS) + bitidx;
return true;
}
// Allocate `blocks` in a `region` at `idx` of a given `size`.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, void** p, size_t* id, mi_os_tld_t* tld) {
mi_assert_internal(p != NULL && id != NULL);
mi_assert_internal(blocks < MI_REGION_MAP_BITS);
const uintptr_t mask = mi_region_block_mask(blocks,0);
const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
size_t bitidx = 0;
uintptr_t map;
uintptr_t newmap;
do { // while no atomic claim success and not all bits seen
// find the first free range of bits
map = mi_atomic_read(&region->map);
size_t m = map;
do {
// skip ones
while ((m&1) == 1) { bitidx++; m>>=1; }
// count zeros
mi_assert_internal((m&1)==0);
size_t zeros = 1;
m >>= 1;
while(zeros < blocks && (m&1)==0) { zeros++; m>>=1; }
if (zeros == blocks) break; // found a range that fits
bitidx += zeros;
}
while(bitidx <= bitidx_max);
if (bitidx > bitidx_max) {
return true; // no error, but could not find a range either
}
// try to claim it
mi_assert_internal( (mask << bitidx) >> bitidx == mask ); // no overflow?
mi_assert_internal( (map & (mask << bitidx)) == 0); // fits in zero range
newmap = map | (mask << bitidx);
mi_assert_internal((newmap^map) >> bitidx == mask);
}
while(!mi_atomic_compare_exchange(&region->map, newmap, map));
// success, we claimed the blocks atomically
// now commit the block memory -- this can still fail
return mi_region_commit_blocks(region, idx, bitidx, blocks, size, p, id, tld);
}
// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, void** p, size_t* id, mi_os_tld_t* tld)
{
// check if there are available blocks in the region..
mi_assert_internal(idx < MI_REGION_MAX);
mem_region_t* region = &regions[idx];
uintptr_t m = mi_atomic_read(&region->map);
if (m != MI_REGION_MAP_FULL) { // some bits are zero
return mi_region_alloc_blocks(region, idx, blocks, size, p, id, tld);
}
else {
return true; // no error, but no success either
}
}
/* ----------------------------------------------------------------------------
Allocation
-----------------------------------------------------------------------------*/
// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, size_t* id, mi_os_tld_t* tld)
{
mi_assert_internal(id != NULL && tld != NULL);
mi_assert_internal(size > 0);
*id = SIZE_MAX;
// use direct OS allocation for huge blocks or alignment (with `id = SIZE_MAX`)
if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) {
return _mi_os_alloc_aligned(mi_good_commit_size(size), alignment, true, tld); // round up size
}
// always round size to OS page size multiple (so commit/decommit go over the entire range)
// TODO: use large OS page size here?
size = _mi_align_up(size, _mi_os_page_size());
// calculate the number of needed blocks
size_t blocks = mi_region_block_count(size);
mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE);
// find a range of free blocks
void* p = NULL;
size_t count = mi_atomic_read(&regions_count);
size_t idx = mi_atomic_read(&region_next_idx);
for (size_t visited = 0; visited < count; visited++, idx++) {
if (!mi_region_try_alloc_blocks(idx%count, blocks, size, &p, id, tld)) return NULL; // error
if (p != NULL) break;
}
if (p == NULL) {
// no free range in existing regions -- try to extend beyond the count
for (idx = count; idx < MI_REGION_MAX; idx++) {
if (!mi_region_try_alloc_blocks(idx, blocks, size, &p, id, tld)) return NULL; // error
if (p != NULL) break;
}
}
if (p == NULL) {
// we could not find a place to allocate, fall back to the os directly
p = _mi_os_alloc_aligned(size, alignment, true, tld);
}
mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0);
return p;
}
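// Worked example of the id encoding above (illustrative, 64 map bits): blocks claimed in
// region idx == 2 at bitidx == 5 get id == 2*64 + 5 == 133; `_mi_mem_free` below recovers
// idx == 133/64 == 2 and bitidx == 133%64 == 5, while id == SIZE_MAX marks a direct OS allocation.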
// Allocate `size` memory. Return non NULL on success, with a given memory `id`.
void* _mi_mem_alloc(size_t size, size_t* id, mi_os_tld_t* tld) {
return _mi_mem_alloc_aligned(size,0,id,tld);
}
/* ----------------------------------------------------------------------------
Free
-----------------------------------------------------------------------------*/
// Free previously allocated memory with a given id.
void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
mi_assert_internal(size > 0 && stats != NULL);
if (p==NULL) return;
if (size==0) return;
if (id == SIZE_MAX) {
// was a direct OS allocation, pass through
_mi_os_free(p, size, stats);
}
else {
// allocated in a region
mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return;
// we can align the size up to page size (as we allocate that way too)
// this ensures we fully commit/decommit/reset
size = _mi_align_up(size, _mi_os_page_size());
size_t idx = (id / MI_REGION_MAP_BITS);
size_t bitidx = (id % MI_REGION_MAP_BITS);
size_t blocks = mi_region_block_count(size);
size_t mask = mi_region_block_mask(blocks, bitidx);
mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`?
mem_region_t* region = &regions[idx];
mi_assert_internal((mi_atomic_read(&region->map) & mask) == mask ); // claimed?
void* start = mi_atomic_read_ptr(&region->start);
mi_assert_internal(start != NULL);
void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
mi_assert_internal(blocks_start == p); // not a pointer in our area?
mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS);
if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`?
// decommit (or reset) the blocks to reduce the working set.
// TODO: implement delayed decommit/reset as these calls are too expensive
// if the memory is reused soon.
// reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
if (mi_option_is_enabled(mi_option_eager_commit)) {
// _mi_os_reset(p, size, stats); // 10x slowdown on malloc-large
}
else {
// _mi_os_decommit(p, size, stats); // 17x slowdown on malloc-large
}
// TODO: should we free empty regions?
// this frees up virtual address space which
// might be useful on 32-bit systems?
// and unclaim
uintptr_t map;
uintptr_t newmap;
do {
map = mi_atomic_read(&region->map);
newmap = map & ~mask;
} while (!mi_atomic_compare_exchange(&region->map, newmap, map));
}
}
/* ----------------------------------------------------------------------------
Other
-----------------------------------------------------------------------------*/
bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats) {
return _mi_os_reset(p, size, stats);
}
bool _mi_mem_protect(void* p, size_t size) {
return _mi_os_protect(p, size);
}
bool _mi_mem_unprotect(void* p, size_t size) {
return _mi_os_unprotect(p, size);
}
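// Minimal usage sketch of this interface (hypothetical caller; `os_tld` and `stats` stand for
// the thread-local state that real callers such as segment.c already carry around):
static void example_region_roundtrip(mi_os_tld_t* os_tld, mi_stats_t* stats) {
  size_t memid = 0;
  void* p = _mi_mem_alloc_aligned(MI_SEGMENT_SIZE, MI_SEGMENT_SIZE, &memid, os_tld);  // claim one segment-sized block
  if (p == NULL) return;  // out of memory
  // ... use the block; `memid` travels with it (see the new `mi_segment_t.memid` field) ...
  _mi_mem_free(p, MI_SEGMENT_SIZE, memid, stats);  // pass the same id back so the right region bits are unclaimed
}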

View File

@ -30,8 +30,8 @@ typedef struct mi_option_desc_s {
 static mi_option_desc_t options[_mi_option_last] = {
   { 0, UNINIT, "page_reset" },
   { 0, UNINIT, "cache_reset" },
-  { 0, UNINIT, "pool_commit" },
-  { 0, UNINIT, "large_os_pages" }, // use large OS pages
+  { 1, UNINIT, "eager_commit" },   // on by default as it seems to be faster in general
+  { 0, UNINIT, "large_os_pages" }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
 #if MI_SECURE
   { MI_SECURE, INITIALIZED, "secure" }, // in secure build the environment setting is ignored
 #else

src/os.c (321 changed lines)
View File

@ -12,7 +12,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc-internal.h"
 #include <string.h>  // memset
-#include <stdio.h>   // debug fprintf
 #include <errno.h>
 /* -----------------------------------------------------------
@ -28,15 +27,37 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <unistd.h>  // sysconf
 #endif
+// page size (initialized properly in `os_init`)
+static size_t os_page_size = 4096;
+// minimal allocation granularity
+static size_t os_alloc_granularity = 4096;
 // if non-zero, use large page allocation
 static size_t large_os_page_size = 0;
+// OS (small) page size
+size_t _mi_os_page_size() {
+  return os_page_size;
+}
+// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
+size_t _mi_os_large_page_size() {
+  return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
+}
 static bool use_large_os_page(size_t size, size_t alignment) {
   // if we have access, check the size and alignment requirements
   if (large_os_page_size == 0) return false;
   return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
 }
+// round to a good allocation size
+static size_t mi_os_good_alloc_size(size_t size, size_t alignment) {
+  UNUSED(alignment);
+  if (size >= (SIZE_MAX - os_alloc_granularity)) return size; // possible overflow?
+  return _mi_align_up(size, os_alloc_granularity);
+}
 #if defined(_WIN32)
 // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
@ -45,11 +66,17 @@ typedef PVOID (*VirtualAlloc2Ptr)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MEM_EXTEN
 static VirtualAlloc2Ptr pVirtualAlloc2 = NULL;
 void _mi_os_init(void) {
-  // Try to get the VirtualAlloc2 function (only supported on Windows 10 and Windows Server 2016)
+  // get the page size
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  if (si.dwPageSize > 0) os_page_size = si.dwPageSize;
+  if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity;
+  // get the VirtualAlloc2 function
   HINSTANCE hDll;
   hDll = LoadLibrary("kernelbase.dll");
   if (hDll!=NULL) {
-    pVirtualAlloc2 = (VirtualAlloc2Ptr)GetProcAddress(hDll, "VirtualAlloc2");
+    // use VirtualAlloc2FromApp as it is available to Windows store apps
+    pVirtualAlloc2 = (VirtualAlloc2Ptr)GetProcAddress(hDll, "VirtualAlloc2FromApp");
     FreeLibrary(hDll);
   }
   // Try to see if large OS pages are supported
@ -86,8 +113,15 @@ void _mi_os_init(void) {
 }
 #else
 void _mi_os_init() {
-  // nothing to do
-  use_large_os_page(0, 0); // dummy call to suppress warnings
+  // get the page size
+  long result = sysconf(_SC_PAGESIZE);
+  if (result > 0) {
+    os_page_size = (size_t)result;
+    os_alloc_granularity = os_page_size;
+  }
+  if (mi_option_is_enabled(mi_option_large_os_pages)) {
+    large_os_page_size = (1UL<<21); // 2MiB
+  }
 }
 #endif
@ -116,26 +150,8 @@ static void* mi_align_down_ptr(void* p, size_t alignment) {
   return (void*)_mi_align_down((uintptr_t)p, alignment);
 }
-static void* os_pool_alloc(size_t size, size_t alignment, mi_os_tld_t* tld);
-// cached OS page size
-size_t _mi_os_page_size(void) {
-  static size_t page_size = 0;
-  if (page_size == 0) {
-#if defined(_WIN32)
-    SYSTEM_INFO si;
-    GetSystemInfo(&si);
-    page_size = (si.dwPageSize > 0 ? si.dwPageSize : 4096);
-#else
-    long result = sysconf(_SC_PAGESIZE);
-    page_size = (result > 0 ? (size_t)result : 4096);
-#endif
-  }
-  return page_size;
-}
-static bool mi_munmap(void* addr, size_t size)
+static bool mi_os_mem_free(void* addr, size_t size, mi_stats_t* stats)
 {
   if (addr == NULL || size == 0) return true;
   bool err = false;
@ -144,6 +160,8 @@ static bool mi_munmap(void* addr, size_t size)
 #else
   err = (munmap(addr, size) == -1);
 #endif
+  _mi_stat_decrease(&stats->committed, size); // TODO: what if never committed?
+  _mi_stat_decrease(&stats->reserved, size);
   if (err) {
 #pragma warning(suppress:4996)
     _mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size);
@ -154,16 +172,18 @@ static bool mi_munmap(void* addr, size_t size)
   }
 }
-static void* mi_mmap(void* addr, size_t size, int extra_flags, mi_stats_t* stats) {
+static void* mi_os_mem_alloc(void* addr, size_t size, bool commit, int extra_flags, mi_stats_t* stats) {
   UNUSED(stats);
   if (size == 0) return NULL;
   void* p = NULL;
 #if defined(_WIN32)
+  int flags = MEM_RESERVE | extra_flags;
+  if (commit) flags |= MEM_COMMIT;
   if (use_large_os_page(size, 0)) {
-    p = VirtualAlloc(addr, size, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT | extra_flags, PAGE_READWRITE);
+    p = VirtualAlloc(addr, size, MEM_LARGE_PAGES | flags, PAGE_READWRITE);
   }
   if (p == NULL) {
-    p = VirtualAlloc(addr, size, MEM_RESERVE | MEM_COMMIT | extra_flags, PAGE_READWRITE);
+    p = VirtualAlloc(addr, size, flags, PAGE_READWRITE);
   }
 #else
 #if !defined(MAP_ANONYMOUS)
@ -179,19 +199,43 @@ static void* mi_mmap(void* addr, size_t size, int extra_flags, mi_stats_t* stats
     flags |= MAP_FIXED;
 #endif
   }
-  p = mmap(addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0);
-  if (p == MAP_FAILED) p = NULL;
+  if (large_os_page_size > 0 && use_large_os_page(size, 0) && ((uintptr_t)addr % large_os_page_size) == 0) {
+    int lflags = flags;
+    #ifdef MAP_ALIGNED_SUPER
+    lflags |= MAP_ALIGNED_SUPER;
+    #endif
+    #ifdef MAP_HUGETLB
+    lflags |= MAP_HUGETLB;
+    #endif
+    #ifdef MAP_HUGE_2MB
+    lflags |= MAP_HUGE_2MB;
+    #endif
+    if (lflags != flags) {
+      // try large page allocation
+      p = mmap(addr, size, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE), lflags, -1, 0);
+      if (p == MAP_FAILED) p = NULL;
+    }
+  }
+  if (p == NULL) {
+    p = mmap(addr, size, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE), flags, -1, 0);
+    if (p == MAP_FAILED) p = NULL;
+  }
   if (addr != NULL && p != addr) {
-    mi_munmap(p, size);
+    mi_os_mem_free(p, size, stats);
     p = NULL;
   }
 #endif
-  UNUSED(stats);
   mi_assert(p == NULL || (addr == NULL && p != addr) || (addr != NULL && p == addr));
-  if (p != NULL) mi_stat_increase(stats->mmap_calls, 1);
+  if (p != NULL) {
+    mi_stat_increase(stats->mmap_calls, 1);
+    mi_stat_increase(stats->reserved, size);
+    if (commit) mi_stat_increase(stats->committed, size);
+  }
   return p;
 }
-static void* mi_mmap_aligned(size_t size, size_t alignment, mi_stats_t* stats) {
+static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, mi_stats_t* stats) {
   if (alignment < _mi_os_page_size() || ((alignment & (~alignment + 1)) != alignment)) return NULL;
   void* p = NULL;
 #if defined(_WIN32) && defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
@ -202,27 +246,33 @@ static void* mi_mmap_aligned(size_t size, size_t alignment, mi_stats_t* stats) {
     MEM_EXTENDED_PARAMETER param = { 0 };
     param.Type = MemExtendedParameterAddressRequirements;
     param.Pointer = &reqs;
-    DWORD extra_flags = 0;
-    if (use_large_os_page(size, alignment)) extra_flags |= MEM_LARGE_PAGES;
-    p = (*pVirtualAlloc2)(NULL, NULL, size, MEM_RESERVE | MEM_COMMIT | extra_flags, PAGE_READWRITE, &param, 1);
+    DWORD flags = MEM_RESERVE;
+    if (commit) flags |= MEM_COMMIT;
+    if (use_large_os_page(size, alignment)) flags |= MEM_LARGE_PAGES;
+    p = (*pVirtualAlloc2)(NULL, NULL, size, flags, PAGE_READWRITE, &param, 1);
   }
 #elif defined(MAP_ALIGNED)
   // on BSD, use the aligned mmap api
   size_t n = _mi_bsr(alignment);
-  if ((size_t)1 << n == alignment && n >= 12) { // alignment is a power of 2 and >= 4096
-    p = mi_mmap(suggest, size, MAP_ALIGNED(n), tld->stats); // use the NetBSD/freeBSD aligned flags
+  if (((size_t)1 << n) == alignment && n >= 12) { // alignment is a power of 2 and >= 4096
+    p = mi_os_mem_alloc(suggest, size, commit, MAP_ALIGNED(n), tld->stats); // use the NetBSD/freeBSD aligned flags
   }
 #else
   UNUSED(size);
   UNUSED(alignment);
 #endif
+  UNUSED(stats); // if !STATS
   mi_assert(p == NULL || (uintptr_t)p % alignment == 0);
-  if (p != NULL) mi_stat_increase(stats->mmap_calls, 1);
+  if (p != NULL) {
+    mi_stat_increase(stats->mmap_calls, 1);
+    mi_stat_increase(stats->reserved, size);
+    if (commit) mi_stat_increase(stats->committed, size);
+  }
   return p;
 }
-static void* mi_os_page_align_region(void* addr, size_t size, size_t* newsize) {
+// Conservatively OS page align within a given area
+static void* mi_os_page_align_area(void* addr, size_t size, size_t* newsize) {
   mi_assert(addr != NULL && size > 0);
   if (newsize != NULL) *newsize = 0;
   if (size == 0 || addr == NULL) return NULL;
@ -242,16 +292,31 @@ static void* mi_os_page_align_region(void* addr, size_t size, size_t* newsize) {
 // but may be used later again. This will release physical memory
 // pages and reduce swapping while keeping the memory committed.
 // We page align to a conservative area inside the range to reset.
-bool _mi_os_reset(void* addr, size_t size) {
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
   // page align conservatively within the range
   size_t csize;
-  void* start = mi_os_page_align_region(addr,size,&csize);
+  void* start = mi_os_page_align_area(addr,size,&csize);
   if (csize==0) return true;
+  UNUSED(stats); // if !STATS
+  mi_stat_increase(stats->reset, csize);
 #if defined(_WIN32)
+  // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
+  // (but this is for an access pattern that immediately reuses the memory)
+  /*
+  DWORD ok = DiscardVirtualMemory(start, csize);
+  return (ok != 0);
+  */
   void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
   mi_assert(p == start);
-  return (p == start);
+  if (p != start) return false;
+  /*
+  // VirtualUnlock removes the memory eagerly from the current working set (which MEM_RESET does lazily on demand)
+  // TODO: put this behind an option?
+  DWORD ok = VirtualUnlock(start, csize);
+  if (ok != 0) return false;
+  */
+  return true;
 #else
 #if defined(MADV_FREE)
   static int advice = MADV_FREE;
@ -276,19 +341,19 @@ bool _mi_os_reset(void* addr, size_t size) {
 static bool mi_os_protectx(void* addr, size_t size, bool protect) {
   // page align conservatively within the range
   size_t csize = 0;
-  void* start = mi_os_page_align_region(addr, size, &csize);
+  void* start = mi_os_page_align_area(addr, size, &csize);
   if (csize==0) return false;
   int err = 0;
 #ifdef _WIN32
   DWORD oldprotect = 0;
   BOOL ok = VirtualProtect(start,csize,protect ? PAGE_NOACCESS : PAGE_READWRITE,&oldprotect);
-  err = (ok ? 0 : -1);
+  err = (ok ? 0 : GetLastError());
 #else
   err = mprotect(start,csize,protect ? PROT_NONE : (PROT_READ|PROT_WRITE));
 #endif
   if (err != 0) {
-    _mi_warning_message("mprotect error: start: 0x%8p, csize: 0x%8zux, errno: %i\n", start, csize, errno);
+    _mi_warning_message("mprotect error: start: 0x%8p, csize: 0x%8zux, err: %i\n", start, csize, err);
   }
   return (err==0);
 }
@ -301,24 +366,48 @@ bool _mi_os_unprotect(void* addr, size_t size) {
   return mi_os_protectx(addr, size, false);
 }
-bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize) {
+// Commit/Decommit memory.
+// We page align to a conservative area inside the range to reset.
+static bool mi_os_commitx(void* addr, size_t size, bool commit, mi_stats_t* stats) {
   // page align conservatively within the range
-  mi_assert_internal(oldsize > newsize && p != NULL);
-  if (oldsize < newsize || p==NULL) return false;
-  if (oldsize == newsize) return true;
-  // oldsize and newsize should be page aligned or we cannot shrink precisely
-  void* addr = (uint8_t*)p + newsize;
-  size_t size = 0;
-  void* start = mi_os_page_align_region(addr, oldsize - newsize, &size);
-  if (size==0 || start != addr) return false;
-#ifdef _WIN32
-  // we cannot shrink on windows
-  return false;
+  size_t csize;
+  void* start = mi_os_page_align_area(addr, size, &csize);
+  if (csize == 0) return true;
+  int err = 0;
+  UNUSED(stats); // if !STATS
+  if (commit) {
+    mi_stat_increase(stats->committed, csize);
+    mi_stat_increase(stats->commit_calls,1);
+  }
+  else {
+    mi_stat_decrease(stats->committed, csize);
+  }
+#if defined(_WIN32)
+  if (commit) {
+    void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
+    err = (p == start ? 0 : GetLastError());
+  }
+  else {
+    BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
+    err = (ok ? 0 : GetLastError());
+  }
 #else
-  return mi_munmap( start, size );
+  err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE));
 #endif
+  if (err != 0) {
+    _mi_warning_message("commit/decommit error: start: 0x%8p, csize: 0x%8zux, err: %i\n", start, csize, err);
+  }
+  mi_assert_internal(err == 0);
+  return (err == 0);
+}
+bool _mi_os_commit(void* addr, size_t size, mi_stats_t* stats) {
+  return mi_os_commitx(addr, size, true, stats);
+}
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) {
+  return mi_os_commitx(addr, size, false, stats);
 }
 /* -----------------------------------------------------------
@ -327,22 +416,21 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize) {
 void* _mi_os_alloc(size_t size, mi_stats_t* stats) {
   if (size == 0) return NULL;
-  void* p = mi_mmap(NULL, size, 0, stats);
+  size = mi_os_good_alloc_size(size, 0);
+  void* p = mi_os_mem_alloc(NULL, size, true, 0, stats);
   mi_assert(p!=NULL);
-  if (p != NULL) mi_stat_increase(stats->reserved, size);
   return p;
 }
 void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
   UNUSED(stats);
-  mi_munmap(p, size);
-  mi_stat_decrease(stats->reserved, size);
+  mi_os_mem_free(p, size, stats);
 }
 // Slow but guaranteed way to allocate aligned memory
 // by over-allocating and then reallocating at a fixed aligned
 // address that should be available then.
-static void* mi_os_alloc_aligned_ensured(size_t size, size_t alignment, size_t trie, mi_stats_t* stats)
+static void* mi_os_alloc_aligned_ensured(size_t size, size_t alignment, bool commit, size_t trie, mi_stats_t* stats)
 {
   if (trie >= 3) return NULL; // stop recursion (only on Windows)
   size_t alloc_size = size + alignment;
@ -350,28 +438,28 @@ static void* mi_os_alloc_aligned_ensured(size_t size, size_t alignment, size_t t
   if (alloc_size < size) return NULL;
   // allocate a chunk that includes the alignment
-  void* p = mi_mmap(NULL, alloc_size, 0, stats);
+  void* p = mi_os_mem_alloc(NULL, alloc_size, commit, 0, stats);
   if (p == NULL) return NULL;
   // create an aligned pointer in the allocated area
   void* aligned_p = mi_align_up_ptr(p, alignment);
   mi_assert(aligned_p != NULL);
-#if defined(_WIN32)
   // free it and try to allocate `size` at exactly `aligned_p`
-  // note: this may fail in case another thread happens to VirtualAlloc
+  // note: this may fail in case another thread happens to allocate
   // concurrently at that spot. We try up to 3 times to mitigate this.
-  mi_munmap(p, alloc_size);
-  p = mi_mmap(aligned_p, size, 0, stats);
+  mi_os_mem_free(p, alloc_size, stats);
+  p = mi_os_mem_alloc(aligned_p, size, commit, 0, stats);
   if (p != aligned_p) {
-    if (p != NULL) mi_munmap(p, size);
-    return mi_os_alloc_aligned_ensured(size, alignment, trie++, stats);
+    if (p != NULL) mi_os_mem_free(p, size, stats);
+    return mi_os_alloc_aligned_ensured(size, alignment, commit, trie++, stats);
   }
-#else
+#if 0  // could use this on mmap systems
   // we selectively unmap parts around the over-allocated area.
   size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
   size_t mid_size = _mi_align_up(size, _mi_os_page_size());
   size_t post_size = alloc_size - pre_size - mid_size;
-  if (pre_size > 0) mi_munmap(p, pre_size);
-  if (post_size > 0) mi_munmap((uint8_t*)aligned_p + mid_size, post_size);
+  if (pre_size > 0) mi_os_mem_free(p, pre_size, stats);
+  if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, stats);
 #endif
   mi_assert(((uintptr_t)aligned_p) % alignment == 0);
@ -382,22 +470,21 @@ static void* mi_os_alloc_aligned_ensured(size_t size, size_t alignment, size_t t
 // Since `mi_mmap` is relatively slow we try to allocate directly at first and
 // hope to get an aligned address; only when that fails we fall back
 // to a guaranteed method by overallocating at first and adjusting.
-// TODO: use VirtualAlloc2 with alignment on Windows 10 / Windows Server 2016.
-void* _mi_os_alloc_aligned(size_t size, size_t alignment, mi_os_tld_t* tld)
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, mi_os_tld_t* tld)
 {
   if (size == 0) return NULL;
-  if (alignment < 1024) return _mi_os_alloc(size, tld->stats);
-  void* p = os_pool_alloc(size,alignment,tld);
-  if (p != NULL) return p;
+  size = mi_os_good_alloc_size(size,alignment);
+  if (alignment < 1024) return mi_os_mem_alloc(NULL, size, commit, 0, tld->stats);
+  // try direct OS aligned allocation; only supported on BSD and Windows 10+
   void* suggest = NULL;
-  p = mi_mmap_aligned(size,alignment,tld->stats);
+  void* p = mi_os_mem_alloc_aligned(size,alignment,commit,tld->stats);
+  // Fall back
   if (p==NULL && (tld->mmap_next_probable % alignment) == 0) {
     // if the next probable address is aligned,
     // then try to just allocate `size` and hope it is aligned...
-    p = mi_mmap(suggest, size, 0, tld->stats);
+    p = mi_os_mem_alloc(suggest, size, commit, 0, tld->stats);
     if (p == NULL) return NULL;
     if (((uintptr_t)p % alignment) == 0) mi_stat_increase(tld->stats->mmap_right_align, 1);
   }
@ -406,75 +493,23 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, mi_os_tld_t* tld)
   if (p==NULL || ((uintptr_t)p % alignment) != 0) {
     // if `p` is not yet aligned after all, free the block and use a slower
     // but guaranteed way to allocate an aligned block
-    if (p != NULL) mi_munmap(p, size);
+    if (p != NULL) mi_os_mem_free(p, size, tld->stats);
     mi_stat_increase( tld->stats->mmap_ensure_aligned, 1);
     //fprintf(stderr, "mimalloc: slow mmap 0x%lx\n", _mi_thread_id());
-    p = mi_os_alloc_aligned_ensured(size, alignment,0,tld->stats);
+    p = mi_os_alloc_aligned_ensured(size, alignment,commit,0,tld->stats);
   }
   if (p != NULL) {
-    mi_stat_increase( tld->stats->reserved, size);
     // next probable address is the page-aligned address just after the newly allocated area.
-    const size_t alloc_align =
-#if defined(_WIN32)
-      64 * 1024; // Windows allocates 64kb aligned
-#else
-      _mi_os_page_size(); // page size on other OS's
-#endif
     size_t probable_size = MI_SEGMENT_SIZE;
     if (tld->mmap_previous > p) {
       // Linux tends to allocate downward
-      tld->mmap_next_probable = _mi_align_down((uintptr_t)p - probable_size, alloc_align); // ((uintptr_t)previous - (uintptr_t)p);
+      tld->mmap_next_probable = _mi_align_down((uintptr_t)p - probable_size, os_alloc_granularity); // ((uintptr_t)previous - (uintptr_t)p);
     }
     else {
       // Otherwise, guess the next address is page aligned `size` from current pointer
-      tld->mmap_next_probable = _mi_align_up((uintptr_t)p + probable_size, alloc_align);
+      tld->mmap_next_probable = _mi_align_up((uintptr_t)p + probable_size, os_alloc_granularity);
     }
     tld->mmap_previous = p;
   }
   return p;
 }
-// Pooled allocation: on 64-bit systems with plenty
-// of virtual addresses, we allocate 10 segments at the
-// time to minimize `mmap` calls and increase aligned
-// allocations. This is only good on systems that
-// do overcommit so we put it behind the `MIMALLOC_POOL_COMMIT` option.
-// For now, we disable it on windows as VirtualFree must
-// be called on the original allocation and cannot be called
-// for individual fragments.
-#if defined(_WIN32) || (MI_INTPTR_SIZE<8)
-static void* os_pool_alloc(size_t size, size_t alignment, mi_os_tld_t* tld) {
-  UNUSED(size);
-  UNUSED(alignment);
-  UNUSED(tld);
-  return NULL;
-}
-#else
-#define MI_POOL_ALIGNMENT MI_SEGMENT_SIZE
-#define MI_POOL_SIZE (10*MI_POOL_ALIGNMENT)
-static void* os_pool_alloc(size_t size, size_t alignment, mi_os_tld_t* tld)
-{
-  if (!mi_option_is_enabled(mi_option_pool_commit)) return NULL;
-  if (alignment != MI_POOL_ALIGNMENT) return NULL;
-  size = _mi_align_up(size,MI_POOL_ALIGNMENT);
-  if (size > MI_POOL_SIZE) return NULL;
-  if (tld->pool_available == 0) {
-    tld->pool = (uint8_t*)mi_os_alloc_aligned_ensured(MI_POOL_SIZE,MI_POOL_ALIGNMENT,0,tld->stats);
-    if (tld->pool == NULL) return NULL;
-    tld->pool_available += MI_POOL_SIZE;
-  }
-  if (size > tld->pool_available) return NULL;
-  void* p = tld->pool;
-  tld->pool_available -= size;
-  tld->pool += size;
-  return p;
-}
-#endif
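// Sketch of the new reserve-then-commit pattern these changes enable (illustrative only;
// `_mi_os_commit` is the internal helper defined above and prototyped in memory.c):
static void* example_reserve_then_commit(size_t size, mi_os_tld_t* tld) {
  void* p = _mi_os_alloc_aligned(size, MI_SEGMENT_SIZE, false, tld);  // commit == false: reserve address space only
  if (p == NULL) return NULL;
  _mi_os_commit(p, _mi_os_page_size(), tld->stats);                   // commit just the first page before first use
  return p;
}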

View File

@ -267,7 +267,9 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
 static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
   mi_assert_internal(page->heap == NULL);
   mi_assert_internal(!mi_page_queue_contains(queue, page));
-  mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) || (page->flags.in_full && mi_page_queue_is_full(queue)));
+  mi_assert_internal(page->block_size == queue->block_size ||
+                     (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) ||
+                     (page->flags.in_full && mi_page_queue_is_full(queue)));
   page->flags.in_full = mi_page_queue_is_full(queue);
   page->heap = heap;
@ -292,9 +294,11 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro
   mi_assert_internal(page != NULL);
   mi_assert_expensive(mi_page_queue_contains(from, page));
   mi_assert_expensive(!mi_page_queue_contains(to, page));
-  mi_assert_internal(page->block_size == to->block_size ||
+  mi_assert_internal((page->block_size == to->block_size && page->block_size == from->block_size) ||
+                     (page->block_size == to->block_size && mi_page_queue_is_full(from)) ||
+                     (page->block_size == from->block_size && mi_page_queue_is_full(to)) ||
                      (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(to)) ||
-                     (page->block_size == from->block_size && mi_page_queue_is_full(to)));
+                     (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_full(to)));
   if (page->prev != NULL) page->prev->next = page->next;
   if (page->next != NULL) page->next->prev = page->prev;

View File

@ -453,7 +453,7 @@ static void mi_page_free_list_extend( mi_heap_t* heap, mi_page_t* page, size_t e
   }
   // enable the new free list
   page->capacity += (uint16_t)extend;
-  mi_stat_increase(stats->committed, extend * page->block_size);
+  mi_stat_increase(stats->page_committed, extend * page->block_size);
 }
/* ----------------------------------------------------------- /* -----------------------------------------------------------

View File

@ -108,19 +108,6 @@ static void mi_segment_enqueue(mi_segment_queue_t* queue, mi_segment_t* segment)
   }
 }
-static void mi_segment_queue_insert_before(mi_segment_queue_t* queue, mi_segment_t* elem, mi_segment_t* segment) {
-  mi_assert_expensive(elem==NULL || mi_segment_queue_contains(queue, elem));
-  mi_assert_expensive(segment != NULL && !mi_segment_queue_contains(queue, segment));
-  segment->prev = (elem == NULL ? queue->last : elem->prev);
-  if (segment->prev != NULL) segment->prev->next = segment;
-  else queue->first = segment;
-  segment->next = elem;
-  if (segment->next != NULL) segment->next->prev = segment;
-  else queue->last = segment;
-}
 // Start of the page available memory
 uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
 {
@ -176,17 +163,17 @@ static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size
 }
-/* -----------------------------------------------------------
+/* ----------------------------------------------------------------------------
   Segment caches
-  We keep a small segment cache per thread to avoid repeated allocation
-  and free in the OS if a program allocates memory and then frees
-  all again repeatedly. (We tried a one-element cache but that
-  proves to be too small for certain workloads).
------------------------------------------------------------ */
+  We keep a small segment cache per thread to increase local
+  reuse and avoid setting/clearing guard pages in secure mode.
+------------------------------------------------------------------------------- */
 static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
   if (segment_size>=0) mi_stat_increase(tld->stats->segments,1);
   else mi_stat_decrease(tld->stats->segments,1);
+  tld->count += (segment_size >= 0 ? 1 : -1);
+  if (tld->count > tld->peak_count) tld->peak_count = tld->count;
   tld->current_size += segment_size;
   if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
 }
@@ -194,123 +181,87 @@ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
 
 static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) {
   mi_segments_track_size(-((long)segment_size),tld);
-  _mi_os_free(segment, segment_size,tld->stats);
+  if (mi_option_is_enabled(mi_option_secure)) {
+    _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set
+  }
+  _mi_mem_free(segment, segment_size, segment->memid, tld->stats);
 }
 
-// The segment cache is limited to be at most 1/8 of the peak size
-// in use (and no more than 32)
-#define MI_SEGMENT_CACHE_MAX (32)
+// The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use,
+// and no more than 4.
+#define MI_SEGMENT_CACHE_MAX (4)
 #define MI_SEGMENT_CACHE_FRACTION (8)
 
-// Get a segment of at least `required` size.
-// If `required == MI_SEGMENT_SIZE` the `segment_size` will match exactly
-static mi_segment_t* _mi_segment_cache_findx(mi_segments_tld_t* tld, size_t required, bool reverse) {
-  mi_assert_internal(required % _mi_os_page_size() == 0);
-  mi_segment_t* segment = (reverse ? tld->cache.last : tld->cache.first);
-  while (segment != NULL) {
-    if (segment->segment_size >= required) {
-      tld->cache_count--;
-      tld->cache_size -= segment->segment_size;
-      mi_segment_queue_remove(&tld->cache, segment);
-      // exact size match?
-      if (required==0 || segment->segment_size == required) {
-        return segment;
-      }
-      // not more than 25% waste and on a huge page segment? (in that case the segment size does not need to match required)
-      else if (required != MI_SEGMENT_SIZE && segment->segment_size - (segment->segment_size/4) <= required) {
-        return segment;
-      }
-      // try to shrink the memory to match exactly
-      else {
-        if (mi_option_is_enabled(mi_option_secure)) {
-          _mi_os_unprotect(segment, segment->segment_size);
-        }
-        if (_mi_os_shrink(segment, segment->segment_size, required)) {
-          tld->current_size -= segment->segment_size;
-          tld->current_size += required;
-          segment->segment_size = required;
-          return segment;
-        }
-        else {
-          // if that all fails, we give up
-          mi_segment_os_free(segment,segment->segment_size,tld);
-          return NULL;
-        }
-      }
-    }
-    segment = (reverse ? segment->prev : segment->next);
-  }
-  return NULL;
-}
-
-static mi_segment_t* mi_segment_cache_find(mi_segments_tld_t* tld, size_t required) {
-  return _mi_segment_cache_findx(tld,required,false);
-}
-
-static mi_segment_t* mi_segment_cache_evict(mi_segments_tld_t* tld) {
-  // TODO: random eviction instead?
-  return _mi_segment_cache_findx(tld, 0, true /* from the end */);
+static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t* tld) {
+  if (segment_size != 0 && segment_size != MI_SEGMENT_SIZE) return NULL;
+  mi_segment_t* segment = tld->cache;
+  if (segment == NULL) return NULL;
+  tld->cache_count--;
+  tld->cache = segment->next;
+  segment->next = NULL;
+  mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE);
+  return segment;
 }
 
 static bool mi_segment_cache_full(mi_segments_tld_t* tld) {
   if (tld->cache_count < MI_SEGMENT_CACHE_MAX &&
-      tld->cache_size*MI_SEGMENT_CACHE_FRACTION < tld->peak_size) return false;
+      tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { // always allow 1 element cache
+    return false;
+  }
   // take the opportunity to reduce the segment cache if it is too large (now)
-  while (tld->cache_size*MI_SEGMENT_CACHE_FRACTION >= tld->peak_size + 1) {
-    mi_segment_t* segment = mi_segment_cache_evict(tld);
+  // TODO: this never happens as we check against peak usage, should we use current usage instead?
+  while (tld->cache_count > (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) {
+    mi_segment_t* segment = mi_segment_cache_pop(0,tld);
     mi_assert_internal(segment != NULL);
     if (segment != NULL) mi_segment_os_free(segment, segment->segment_size, tld);
   }
   return true;
 }
 
-static bool mi_segment_cache_insert(mi_segment_t* segment, mi_segments_tld_t* tld) {
-  mi_assert_internal(segment->next==NULL && segment->prev==NULL);
-  mi_assert_internal(!mi_segment_is_in_free_queue(segment,tld));
-  mi_assert_expensive(!mi_segment_queue_contains(&tld->cache, segment));
-  if (mi_segment_cache_full(tld)) return false;
+static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld));
+  mi_assert_internal(segment->next == NULL);
+  if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) {
+    return false;
+  }
+  mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE);
   if (mi_option_is_enabled(mi_option_cache_reset) && !mi_option_is_enabled(mi_option_page_reset)) {
-    _mi_os_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size);
+    _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats);
   }
-  // insert ordered
-  mi_segment_t* seg = tld->cache.first;
-  while (seg != NULL && seg->segment_size < segment->segment_size) {
-    seg = seg->next;
-  }
-  mi_segment_queue_insert_before( &tld->cache, seg, segment );
+  segment->next = tld->cache;
+  tld->cache = segment;
   tld->cache_count++;
-  tld->cache_size += segment->segment_size;
   return true;
 }
 
-// called by ending threads to free cached segments
+// called by threads that are terminating to free cached segments
 void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
   mi_segment_t* segment;
-  while ((segment = mi_segment_cache_find(tld,0)) != NULL) {
-    mi_segment_os_free(segment, MI_SEGMENT_SIZE, tld);
+  while ((segment = mi_segment_cache_pop(0,tld)) != NULL) {
+    mi_segment_os_free(segment, segment->segment_size, tld);
   }
-  mi_assert_internal(tld->cache_count == 0 && tld->cache_size == 0);
-  mi_assert_internal(mi_segment_queue_is_empty(&tld->cache));
+  mi_assert_internal(tld->cache_count == 0);
+  mi_assert_internal(tld->cache == NULL);
 }
 
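Aside (not part of the diff): the new cache policy above bounds the per-thread cache by both `MI_SEGMENT_CACHE_MAX` and a fraction of the peak segment count, so the effective limit is `min(MI_SEGMENT_CACHE_MAX, 1 + peak_count/MI_SEGMENT_CACHE_FRACTION)`. A minimal standalone sketch of that bound, reusing the constants from the hunk above:

    #include <stdio.h>
    #include <stddef.h>

    #define MI_SEGMENT_CACHE_MAX      (4)
    #define MI_SEGMENT_CACHE_FRACTION (8)

    // effective cache bound implied by mi_segment_cache_full/mi_segment_cache_push
    static size_t cache_limit(size_t peak_count) {
      size_t by_peak = 1 + (peak_count / MI_SEGMENT_CACHE_FRACTION);  // always allow a 1-element cache
      return (by_peak < MI_SEGMENT_CACHE_MAX ? by_peak : MI_SEGMENT_CACHE_MAX);
    }

    int main(void) {
      size_t peaks[] = { 2, 16, 64 };
      for (size_t i = 0; i < sizeof(peaks)/sizeof(peaks[0]); i++) {
        printf("peak %zu segments -> cache at most %zu\n", peaks[i], cache_limit(peaks[i]));
      }
      return 0;
    }

So a thread whose peak was only a couple of segments keeps at most one cached segment, while a heavily allocating thread tops out at four.
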
 /* -----------------------------------------------------------
    Segment allocation
 ----------------------------------------------------------- */
 
 // Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
-static mi_segment_t* mi_segment_alloc( size_t required, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
 {
   // calculate needed sizes first
   size_t capacity;
   if (page_kind == MI_PAGE_HUGE) {
-    mi_assert_internal(page_shift==MI_SEGMENT_SHIFT && required > 0);
+    mi_assert_internal(page_shift == MI_SEGMENT_SHIFT && required > 0);
     capacity = 1;
   }
   else {
-    mi_assert_internal(required==0);
+    mi_assert_internal(required == 0);
     size_t page_size = (size_t)1 << page_shift;
     capacity = MI_SEGMENT_SIZE / page_size;
     mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0);
@@ -318,46 +269,52 @@ static mi_segment_t* mi_segment_alloc( size_t required, mi_page_kind_t page_kind
   }
   size_t info_size;
   size_t pre_size;
-  size_t segment_size = mi_segment_size( capacity, required, &pre_size, &info_size);
+  size_t segment_size = mi_segment_size(capacity, required, &pre_size, &info_size);
   mi_assert_internal(segment_size >= required);
   size_t page_size = (page_kind == MI_PAGE_HUGE ? segment_size : (size_t)1 << page_shift);
 
-  // Allocate the segment
-  mi_segment_t* segment = NULL;
-
-  // try to get it from our caches
-  segment = mi_segment_cache_find(tld,segment_size);
-  mi_assert_internal(segment == NULL ||
-                     (segment_size==MI_SEGMENT_SIZE && segment_size == segment->segment_size) ||
-                     (segment_size!=MI_SEGMENT_SIZE && segment_size <= segment->segment_size));
-  if (segment != NULL && mi_option_is_enabled(mi_option_secure) && (segment->page_kind != page_kind || segment->segment_size != segment_size)) {
-    _mi_os_unprotect(segment,segment->segment_size);
-  }
-
-  // and otherwise allocate it from the OS
-  if (segment == NULL) {
-    segment = (mi_segment_t*)_mi_os_alloc_aligned(segment_size, MI_SEGMENT_SIZE, os_tld);
-    if (segment == NULL) return NULL;
-    mi_segments_track_size((long)segment_size,tld);
-  }
-
-  mi_assert_internal((uintptr_t)segment % MI_SEGMENT_SIZE == 0);
-
-  memset(segment, 0, info_size);
-  if (mi_option_is_enabled(mi_option_secure)) {
-    // in secure mode, we set up a protected page in between the segment info
-    // and the page data
+  // Try to get it from our thread local cache first
+  bool protection_still_good = false;
+  mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld);
+  if (segment != NULL) {
+    if (mi_option_is_enabled(mi_option_secure)) {
+      if (segment->page_kind != page_kind) {
+        _mi_mem_unprotect(segment, segment->segment_size); // reset protection if the page kind differs
+      }
+      else {
+        protection_still_good = true; // otherwise, the guard pages are still in place
+      }
+    }
+  }
+  else {
+    // Allocate the segment from the OS
+    size_t memid;
+    segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &memid, os_tld);
+    if (segment == NULL) return NULL; // failed to allocate
+    segment->memid = memid;
+    mi_segments_track_size((long)segment_size, tld);
+  }
+  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
+
+  // zero the segment info
+  { size_t memid = segment->memid;
+    memset(segment, 0, info_size);
+    segment->memid = memid;
+  }
+
+  if (mi_option_is_enabled(mi_option_secure) && !protection_still_good) {
+    // in secure mode, we set up a protected page in between the segment info and the page data
     mi_assert_internal( info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0);
-    _mi_os_protect( (uint8_t*)segment + info_size, (pre_size - info_size) );
+    _mi_mem_protect( (uint8_t*)segment + info_size, (pre_size - info_size) );
     size_t os_page_size = _mi_os_page_size();
     if (mi_option_get(mi_option_secure) <= 1) {
       // and protect the last page too
-      _mi_os_protect( (uint8_t*)segment + segment_size - os_page_size, os_page_size );
+      _mi_mem_protect( (uint8_t*)segment + segment_size - os_page_size, os_page_size );
     }
     else {
       // protect every page
       for (size_t i = 0; i < capacity; i++) {
-        _mi_os_protect( (uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size );
+        _mi_mem_protect( (uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size );
       }
     }
   }
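
Aside (not part of the diff): because the region id (`memid`) now lives inside the segment header, the code above has to save and restore it around the `memset` that zeroes the segment info, otherwise `_mi_mem_free` would later receive a zeroed id. A small standalone illustration of that pattern, using a hypothetical stand-in struct rather than mimalloc's real segment layout:

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    typedef struct example_segment_s {
      size_t memid;       // region id needed later by _mi_mem_free
      size_t other_info;  // stand-in for the rest of the segment info that must be zeroed
    } example_segment_t;

    int main(void) {
      example_segment_t seg = { 42, 7 };
      // zero the segment info but keep the region id, mirroring mi_segment_alloc above
      { size_t memid = seg.memid;
        memset(&seg, 0, sizeof(seg));
        seg.memid = memid;
      }
      printf("memid=%zu other_info=%zu\n", seg.memid, seg.other_info);  // prints memid=42 other_info=0
      return 0;
    }
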
@@ -372,7 +329,7 @@ static mi_segment_t* mi_segment_alloc( size_t required, mi_page_kind_t page_kind
   for (uint8_t i = 0; i < segment->capacity; i++) {
     segment->pages[i].segment_idx = i;
   }
-  mi_stat_increase(tld->stats->committed, segment->segment_info_size);
+  mi_stat_increase(tld->stats->page_committed, segment->segment_info_size);
   //fprintf(stderr,"mimalloc: alloc segment at %p\n", (void*)segment);
   return segment;
 }
@@ -387,6 +344,7 @@ static size_t mi_page_size(const mi_page_t* page) {
 #endif
 
 static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
+  UNUSED(force);
   //fprintf(stderr,"mimalloc: free segment at %p\n", (void*)segment);
   mi_assert(segment != NULL);
   if (mi_segment_is_in_free_queue(segment,tld)) {
@@ -403,7 +361,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
   mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment));
   mi_assert(segment->next == NULL);
   mi_assert(segment->prev == NULL);
-  mi_stat_decrease( tld->stats->committed, segment->segment_info_size);
+  mi_stat_decrease( tld->stats->page_committed, segment->segment_info_size);
   segment->thread_id = 0;
 
   // update reset memory statistics
@@ -415,7 +373,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
     }
   }
 
-  if (!force && mi_segment_cache_insert(segment, tld)) {
+  if (!force && mi_segment_cache_push(segment, tld)) {
     // it is put in our cache
   }
   else {
@@ -424,9 +382,6 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
   }
 }
 
 /* -----------------------------------------------------------
   Free page management inside a segment
 ----------------------------------------------------------- */
@@ -461,17 +416,16 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_sta
   mi_assert_internal(page->segment_in_use);
   mi_assert_internal(mi_page_all_free(page));
   size_t inuse = page->capacity * page->block_size;
-  mi_stat_decrease( stats->committed, inuse);
+  mi_stat_decrease( stats->page_committed, inuse);
   mi_stat_decrease( stats->pages, 1);
 
   // reset the page memory to reduce memory pressure?
   if (!page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
     size_t psize;
     uint8_t* start = _mi_segment_page_start(segment, page, &psize);
-    mi_stat_increase( stats->reset, psize);  // for stats we assume resetting the full page
     page->is_reset = true;
     if (inuse > 0) {
-      _mi_os_reset(start, inuse);
+      _mi_mem_reset(start, psize, stats); // TODO: just `inuse`?
     }
   }
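
Aside (not part of the diff): taken together, the segment code above now routes all of its OS interaction through the new region layer in `memory.c`. The declarations below are only a sketch inferred from the call sites in the hunks above; the return types and parameter names are assumptions, not taken from the actual headers:

    // signatures inferred from call sites in this diff; may differ from the real declarations
    void* _mi_mem_alloc_aligned(size_t size, size_t alignment, size_t* id, mi_os_tld_t* tld);  // replaces _mi_os_alloc_aligned
    void  _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats);                    // replaces _mi_os_free
    void  _mi_mem_reset(void* p, size_t size, mi_stats_t* stats);                              // replaces _mi_os_reset
    void  _mi_mem_protect(void* addr, size_t size);                                            // replaces _mi_os_protect
    void  _mi_mem_unprotect(void* addr, size_t size);                                          // replaces _mi_os_unprotect
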


@@ -94,12 +94,14 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
   mi_stat_add(&stats->reserved, &src->reserved, 1);
   mi_stat_add(&stats->committed, &src->committed, 1);
   mi_stat_add(&stats->reset, &src->reset, 1);
+  mi_stat_add(&stats->page_committed, &src->page_committed, 1);
   mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
   mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
   mi_stat_add(&stats->mmap_calls, &src->mmap_calls, 1);
   mi_stat_add(&stats->mmap_ensure_aligned, &src->mmap_ensure_aligned, 1);
   mi_stat_add(&stats->mmap_right_align, &src->mmap_right_align, 1);
+  mi_stat_add(&stats->commit_calls, &src->commit_calls, 1);
   mi_stat_add(&stats->threads, &src->threads, 1);
   mi_stat_add(&stats->pages_extended, &src->pages_extended, 1);
@@ -226,9 +228,10 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, FILE* out) mi_attr_n
   _mi_fprintf(out, "malloc requested: ");
   mi_print_amount(stats->malloc.allocated, 1, out);
   _mi_fprintf(out, "\n\n");
-  mi_stat_print(&stats->committed, "committed", 1, out);
   mi_stat_print(&stats->reserved, "reserved", 1, out);
+  mi_stat_print(&stats->committed, "committed", 1, out);
   mi_stat_print(&stats->reset, "reset", -1, out);
+  mi_stat_print(&stats->page_committed, "touched", 1, out);
   mi_stat_print(&stats->segments, "segments", -1, out);
   mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out);
   mi_stat_print(&stats->pages, "pages", -1, out);
@@ -237,6 +240,7 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, FILE* out) mi_attr_n
   mi_stat_print(&stats->mmap_calls, "mmaps", 0, out);
   mi_stat_print(&stats->mmap_right_align, "mmap fast", 0, out);
   mi_stat_print(&stats->mmap_ensure_aligned, "mmap slow", 0, out);
+  mi_stat_print(&stats->commit_calls, "commits", 0, out);
   mi_stat_print(&stats->threads, "threads", 0, out);
   mi_stat_counter_print(&stats->searches, "searches", out);
 #endif


@@ -139,6 +139,8 @@ int main() {
   CHECK("heap_destroy", test_heap1());
   CHECK("heap_delete", test_heap2());
 
+  //mi_stats_print(NULL);
+
   // ---------------------------------------------------
   // Done
   // ---------------------------------------------------[]