refactor and improve atomic bitmap usage

This commit is contained in:
daan 2019-11-07 10:26:52 -08:00
parent b09282bc0d
commit 378716c467
10 changed files with 183 additions and 131 deletions

View File

@ -10,6 +10,7 @@ option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF)
option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF)
option(MI_SECURE_FULL "Use full security mitigations (like double free protection, more expensive)" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
option(MI_BUILD_TESTS "Build test executables" ON)
@ -70,9 +71,14 @@ if(MI_OVERRIDE MATCHES "ON")
endif()
endif()
if(MI_SECURE MATCHES "ON")
message(STATUS "Set secure build (MI_SECURE=ON)")
list(APPEND mi_defines MI_SECURE=3)
if(MI_SECURE_FULL MATCHES "ON")
message(STATUS "Set full secure build (experimental) (MI_SECURE_FULL=ON)")
list(APPEND mi_defines MI_SECURE=4)
else()
if(MI_SECURE MATCHES "ON")
message(STATUS "Set secure build (MI_SECURE=ON)")
list(APPEND mi_defines MI_SECURE=3)
endif()
endif()
if(MI_SEE_ASM MATCHES "ON")

View File

@ -232,6 +232,9 @@
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.inc.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\memory.c" />

View File

@ -218,7 +218,9 @@
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.inc.c" />
<ClCompile Include="..\..\src\bitmap.inc.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\memory.c" />

View File

@ -163,7 +163,6 @@ bool _mi_page_is_valid(mi_page_t* page);
// Overflow detecting multiply
#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
#include <limits.h> // UINT_MAX, ULONG_MAX
@ -175,6 +174,7 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
return __builtin_umulll_overflow(count, size, total);
#endif
#else /* __builtin_umul_overflow is unavailable */
#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
*total = count * size;
return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
&& size > 0 && (SIZE_MAX / size) < count);
@ -188,6 +188,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) {
// Align upwards
static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return ((sz + mask) & ~mask);
@ -197,6 +198,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
}
}
// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
mi_assert_internal(divider != 0);
return (divider == 0 ? size : ((size + divider - 1) / divider));
}
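A quick check of the stated bound with illustrative numbers: for s = 10 and d = 4, _mi_divide_up(10, 4) == (10 + 4 - 1) / 4 == 3, and indeed 10 <= 3*4 = 12 < 10 + 4 = 14.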
// Is memory zero initialized?
static inline bool mi_mem_is_zero(void* p, size_t size) {
for (size_t i = 0; i < size; i++) {
@ -283,7 +290,7 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
// if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages
ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE);
mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE);
uintptr_t idx = (uintptr_t)diff >> segment->page_shift;
mi_assert_internal(idx < segment->capacity);
mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);

View File

@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
// #define MI_SECURE 4 // experimental, may be more expensive: checks for double free.
#if !defined(MI_SECURE)
#define MI_SECURE 0
#define MI_SECURE 4
#endif
// Define MI_DEBUG for debug mode
@ -93,12 +93,12 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4mb
// Derived constants
#define MI_SEGMENT_SIZE (1<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_SIZE (1UL<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_MASK ((uintptr_t)MI_SEGMENT_SIZE - 1)
#define MI_SMALL_PAGE_SIZE (1<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (1<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (1<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGE_SIZE (1UL<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (1UL<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (1UL<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)

View File

@ -7,12 +7,16 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
"Arenas" are fixed area's of OS memory from which we can allocate
large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). Currently only used to
allocate in one arena consisting of huge OS pages -- otherwise it
delegates to direct allocation from the OS.
large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB).
In contrast to the rest of mimalloc, the arenas are shared between
threads and need to be accessed using atomic operations.
In the future, we can expose an API to manually add more arenas which
is sometimes needed for embedded devices or shared memory for example.
Currently arenas are only used for huge OS page (1GiB) reservations,
otherwise it delegates to direct allocation from the OS.
In the future, we can expose an API to manually add more kinds of arenas
which is sometimes needed for embedded devices or shared memory for example.
(We can also employ this with WASI or `sbrk` systems to reserve large arenas
on demand and be able to reuse them efficiently).
The arena allocation needs to be thread safe and we use an atomic
bitmap to allocate. The current implementation of the bitmap can
@ -48,10 +52,6 @@ int _mi_os_numa_node_count(void);
#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB
#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid)
// Block info: bit 0 contains the `in_use` bit, the upper bits the
// size in count of arena blocks.
typedef uintptr_t mi_block_info_t;
// A memory arena descriptor
typedef struct mi_arena_s {
uint8_t* start; // the start of the memory area
@ -61,8 +61,8 @@ typedef struct mi_arena_s {
bool is_zero_init; // is the arena zero initialized?
bool is_large; // large OS page allocated
volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks
} mi_arena_t;
@ -81,6 +81,7 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0
static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) {
mi_assert_internal(arena_index < 0xFE);
mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
return ((bitmap_index << 8) | ((arena_index+1) & 0xFF));
}
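A standalone round-trip sketch of this memid packing (hypothetical helper names; the decode of the arena index is assumed to be the inverse of the +1 / & 0xFF step shown above):

#include <assert.h>
#include <stddef.h>

// encode: low 8 bits hold (arena_index + 1), upper bits hold the bitmap index
static size_t memid_pack(size_t arena_index, size_t bitmap_index) {
  return (bitmap_index << 8) | ((arena_index + 1) & 0xFF);
}

// decode: assumed inverse of the packing above
static void memid_unpack(size_t memid, size_t* arena_index, size_t* bitmap_index) {
  *arena_index  = (memid & 0xFF) - 1;
  *bitmap_index = memid >> 8;
}

int main(void) {
  size_t a, b;
  memid_unpack(memid_pack(3, 1000), &a, &b);
  assert(a == 3 && b == 1000);  // round-trips as long as the bitmap index fits in the upper bits
  return 0;
}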
@ -90,30 +91,25 @@ static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_
*bitmap_index = (memid >> 8);
}
static size_t mi_arena_block_count_of_size(size_t size) {
const size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE);
const size_t bcount = asize / MI_ARENA_BLOCK_SIZE;
return bcount;
static size_t mi_block_count_of_size(size_t size) {
return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
}
/* -----------------------------------------------------------
Thread safe allocation in an arena
----------------------------------------------------------- */
static void* mi_arena_alloc(mi_arena_t* arena, size_t blocks, bool* is_zero, mi_bitmap_index_t* bitmap_idx)
static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
{
const size_t fcount = arena->field_count;
size_t idx = mi_atomic_read(&arena->search_idx); // start from last search
for (size_t visited = 0; visited < fcount; visited++, idx++) {
if (idx >= fcount) idx = 0; // wrap around
if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) {
// claimed it! set the dirty bits
*is_zero = mi_bitmap_claim(arena->blocks_dirty, fcount, blocks, *bitmap_idx);
mi_atomic_write(&arena->search_idx, idx); // start search from here next time
return (arena->start + (*bitmap_idx)*MI_ARENA_BLOCK_SIZE);
return true;
}
}
return NULL;
return false;
}
@ -125,13 +121,15 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n
bool* commit, bool* large, bool* is_zero, size_t* memid)
{
mi_bitmap_index_t bitmap_index;
void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &bitmap_index);
if (p != NULL) {
*memid = mi_memid_create(arena_index, bitmap_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) {
// claimed it! set the dirty bits (todo: no need for an atomic op here?)
*is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index);
*memid = mi_memid_create(arena_index, bitmap_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE));
}
return p;
return NULL;
}
void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
@ -140,7 +138,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
{
mi_assert_internal(memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = MI_MEMID_OS;
*memid = MI_MEMID_OS;
*is_zero = false;
bool default_large = false;
if (large==NULL) large = &default_large; // ensure `large != NULL`
@ -151,7 +149,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
size <= MI_ARENA_MAX_OBJ_SIZE &&
size >= MI_ARENA_MIN_OBJ_SIZE)
{
const size_t bcount = mi_arena_block_count_of_size(size);
const size_t bcount = mi_block_count_of_size(size);
const int numa_node = _mi_os_numa_node(tld); // current numa node
mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
@ -221,7 +219,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) {
_mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
}
const size_t blocks = mi_arena_block_count_of_size(size);
const size_t blocks = mi_block_count_of_size(size);
bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx);
if (!ones) {
_mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size);
@ -268,7 +266,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
}
_mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved);
size_t bcount = mi_arena_block_count_of_size(hsize);
size_t bcount = mi_block_count_of_size(hsize);
size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS;
size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t));
mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
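As a usage sketch of the reservation path that feeds these arenas, using the public mi_reserve_huge_os_pages_at API declared in mimalloc.h (the 4 pages, node 0, and 500ms timeout are illustrative values; whether a later large allocation is actually served from the arena depends on the allocator's internal size thresholds):

#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  // Reserve 4 x 1GiB huge OS pages on NUMA node 0, waiting at most 500ms.
  int err = mi_reserve_huge_os_pages_at(4, 0, 500);
  if (err != 0) {
    // the allocator falls back to direct OS allocation if no arena is available
    fprintf(stderr, "huge page reservation failed (%d); using regular OS memory\n", err);
  }
  void* p = mi_malloc(64 * 1024 * 1024);  // a large allocation that may be served from the arena
  mi_free(p);
  return 0;
}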
@ -284,6 +282,8 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
arena->is_zero_init = true;
arena->search_idx = 0;
arena->blocks_dirty = &arena->blocks_map[bcount];
// the bitmaps are already zero initialized due to os_alloc
// just claim leftover blocks if needed
size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
if (post > 0) {
// don't use leftover bits at the end
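To make the leftover handling concrete with illustrative numbers: reserving 5 one-GiB huge pages gives hsize = 5 GiB, so bcount = 5 GiB / 32 MiB = 160 blocks, fields = ceil(160 / 64) = 3 bitmap fields on 64-bit, and post = 3*64 - 160 = 32 trailing bits, which are marked as used so those nonexistent blocks are never handed out.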

View File

@ -1,41 +1,30 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
This file is meant to be included in other files for efficiency.
It implements a bitmap that can set/reset sequences of bits atomically
and is used to concurrently claim memory ranges.
A bitmap is an array of fields where each field is a machine word (`uintptr_t`)
A current limitation is that the bit sequences cannot cross fields
and that the sequence must be smaller or equal to the bits in a field.
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H
#ifndef MI_BITMAP_C
#define MI_BITMAP_C
#include "mimalloc.h"
#include "mimalloc-internal.h"
// Use bit scan forward to quickly find the first zero bit if it is available
#if defined(_MSC_VER)
#define MI_HAVE_BITSCAN
#include <intrin.h>
static inline size_t mi_bsf(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanForward)(&idx, x);
return idx;
}
static inline size_t mi_bsr(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanReverse)(&idx, x);
return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
#define MI_HAVE_BITSCAN
#if (INTPTR_MAX == LONG_MAX)
# define MI_L(x) x##l
#else
# define MI_L(x) x##ll
#endif
static inline size_t mi_bsf(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x));
}
static inline size_t mi_bsr(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x));
}
#endif
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE)
#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set
@ -63,14 +52,59 @@ static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx)
return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}
// Get the full bit index
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
return bitmap_idx;
}
// The bit mask for a given number of blocks at a specified bit index.
static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
return ((((uintptr_t)1 << count) - 1) << bitidx);
}
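A small worked example of the mask with illustrative values: for count = 4 and bitidx = 8, ((1 << 4) - 1) << 8 == 0xF << 8 == 0x0F00, i.e. four consecutive set bits starting at bit 8.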
// Try to atomically claim a sequence of `count` bits in a single field at `idx` in `bitmap`.
// Returns `true` on success.
/* -----------------------------------------------------------
Use bit scan forward/reverse to quickly find the first zero bit if it is available
----------------------------------------------------------- */
#if defined(_MSC_VER)
#define MI_HAVE_BITSCAN
#include <intrin.h>
static inline size_t mi_bsf(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanForward)(&idx, x);
return idx;
}
static inline size_t mi_bsr(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanReverse)(&idx, x);
return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
#include <limits.h> // LONG_MAX
#define MI_HAVE_BITSCAN
#if (INTPTR_MAX == LONG_MAX)
# define MI_L(x) x##l
#else
# define MI_L(x) x##ll
#endif
static inline size_t mi_bsf(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x));
}
static inline size_t mi_bsr(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x));
}
#endif
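For example, with the illustrative value x = 0x90 (binary 1001 0000), mi_bsf(x) returns 4, the index of the lowest set bit, and mi_bsr(x) returns 7, the index of the highest set bit; both return 8*MI_INTPTR_SIZE for x == 0.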
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
@ -93,7 +127,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con
while (bitidx <= bitidx_max) {
if ((map & m) == 0) { // are the mask bits free at bitidx?
mi_assert_internal((m >> bitidx) == mask); // no overflow?
uintptr_t newmap = map | m;
const uintptr_t newmap = map | m;
mi_assert_internal((newmap^map) >> bitidx == mask);
if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here?
// no success, another thread claimed concurrently.. keep going
@ -109,10 +143,10 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con
else {
// on to the next bit range
#ifdef MI_HAVE_BITSCAN
size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
mi_assert_internal(shift > 0 && shift <= count);
#else
size_t shift = 1;
const size_t shift = 1;
#endif
bitidx += shift;
m <<= shift;
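For readers less familiar with the compare-and-swap retry pattern used above, here is a minimal self-contained sketch in portable C11 atomics; it illustrates the same idea with a hypothetical try_claim_bits helper rather than mimalloc's own mi_atomic_cas_weak wrapper:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Try to atomically claim `count` contiguous bits starting at `bitidx` in one word.
// Preconditions: count >= 1, count < 8*sizeof(uintptr_t), and count + bitidx <= 8*sizeof(uintptr_t).
static bool try_claim_bits(_Atomic uintptr_t* field, size_t bitidx, size_t count) {
  const uintptr_t mask = (((uintptr_t)1 << count) - 1) << bitidx;
  uintptr_t expected = atomic_load_explicit(field, memory_order_relaxed);
  while ((expected & mask) == 0) {                     // all requested bits free?
    if (atomic_compare_exchange_weak(field, &expected, expected | mask)) {
      return true;                                     // claimed them atomically
    }
    // CAS failed (possibly spuriously): `expected` now holds the current value;
    // loop and re-check whether the bits are still free.
  }
  return false;  // some bit in the range is already taken
}

The loop in the diff above goes further than this sketch: on a conflict it advances bitidx past the occupied bits (using mi_bsr when bit scanning is available) and keeps searching within the field instead of giving up.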

View File

@ -16,10 +16,10 @@ We need this memory layer between the raw OS calls because of:
1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
to reuse memory effectively.
2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
an OS allocation/free is still (much) too expensive relative to the accesses in that
object :-( (`malloc-large` tests this). This means we need a cheaper way to
reuse memory.
3. This layer can help with a NUMA aware allocation in the future.
an OS allocation/free is still (much) too expensive relative to the accesses
in that object :-( (`malloc-large` tests this). This means we need a cheaper
way to reuse memory.
3. This layer allows for NUMA aware allocation.
Possible issues:
- (2) can potentially be addressed too with a small cache per thread which is much
@ -47,8 +47,6 @@ bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
//void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
//void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
// arena.c
void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats);
@ -58,18 +56,18 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo
// Constants
#if (MI_INTPTR_SIZE==8)
#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 16KiB for the region map
#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 40KiB for the region map
#elif (MI_INTPTR_SIZE==4)
#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // 196 bytes for the region map
#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map
#else
#error "define the maximum heap space allowed for regions on this platform"
#endif
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB
#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits)
#define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB
#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)
#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits)
// Region info is a pointer to the memory region and two bits for
@ -95,7 +93,7 @@ typedef struct mem_region_s {
size_t arena_memid; // if allocated from a (huge page) arena
} mem_region_t;
// The region map; 16KiB for a 256GiB HEAP_REGION_MAX
// The region map
static mem_region_t regions[MI_REGION_MAX];
// A bit mask per region for its claimed MI_SEGMENT_SIZE blocks.
@ -173,7 +171,7 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i
bool region_large = allow_large;
bool is_zero = false;
size_t arena_memid = 0;
void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
mi_assert_internal(!(region_large && !allow_large));
if (start == NULL) {
@ -183,35 +181,31 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i
}
// set the newly allocated region
// try to initialize any region up to 4 beyond the current one in
// case multiple threads are doing this concurrently (common at startup)
info = mi_region_info_create(start, region_large, region_commit);
if (mi_atomic_cas_strong(&regions[idx].info, info, 0)) {
// update the region count
regions[idx].arena_memid = arena_memid;
mi_atomic_write(&regions[idx].numa_node, _mi_os_numa_node(tld) + 1);
mi_atomic_write(&regions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0));
mi_atomic_increment(&regions_count);
}
else {
// failed, another thread allocated just before us!
// we assign it to a later slot instead (up to 4 tries).
for (size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
regions[idx+i].arena_memid = arena_memid;
mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node(tld) + 1);
mi_atomic_write(&regions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0));
mi_atomic_increment(&regions_count);
start = NULL;
break;
}
bool claimed = false;
for (size_t i = 0; i <= 4 && idx + i < MI_REGION_MAX && !claimed; i++) {
if (!is_zero) {
// set dirty bits before CAS; this might race with a zero block but that is ok.
// (but writing before the CAS prevents a concurrent allocation from assuming it is not dirty)
mi_atomic_write(&regions_dirty[idx+i], MI_BITMAP_FIELD_FULL);
}
if (start != NULL) {
// free it if we didn't succeed to save it to some other region
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
// _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats);
if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
// claimed!
regions[idx+i].arena_memid = arena_memid;
mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node(tld) + 1);
mi_atomic_increment(&regions_count);
claimed = true;
}
// and continue with the memory at our index
info = mi_atomic_read(&regions[idx].info);
}
if (!claimed) {
// free our OS allocation if we failed to store it in some region
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
}
// continue with the actual info at our index in case another thread was quicker with the allocation
info = mi_atomic_read(&regions[idx].info);
mi_assert_internal(info != 0);
}
mi_assert_internal(info == mi_atomic_read(&regions[idx].info));
mi_assert_internal(info != 0);
@ -290,19 +284,21 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a
int rnode = ((int)mi_atomic_read_relaxed(&regions->numa_node)) - 1;
if (rnode != numa_node) return false;
}
if (mi_unlikely(!(commit || allow_large))) {
// otherwise skip incompatible regions if possible.
// this is not guaranteed due to multiple threads allocating at the same time but
// that's ok. In secure mode, large is never allowed for any thread, so that works out;
// otherwise we might just not be able to reset/decommit individual pages sometimes.
mi_region_info_t info = mi_atomic_read_relaxed(&regions->info);
bool is_large;
bool is_committed;
void* start = mi_region_info_read(info, &is_large, &is_committed);
bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation?
if (!ok) return false;
}
return true;
if (commit && allow_large) return true; // always ok
// otherwise skip incompatible regions if possible.
// this is not guaranteed due to multiple threads allocating at the same time but
// that's ok. In secure mode, large is never allowed for any thread, so that works out;
// otherwise we might just not be able to reset/decommit individual pages sometimes.
mi_region_info_t info = mi_atomic_read_relaxed(&regions->info);
bool is_large;
bool is_committed;
void* start = mi_region_info_read(info, &is_large, &is_committed);
// note: we also skip if commit is false and the region is committed,
// that is a bit strong but prevents allocation of eager delayed segments in
// committed memory
bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation?
return ok;
}
// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.

View File

@ -497,8 +497,10 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats)
{
UNUSED(stats);
#if (MI_SECURE <= 2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
mi_assert_internal(page->capacity + extend <= page->reserved);
void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
size_t bsize = page->block_size;

View File

@ -66,7 +66,9 @@ static void* alloc_items(size_t items, random_t r) {
if (chance(1, r)) items *= 100; // 1% huge objects;
if (items==40) items++; // pthreads uses that size for stack increases
uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t));
for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
if (p != NULL) {
for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
}
return p;
}