refactor and improve atomic bitmap usage

daan 2019-11-07 10:26:52 -08:00
parent b09282bc0d
commit 378716c467
10 changed files with 183 additions and 131 deletions

View File

@@ -10,6 +10,7 @@ option(MI_SEE_ASM "Generate assembly files" OFF)
 option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF)
 option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF)
 option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF)
+option(MI_SECURE_FULL "Use full security mitigations (like double free protection, more expensive)" OFF)
 option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
 option(MI_BUILD_TESTS "Build test executables" ON)
@@ -70,9 +71,14 @@ if(MI_OVERRIDE MATCHES "ON")
   endif()
 endif()
-if(MI_SECURE MATCHES "ON")
-  message(STATUS "Set secure build (MI_SECURE=ON)")
-  list(APPEND mi_defines MI_SECURE=3)
+if(MI_SECURE_FULL MATCHES "ON")
+  message(STATUS "Set full secure build (experimental) (MI_SECURE_FULL=ON)")
+  list(APPEND mi_defines MI_SECURE=4)
+else()
+  if(MI_SECURE MATCHES "ON")
+    message(STATUS "Set secure build (MI_SECURE=ON)")
+    list(APPEND mi_defines MI_SECURE=3)
+  endif()
 endif()
 if(MI_SEE_ASM MATCHES "ON")

View File

@@ -232,6 +232,9 @@
     <ClCompile Include="..\..\src\alloc-posix.c" />
     <ClCompile Include="..\..\src\alloc.c" />
     <ClCompile Include="..\..\src\arena.c" />
+    <ClCompile Include="..\..\src\bitmap.inc.c">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+    </ClCompile>
     <ClCompile Include="..\..\src\heap.c" />
     <ClCompile Include="..\..\src\init.c" />
     <ClCompile Include="..\..\src\memory.c" />

View File

@@ -218,7 +218,9 @@
     <ClCompile Include="..\..\src\alloc-posix.c" />
     <ClCompile Include="..\..\src\alloc.c" />
     <ClCompile Include="..\..\src\arena.c" />
-    <ClCompile Include="..\..\src\bitmap.inc.c" />
+    <ClCompile Include="..\..\src\bitmap.inc.c">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+    </ClCompile>
     <ClCompile Include="..\..\src\heap.c" />
     <ClCompile Include="..\..\src\init.c" />
     <ClCompile Include="..\..\src\memory.c" />

View File

@@ -163,7 +163,6 @@ bool _mi_page_is_valid(mi_page_t* page);
 // Overflow detecting multiply
-#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
 static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
 #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
 #include <limits.h>  // UINT_MAX, ULONG_MAX
@@ -175,6 +174,7 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
   return __builtin_umulll_overflow(count, size, total);
 #endif
 #else /* __builtin_umul_overflow is unavailable */
+#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
   *total = count * size;
   return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
           && size > 0 && (SIZE_MAX / size) < count);
@@ -188,6 +188,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) {
 // Align upwards
 static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
   uintptr_t mask = alignment - 1;
   if ((alignment & mask) == 0) {  // power of two?
     return ((sz + mask) & ~mask);
@@ -197,6 +198,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
   }
 }

+// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+  mi_assert_internal(divider != 0);
+  return (divider == 0 ? size : ((size + divider - 1) / divider));
+}
+
 // Is memory zero initialized?
 static inline bool mi_mem_is_zero(void* p, size_t size) {
   for (size_t i = 0; i < size; i++) {
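As an aside, the `_mi_divide_up` helper added above rounds an integer division upward so that `s <= _mi_divide_up(s,d)*d < s+d`. A minimal standalone check (hypothetical values, plain `assert` instead of `mi_assert_internal`) illustrates that contract:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// same rounding-up division as the helper above
static inline uintptr_t divide_up(uintptr_t size, size_t divider) {
  return (divider == 0 ? size : ((size + divider - 1) / divider));
}

int main(void) {
  assert(divide_up(64, 32) == 2);   // exact multiples stay exact
  assert(divide_up(65, 32) == 3);   // any remainder rounds up
  assert(divide_up(70, 32) == 3);   // 70 <= 3*32 = 96 < 70+32
  return 0;
}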
@@ -283,7 +290,7 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
 static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
   // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0];  // huge pages
   ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
-  mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE);
+  mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE);
   uintptr_t idx = (uintptr_t)diff >> segment->page_shift;
   mi_assert_internal(idx < segment->capacity);
   mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);

View File

@@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 // #define MI_SECURE 4  // experimental, may be more expensive: checks for double free.
 #if !defined(MI_SECURE)
-#define MI_SECURE 0
+#define MI_SECURE 4
 #endif

 // Define MI_DEBUG for debug mode
@@ -93,12 +93,12 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_SEGMENT_SHIFT              ( MI_LARGE_PAGE_SHIFT)  // 4mb

 // Derived constants
-#define MI_SEGMENT_SIZE               (1<<MI_SEGMENT_SHIFT)
+#define MI_SEGMENT_SIZE               (1UL<<MI_SEGMENT_SHIFT)
 #define MI_SEGMENT_MASK               ((uintptr_t)MI_SEGMENT_SIZE - 1)
-#define MI_SMALL_PAGE_SIZE            (1<<MI_SMALL_PAGE_SHIFT)
-#define MI_MEDIUM_PAGE_SIZE           (1<<MI_MEDIUM_PAGE_SHIFT)
-#define MI_LARGE_PAGE_SIZE            (1<<MI_LARGE_PAGE_SHIFT)
+#define MI_SMALL_PAGE_SIZE            (1UL<<MI_SMALL_PAGE_SHIFT)
+#define MI_MEDIUM_PAGE_SIZE           (1UL<<MI_MEDIUM_PAGE_SHIFT)
+#define MI_LARGE_PAGE_SIZE            (1UL<<MI_LARGE_PAGE_SHIFT)
 #define MI_SMALL_PAGES_PER_SEGMENT    (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
 #define MI_MEDIUM_PAGES_PER_SEGMENT   (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)

View File

@@ -7,12 +7,16 @@ terms of the MIT license. A copy of the license can be found in the file
 /* ----------------------------------------------------------------------------
 "Arenas" are fixed area's of OS memory from which we can allocate
-large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). Currently only used to
-allocate in one arena consisting of huge OS pages -- otherwise it
-delegates to direct allocation from the OS.
-In the future, we can expose an API to manually add more arenas which
-is sometimes needed for embedded devices or shared memory for example.
+large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB).
+In contrast to the rest of mimalloc, the arenas are shared between
+threads and need to be accessed using atomic operations.
+Currently arenas are only used to for huge OS page (1GiB) reservations,
+otherwise it delegates to direct allocation from the OS.
+In the future, we can expose an API to manually add more kinds of arenas
+which is sometimes needed for embedded devices or shared memory for example.
+(We can also employ this with WASI or `sbrk` systems to reserve large arenas
+on demand and be able to reuse them efficiently).

 The arena allocation needs to be thread safe and we use an atomic
 bitmap to allocate. The current implementation of the bitmap can
@@ -48,10 +52,6 @@ int _mi_os_numa_node_count(void);
 #define MI_ARENA_MIN_OBJ_SIZE   (MI_ARENA_BLOCK_SIZE/2)  // 16MiB
 #define MI_MAX_ARENAS           (64)                     // not more than 256 (since we use 8 bits in the memid)

-// Block info: bit 0 contains the `in_use` bit, the upper bits the
-// size in count of arena blocks.
-typedef uintptr_t mi_block_info_t;
-
 // A memory arena descriptor
 typedef struct mi_arena_s {
   uint8_t* start;                         // the start of the memory area
@@ -61,8 +61,8 @@ typedef struct mi_arena_s {
   bool     is_zero_init;                  // is the arena zero initialized?
   bool     is_large;                      // large OS page allocated
   volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
   mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
   mi_bitmap_field_t  blocks_map[1];       // bitmap of in-use blocks
 } mi_arena_t;
@@ -81,6 +81,7 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0
 static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) {
   mi_assert_internal(arena_index < 0xFE);
+  mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
   return ((bitmap_index << 8) | ((arena_index+1) & 0xFF));
 }
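As a side note, the memid produced here packs the bitmap index into the upper bits and `arena_index+1` into the low byte (offset by one, presumably so it never collides with the plain-OS `MI_MEMID_OS` value). A small round-trip sketch with hypothetical indices, where the arena-index decode is inferred from the encoding above:

#include <assert.h>
#include <stddef.h>

// mirror of the encoding above: low 8 bits hold arena_index+1, upper bits the bitmap index
static size_t memid_create(size_t arena_index, size_t bitmap_index) {
  return ((bitmap_index << 8) | ((arena_index + 1) & 0xFF));
}

static void memid_indices(size_t memid, size_t* arena_index, size_t* bitmap_index) {
  *arena_index  = (memid & 0xFF) - 1;   // inferred inverse of the low byte
  *bitmap_index = (memid >> 8);
}

int main(void) {
  size_t arena_index, bitmap_index;
  memid_indices(memid_create(3, 42), &arena_index, &bitmap_index);
  assert(arena_index == 3 && bitmap_index == 42);
  return 0;
}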
@@ -90,30 +91,25 @@ static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_
   *bitmap_index = (memid >> 8);
 }

-static size_t mi_arena_block_count_of_size(size_t size) {
-  const size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE);
-  const size_t bcount = asize / MI_ARENA_BLOCK_SIZE;
-  return bcount;
+static size_t mi_block_count_of_size(size_t size) {
+  return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
 }

 /* -----------------------------------------------------------
   Thread safe allocation in an arena
 ----------------------------------------------------------- */
-static void* mi_arena_alloc(mi_arena_t* arena, size_t blocks, bool* is_zero, mi_bitmap_index_t* bitmap_idx)
+static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
 {
   const size_t fcount = arena->field_count;
   size_t idx = mi_atomic_read(&arena->search_idx);  // start from last search
   for (size_t visited = 0; visited < fcount; visited++, idx++) {
     if (idx >= fcount) idx = 0;  // wrap around
     if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) {
-      // claimed it! set the dirty bits
-      *is_zero = mi_bitmap_claim(arena->blocks_dirty, fcount, blocks, *bitmap_idx);
       mi_atomic_write(&arena->search_idx, idx);  // start search from here next time
-      return (arena->start + (*bitmap_idx)*MI_ARENA_BLOCK_SIZE);
+      return true;
     }
   }
-  return NULL;
+  return false;
 }

@@ -125,13 +121,15 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n
                                  bool* commit, bool* large, bool* is_zero, size_t* memid)
 {
   mi_bitmap_index_t bitmap_index;
-  void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &bitmap_index);
-  if (p != NULL) {
-    *memid = mi_memid_create(arena_index, bitmap_index);
-    *commit = true;  // TODO: support commit on demand?
-    *large = arena->is_large;
+  if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) {
+    // claimed it! set the dirty bits (todo: no need for an atomic op here?)
+    *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index);
+    *memid  = mi_memid_create(arena_index, bitmap_index);
+    *commit = true;  // TODO: support commit on demand?
+    *large  = arena->is_large;
+    return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE));
   }
-  return p;
+  return NULL;
 }

 void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
@@ -140,7 +138,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
 {
   mi_assert_internal(memid != NULL && tld != NULL);
   mi_assert_internal(size > 0);
   *memid   = MI_MEMID_OS;
   *is_zero = false;
   bool default_large = false;
   if (large==NULL) large = &default_large;  // ensure `large != NULL`
@@ -151,7 +149,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
       size <= MI_ARENA_MAX_OBJ_SIZE &&
       size >= MI_ARENA_MIN_OBJ_SIZE)
   {
-    const size_t bcount = mi_arena_block_count_of_size(size);
+    const size_t bcount = mi_block_count_of_size(size);
     const int numa_node = _mi_os_numa_node(tld);  // current numa node
     mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
@@ -221,7 +219,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) {
     _mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
     return;
   }
-  const size_t blocks = mi_arena_block_count_of_size(size);
+  const size_t blocks = mi_block_count_of_size(size);
   bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx);
   if (!ones) {
     _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size);
@@ -268,7 +266,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
   }
   _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved);

-  size_t bcount = mi_arena_block_count_of_size(hsize);
+  size_t bcount = mi_block_count_of_size(hsize);
   size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS;
   size_t asize  = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t));
   mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
@@ -284,6 +282,8 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
   arena->is_zero_init = true;
   arena->search_idx   = 0;
   arena->blocks_dirty = &arena->blocks_map[bcount];
+  // the bitmaps are already zero initialized due to os_alloc
+  // just claim leftover blocks if needed
   size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
   if (post > 0) {
     // don't use leftover bits at the end

View File

@@ -1,41 +1,30 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+This file is meant to be included in other files for efficiency.
+It implements a bitmap that can set/reset sequences of bits atomically
+and is used to concurrently claim memory ranges.
+A bitmap is an array of fields where each field is a machine word (`uintptr_t`)
+A current limitation is that the bit sequences cannot cross fields
+and that the sequence must be smaller or equal to the bits in a field.
+---------------------------------------------------------------------------- */
 #pragma once
-#ifndef MI_BITMAP_H
-#define MI_BITMAP_H
+#ifndef MI_BITMAP_C
+#define MI_BITMAP_C

 #include "mimalloc.h"
 #include "mimalloc-internal.h"

-// Use bit scan forward to quickly find the first zero bit if it is available
-#if defined(_MSC_VER)
-#define MI_HAVE_BITSCAN
-#include <intrin.h>
-static inline size_t mi_bsf(uintptr_t x) {
-  if (x==0) return 8*MI_INTPTR_SIZE;
-  DWORD idx;
-  MI_64(_BitScanForward)(&idx, x);
-  return idx;
-}
-static inline size_t mi_bsr(uintptr_t x) {
-  if (x==0) return 8*MI_INTPTR_SIZE;
-  DWORD idx;
-  MI_64(_BitScanReverse)(&idx, x);
-  return idx;
-}
-#elif defined(__GNUC__) || defined(__clang__)
-#define MI_HAVE_BITSCAN
-#if (INTPTR_MAX == LONG_MAX)
-# define MI_L(x)  x##l
-#else
-# define MI_L(x)  x##ll
-#endif
-static inline size_t mi_bsf(uintptr_t x) {
-  return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x));
-}
-static inline size_t mi_bsr(uintptr_t x) {
-  return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x));
-}
-#endif
+/* -----------------------------------------------------------
+  Bitmap definition
+----------------------------------------------------------- */

 #define MI_BITMAP_FIELD_BITS   (8*MI_INTPTR_SIZE)
 #define MI_BITMAP_FIELD_FULL   (~((uintptr_t)0))   // all bits set
@@ -63,14 +52,59 @@ static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx)
   return (bitmap_idx % MI_BITMAP_FIELD_BITS);
 }

+// Get the full bit index
+static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
+  return bitmap_idx;
+}
+
 // The bit mask for a given number of blocks at a specified bit index.
 static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
   mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
   return ((((uintptr_t)1 << count) - 1) << bitidx);
 }

-// Try to atomically claim a sequence of `count` bits in a single field at `idx` in `bitmap`.
-// Returns `true` on success.
+/* -----------------------------------------------------------
+  Use bit scan forward/reverse to quickly find the first zero bit if it is available
+----------------------------------------------------------- */
+#if defined(_MSC_VER)
+#define MI_HAVE_BITSCAN
+#include <intrin.h>
+static inline size_t mi_bsf(uintptr_t x) {
+  if (x==0) return 8*MI_INTPTR_SIZE;
+  DWORD idx;
+  MI_64(_BitScanForward)(&idx, x);
+  return idx;
+}
+static inline size_t mi_bsr(uintptr_t x) {
+  if (x==0) return 8*MI_INTPTR_SIZE;
+  DWORD idx;
+  MI_64(_BitScanReverse)(&idx, x);
+  return idx;
+}
+#elif defined(__GNUC__) || defined(__clang__)
+#include <limits.h> // LONG_MAX
+#define MI_HAVE_BITSCAN
+#if (INTPTR_MAX == LONG_MAX)
+# define MI_L(x)  x##l
+#else
+# define MI_L(x)  x##ll
+#endif
+static inline size_t mi_bsf(uintptr_t x) {
+  return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x));
+}
+static inline size_t mi_bsr(uintptr_t x) {
+  return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x));
+}
+#endif
+
+/* -----------------------------------------------------------
+  Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
 static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
 {
   mi_assert_internal(bitmap_idx != NULL);
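To make the bit arithmetic above concrete, here is a small hedged sketch of what `mi_bitmap_mask_` and the bit-scan helpers compute (standalone re-implementations with plain `assert`, assuming a 64-bit `long` for the `__builtin_clzl` line; this is not the mimalloc code itself):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// mask of `count` consecutive 1-bits starting at `bitidx`, as in mi_bitmap_mask_
static uintptr_t bitmap_mask(size_t count, size_t bitidx) {
  return ((((uintptr_t)1 << count) - 1) << bitidx);
}

int main(void) {
  assert(bitmap_mask(3, 5) == 0xE0);          // 0b111 shifted left by 5
  assert(bitmap_mask(1, 0) == 0x01);          // a single bit at position 0
  // gcc/clang bit-scan equivalents: index of lowest / highest set bit
  assert(__builtin_ctzl(0x28UL) == 3);        // 0b101000 -> lowest set bit (cf. mi_bsf)
  assert(63 - __builtin_clzl(0x28UL) == 5);   // highest set bit (cf. mi_bsr)
  return 0;
}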
@@ -93,7 +127,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con
   while (bitidx <= bitidx_max) {
     if ((map & m) == 0) {  // are the mask bits free at bitidx?
       mi_assert_internal((m >> bitidx) == mask);  // no overflow?
-      uintptr_t newmap = map | m;
+      const uintptr_t newmap = map | m;
       mi_assert_internal((newmap^map) >> bitidx == mask);
       if (!mi_atomic_cas_weak(field, newmap, map)) {  // TODO: use strong cas here?
         // no success, another thread claimed concurrently.. keep going
@@ -109,10 +143,10 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con
     else {
       // on to the next bit range
 #ifdef MI_HAVE_BITSCAN
-      size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
+      const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
       mi_assert_internal(shift > 0 && shift <= count);
 #else
-      size_t shift = 1;
+      const size_t shift = 1;
 #endif
       bitidx += shift;
       m <<= shift;
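The claim loop above hinges on a compare-and-swap over a whole bitmap field. A rough single-field analogue using C11 atomics (a sketch of the technique with hypothetical helper names, not the mimalloc implementation, which also scans forward and handles per-bitidx retries) shows the idea:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Try to atomically set `count` consecutive bits starting at `bitidx` in one field.
static bool try_claim(_Atomic(uintptr_t)* field, size_t count, size_t bitidx) {
  const uintptr_t mask = ((((uintptr_t)1 << count) - 1) << bitidx);
  uintptr_t map = atomic_load(field);
  do {
    if ((map & mask) != 0) return false;  // some bit in the range is already claimed
  } while (!atomic_compare_exchange_weak(field, &map, map | mask));  // retry on concurrent update
  return true;
}

int main(void) {
  _Atomic(uintptr_t) field = 0;
  assert(try_claim(&field, 4, 0));    // claim bits 0..3
  assert(!try_claim(&field, 2, 2));   // overlapping range fails
  assert(try_claim(&field, 2, 4));    // bits 4..5 are still free
  assert(atomic_load(&field) == 0x3F);
  return 0;
}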

View File

@@ -16,10 +16,10 @@ We need this memory layer between the raw OS calls because of:
 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
    to reuse memory effectively.
 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still (much) too expensive relative to the accesses in that
-   object :-( (`malloc-large` tests this). This means we need a cheaper way to
-   reuse memory.
-3. This layer can help with a NUMA aware allocation in the future.
+   an OS allocation/free is still (much) too expensive relative to the accesses
+   in that object :-( (`malloc-large` tests this). This means we need a cheaper
+   way to reuse memory.
+3. This layer allows for NUMA aware allocation.

 Possible issues:
 - (2) can potentially be addressed too with a small cache per thread which is much
@@ -47,8 +47,6 @@ bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
 bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
 bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
 bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-//void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
-//void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);

 // arena.c
 void  _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats);
@@ -58,18 +56,18 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo
 // Constants
 #if (MI_INTPTR_SIZE==8)
-#define MI_HEAP_REGION_MAX_SIZE   (256 * GiB)  // 16KiB for the region map
+#define MI_HEAP_REGION_MAX_SIZE   (256 * GiB)  // 40KiB for the region map
 #elif (MI_INTPTR_SIZE==4)
-#define MI_HEAP_REGION_MAX_SIZE   (3 * GiB)    // 196 bytes for the region map
+#define MI_HEAP_REGION_MAX_SIZE   (3 * GiB)    // ~ KiB for the region map
 #else
 #error "define the maximum heap space allowed for regions on this platform"
 #endif

 #define MI_SEGMENT_ALIGN          MI_SEGMENT_SIZE

-#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS)  // 256MiB
+#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS)  // 256MiB (64MiB on 32 bits)
 #define MI_REGION_MAX_ALLOC_SIZE  (MI_REGION_SIZE/4)                        // 64MiB
-#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)
+#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)  // 1024 (48 on 32 bits)

 // Region info is a pointer to the memory region and two bits for
@@ -95,7 +93,7 @@ typedef struct mem_region_s {
   size_t arena_memid;  // if allocated from a (huge page) arena
 } mem_region_t;

-// The region map; 16KiB for a 256GiB HEAP_REGION_MAX
+// The region map
 static mem_region_t regions[MI_REGION_MAX];

 // A bit mask per region for its claimed MI_SEGMENT_SIZE blocks.
@@ -173,7 +171,7 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i
     bool region_large = allow_large;
     bool is_zero = false;
     size_t arena_memid = 0;
-    void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
+    void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
     mi_assert_internal(!(region_large && !allow_large));

     if (start == NULL) {
@@ -183,35 +181,31 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i
     }

     // set the newly allocated region
+    // try to initialize any region up to 4 beyond the current one in
+    // care multiple threads are doing this concurrently (common at startup)
     info = mi_region_info_create(start, region_large, region_commit);
-    if (mi_atomic_cas_strong(&regions[idx].info, info, 0)) {
-      // update the region count
-      regions[idx].arena_memid = arena_memid;
-      mi_atomic_write(&regions[idx].numa_node, _mi_os_numa_node(tld) + 1);
-      mi_atomic_write(&regions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0));
-      mi_atomic_increment(&regions_count);
-    }
-    else {
-      // failed, another thread allocated just before us!
-      // we assign it to a later slot instead (up to 4 tries).
-      for (size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
-        if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
-          regions[idx+i].arena_memid = arena_memid;
-          mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node(tld) + 1);
-          mi_atomic_write(&regions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0));
-          mi_atomic_increment(&regions_count);
-          start = NULL;
-          break;
-        }
-      }
-      if (start != NULL) {
-        // free it if we didn't succeed to save it to some other region
-        _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
-        // _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats);
-      }
-      // and continue with the memory at our index
-      info = mi_atomic_read(&regions[idx].info);
-    }
+    bool claimed = false;
+    for (size_t i = 0; i <= 4 && idx + i < MI_REGION_MAX && !claimed; i++) {
+      if (!is_zero) {
+        // set dirty bits before CAS; this might race with a zero block but that is ok.
+        // (but writing before cas prevents a concurrent allocation to assume it is not dirty)
+        mi_atomic_write(&regions_dirty[idx+i], MI_BITMAP_FIELD_FULL);
+      }
+      if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
+        // claimed!
+        regions[idx+i].arena_memid = arena_memid;
+        mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node(tld) + 1);
+        mi_atomic_increment(&regions_count);
+        claimed = true;
+      }
+    }
+    if (!claimed) {
+      // free our OS allocation if we didn't succeed to store it in some region
+      _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
+    }
+    // continue with the actual info at our index in case another thread was quicker with the allocation
+    info = mi_atomic_read(&regions[idx].info);
+    mi_assert_internal(info != 0);
   }
   mi_assert_internal(info == mi_atomic_read(&regions[idx].info));
   mi_assert_internal(info != 0);
@@ -290,19 +284,21 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a
     int rnode = ((int)mi_atomic_read_relaxed(&regions->numa_node)) - 1;
     if (rnode != numa_node) return false;
   }
-  if (mi_unlikely(!(commit || allow_large))) {
-    // otherwise skip incompatible regions if possible.
-    // this is not guaranteed due to multiple threads allocating at the same time but
-    // that's ok. In secure mode, large is never allowed for any thread, so that works out;
-    // otherwise we might just not be able to reset/decommit individual pages sometimes.
-    mi_region_info_t info = mi_atomic_read_relaxed(&regions->info);
-    bool is_large;
-    bool is_committed;
-    void* start = mi_region_info_read(info, &is_large, &is_committed);
-    bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large));  // Todo: test with one bitmap operation?
-    if (!ok) return false;
-  }
-  return true;
+  if (commit && allow_large) return true;  // always ok
+
+  // otherwise skip incompatible regions if possible.
+  // this is not guaranteed due to multiple threads allocating at the same time but
+  // that's ok. In secure mode, large is never allowed for any thread, so that works out;
+  // otherwise we might just not be able to reset/decommit individual pages sometimes.
+  mi_region_info_t info = mi_atomic_read_relaxed(&regions->info);
+  bool is_large;
+  bool is_committed;
+  void* start = mi_region_info_read(info, &is_large, &is_committed);
+  // note: we also skip if commit is false and the region is committed,
+  // that is a bit strong but prevents allocation of eager delayed segments in
+  // committed memory
+  bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large));  // Todo: test with one bitmap operation?
+  return ok;
 }

 // Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.

View File

@@ -497,8 +497,10 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
 static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats)
 {
   UNUSED(stats);
+  #if (MI_SECURE <= 2)
   mi_assert_internal(page->free == NULL);
   mi_assert_internal(page->local_free == NULL);
+  #endif
   mi_assert_internal(page->capacity + extend <= page->reserved);
   void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
   size_t bsize = page->block_size;

View File

@@ -66,7 +66,9 @@ static void* alloc_items(size_t items, random_t r) {
   if (chance(1, r)) items *= 100;  // 1% huge objects;
   if (items==40) items++;          // pthreads uses that size for stack increases
   uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t));
-  for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
+  if (p != NULL) {
+    for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
+  }
   return p;
 }