initial numa support for arenas

This commit is contained in:
daan 2019-11-01 19:53:07 -07:00
parent aaf01620f4
commit a6499be074
6 changed files with 241 additions and 153 deletions

View File

@@ -56,6 +56,7 @@ void _mi_os_init(void); // called fro
void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data
void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data
size_t _mi_os_good_alloc_size(size_t size);
int _mi_os_numa_node(void);
// memory.c

View File

@@ -228,9 +228,14 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b
// Experimental
mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
mi_decl_export bool mi_is_redirected() mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept;
// deprecated
mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
// ------------------------------------------------------
// Convenience
// ------------------------------------------------------
@@ -271,6 +276,7 @@ typedef enum mi_option_e {
mi_option_eager_commit_delay,
mi_option_segment_reset,
mi_option_os_tag,
mi_option_max_numa_node,
_mi_option_last
} mi_option_t;
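
For reference, a minimal usage sketch of the new reservation API declared above (not part of this commit; the page counts and the error handling are illustrative):

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // spread 8 huge (1GiB) pages evenly across the available NUMA nodes
  int err = mi_reserve_huge_os_pages_interleave(8);
  if (err != 0) fprintf(stderr, "huge page reservation failed (error %d)\n", err);
  // or pin 4 huge pages to a specific NUMA node instead:
  // int err2 = mi_reserve_huge_os_pages_at(4, 0);
  return 0;
}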

View File

@@ -25,8 +25,10 @@ with on-demand coalescing.
// os.c
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept;
//int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept;
void _mi_os_free(void* p, size_t size, mi_stats_t* stats);
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize);
int _mi_os_numa_node_count(void);
/* -----------------------------------------------------------
Arena allocation
@@ -44,6 +46,7 @@ typedef uintptr_t mi_block_info_t;
typedef struct mi_arena_s {
uint8_t* start; // the start of the memory area
size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
int numa_node; // associated NUMA node
bool is_zero_init; // is the arena zero initialized?
bool is_large; // large OS page allocated
_Atomic(uintptr_t) block_bottom; // optimization to start the search for free blocks
@@ -223,7 +226,31 @@ static void* mi_arena_alloc(mi_arena_t* arena, size_t needed_bcount, bool* is_ze
Arena Allocation
----------------------------------------------------------- */
void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) {
static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
bool* commit, bool* large, bool* is_zero,
size_t* memid)
{
size_t block_index = SIZE_MAX;
void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &block_index);
if (p != NULL) {
mi_assert_internal(block_index != SIZE_MAX);
#if MI_DEBUG>=1
_Atomic(mi_block_info_t)* block = &arena->blocks[block_index];
mi_block_info_t binfo = mi_atomic_read(block);
mi_assert_internal(mi_block_is_in_use(binfo));
mi_assert_internal(mi_block_count(binfo) >= needed_bcount);
#endif
*memid = mi_memid_create(arena_index, block_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
}
return p;
}
void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
bool* commit, bool* large, bool* is_zero,
size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = MI_MEMID_OS;
@@ -240,33 +267,36 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool*
{
size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE);
size_t bcount = asize / MI_ARENA_BLOCK_SIZE;
int numa_node = _mi_os_numa_node(); // current numa node
mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
// try numa affine allocation
for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i]));
if (arena==NULL) break;
if (*large || !arena->is_large) { // large OS pages allowed, or arena is not large OS pages
size_t block_index = SIZE_MAX;
void* p = mi_arena_alloc(arena, bcount, is_zero, &block_index);
if (p != NULL) {
mi_assert_internal(block_index != SIZE_MAX);
#if MI_DEBUG>=1
_Atomic(mi_block_info_t)* block = &arena->blocks[block_index];
mi_block_info_t binfo = mi_atomic_read(block);
mi_assert_internal(mi_block_is_in_use(binfo));
mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size);
#endif
*memid = mi_memid_create(i, block_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
if (arena==NULL) break; // end reached
if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local?
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
{
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid);
mi_assert_internal((uintptr_t)p % alignment == 0);
return p;
if (p != NULL) return p;
}
}
// try from another numa node instead..
for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i]));
if (arena==NULL) break; // end reached
if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local!
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
{
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid);
mi_assert_internal((uintptr_t)p % alignment == 0);
if (p != NULL) return p;
}
}
}
// fall back to the OS
// finally, fall back to the OS
*is_zero = true;
*memid = MI_MEMID_OS;
return _mi_os_alloc_aligned(size, alignment, *commit, large, tld);
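
In short, the new allocation path in _mi_arena_alloc_aligned first tries arenas whose numa_node matches the current thread's node (or is -1, i.e. unbound), then arenas bound to other NUMA nodes, and only then falls back to a plain OS allocation.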
@@ -350,31 +380,61 @@ static bool mi_arena_add(mi_arena_t* arena) {
----------------------------------------------------------- */
#include <errno.h> // ENOMEM
int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
size_t pages_reserved_default = 0;
if (pages_reserved==NULL) pages_reserved = &pages_reserved_default;
// reserve at a specific numa node
static int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept {
size_t hsize = 0;
void* p = NULL;
int err = _mi_os_alloc_huge_os_pages(pages, max_secs, &p, pages_reserved, &hsize);
_mi_verbose_message("reserved %zu huge pages\n", *pages_reserved);
if (p==NULL) return err;
// err might be != 0 but that is fine, we just got less pages.
mi_assert_internal(*pages_reserved > 0 && hsize > 0 && *pages_reserved <= pages);
void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, &hsize);
if (p==NULL) return ENOMEM;
_mi_verbose_message("reserved %zu huge (1GiB) pages\n", pages);
size_t bcount = hsize / MI_ARENA_BLOCK_SIZE;
size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much
mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main);
mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
if (arena == NULL) {
*pages_reserved = 0;
_mi_os_free(p, hsize, &_mi_stats_main);
return ENOMEM;
}
arena->block_count = bcount;
arena->start = (uint8_t*)p;
arena->block_bottom = 0;
arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
arena->is_large = true;
arena->is_zero_init = true;
memset(arena->blocks, 0, bcount * sizeof(mi_block_info_t));
//mi_atomic_write(&arena->blocks[0], mi_block_info_create(bcount, false));
mi_arena_add(arena);
return 0;
}
// reserve huge pages evenly among all numa nodes.
int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept {
if (pages == 0) return 0;
// pages per numa node
int numa_count = _mi_os_numa_node_count();
if (numa_count <= 0) numa_count = 1;
size_t pages_per = pages / numa_count;
if (pages_per == 0) pages_per = 1;
// reserve evenly among numa nodes
for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
int err = mi_reserve_huge_os_pages_at((pages_per > pages ? pages : pages_per), numa_node);
if (err) return err;
if (pages < pages_per) {
pages = 0;
}
else {
pages -= pages_per;
}
}
return 0;
}
int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
_mi_verbose_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
if (pages_reserved != NULL) *pages_reserved = 0;
int err = mi_reserve_huge_os_pages_interleave(pages);
if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
return err;
}
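
A quick worked example of the interleaved reservation above: with pages == 8 on a machine where _mi_os_numa_node_count() reports 4 nodes, pages_per is 2 and each of nodes 0..3 gets a 2-page (2GiB) arena; with pages == 3 on the same machine, pages_per is clamped to 1 and nodes 0..2 each receive one page before the remaining count reaches zero.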

View File

@@ -435,7 +435,7 @@ static void mi_process_load(void) {
if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB)
mi_reserve_huge_os_pages(pages, max_secs, NULL);
mi_reserve_huge_os_pages_interleave(pages);
}
}
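
Since mimalloc options can also be set through the environment, starting a program with, say, MIMALLOC_RESERVE_HUGE_OS_PAGES=4 now reserves those 1GiB pages interleaved across the NUMA nodes instead of going through the deprecated timeout-based path.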

View File

@@ -66,7 +66,8 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
{ 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit)
{ 100, UNINIT, MI_OPTION(os_tag) } // only apple specific for now but might serve more or less related purpose
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
{ 256, UNINIT, MI_OPTION(max_numa_node) } // maximum allowed numa node
};
static void mi_option_init(mi_option_desc_t* desc);
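
The new max_numa_node entry (default 256) bounds the node count reported by _mi_os_numa_node_count further below; like the other options it should be adjustable at startup via the corresponding MIMALLOC_MAX_NUMA_NODE environment variable.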

src/os.c
View File

@@ -170,7 +170,7 @@ void _mi_os_init() {
os_alloc_granularity = os_page_size;
}
if (mi_option_is_enabled(mi_option_large_os_pages)) {
large_os_page_size = (1UL << 21); // 2MiB
large_os_page_size = 2*MiB;
}
}
#endif
@@ -207,31 +207,6 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size);
#ifdef _WIN32
static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
// on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
if ((size % ((uintptr_t)1 << 30)) == 0 /* 1GiB multiple */
&& (flags & MEM_LARGE_PAGES) != 0 && (flags & MEM_COMMIT) != 0 && (flags & MEM_RESERVE) != 0
&& (addr != NULL || try_alignment == 0 || try_alignment % _mi_os_page_size() == 0)
&& pNtAllocateVirtualMemoryEx != NULL)
{
#ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE
#define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10)
#endif
MEM_EXTENDED_PARAMETER param = { 0, 0 };
param.Type = 5; // == MemExtendedParameterAttributeFlags;
param.ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
SIZE_T psize = size;
void* base = addr;
NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, &param, 1);
if (err == 0) {
return base;
}
else {
// else fall back to regular large OS pages
_mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error 0x%lx)\n", err);
}
}
#endif
#if (MI_INTPTR_SIZE >= 8)
// on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
void* hint;
@@ -364,7 +339,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
lflags |= MAP_HUGETLB;
#endif
#ifdef MAP_HUGE_1GB
if ((size % ((uintptr_t)1 << 30)) == 0) {
if ((size % GiB) == 0) {
lflags |= MAP_HUGE_1GB;
}
else
@@ -400,10 +375,10 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
#if defined(MADV_HUGEPAGE)
// Many Linux systems don't allow MAP_HUGETLB but they support instead
// transparent huge pages (TPH). It is not required to call `madvise` with MADV_HUGE
// transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE
// though since properly aligned allocations will already use large pages if available
// in that case -- in particular for our large regions (in `memory.c`).
// However, some systems only allow TPH if called with explicit `madvise`, so
// However, some systems only allow THP if called with explicit `madvise`, so
// when large OS pages are enabled for mimalloc, we call `madvise` anyways.
if (allow_large && use_large_os_page(size, try_alignment)) {
if (madvise(p, size, MADV_HUGEPAGE) == 0) {
@@ -810,101 +785,146 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
/* ----------------------------------------------------------------------------
Support for huge OS pages (1Gib) that are reserved up-front and never
released. Only regions are allocated in here (see `memory.c`) so the memory
will be reused.
Support for allocating huge OS pages (1GiB) that are reserved up-front
and possibly associated with a specific NUMA node. (use `numa_node>=0`)
-----------------------------------------------------------------------------*/
#define MI_HUGE_OS_PAGE_SIZE ((size_t)1 << 30) // 1GiB
#define MI_HUGE_OS_PAGE_SIZE (GiB)
#if !(MI_INTPTR_SIZE >= 8 && (defined(_WIN32) || defined(MI_OS_USE_MMAP)))
int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** start, size_t* pages_reserved, size_t* size) mi_attr_noexcept {
UNUSED(pages); UNUSED(max_secs);
if (start != NULL) *start = NULL;
if (pages_reserved != NULL) *pages_reserved = 0;
if (size != NULL) *size = 0;
return ENOMEM;
}
#else
static _Atomic(uintptr_t) huge_top; // = 0
int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept
#if defined(WIN32) && (MI_INTPTR_SIZE >= 8)
static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
{
*pstart = NULL;
*pages_reserved = 0;
*psize = 0;
if (max_secs==0) return ETIMEDOUT; // timeout
if (pages==0) return 0; // ok
mi_assert_internal(size%GiB == 0);
// Atomically claim a huge address range
size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
uint8_t* start;
do {
start = (uint8_t*)mi_atomic_addu(&huge_top, size);
if (start == NULL) {
uintptr_t top = ((uintptr_t)32 << 40); // 32TiB virtual start address
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
uintptr_t r = _mi_random_init((uintptr_t)&_mi_os_alloc_huge_os_pages);
top += ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB
#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
MEM_EXTENDED_PARAMETER params[4] = { {0,0},{0,0},{0,0},{0,0} };
MEM_ADDRESS_REQUIREMENTS reqs = {0,0,0};
reqs.HighestEndingAddress = NULL;
reqs.LowestStartingAddress = NULL;
reqs.Alignment = MI_SEGMENT_SIZE;
// on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
if (pNtAllocateVirtualMemoryEx != NULL) {
#ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE
#define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10)
#endif
mi_atomic_cas_strong(&huge_top, top, 0);
params[0].Type = MemExtendedParameterAddressRequirements;
params[0].Pointer = &reqs;
params[1].Type = 5; // == MemExtendedParameterAttributeFlags;
params[1].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
size_t param_count = 2;
if (numa_node >= 0) {
param_count++;
params[2].Type = MemExtendedParameterNumaNode;
params[2].ULong = (unsigned)numa_node;
}
} while (start == NULL);
// Allocate one page at the time but try to place them contiguously
// We allocate one page at the time to be able to abort if it takes too long
double start_t = _mi_clock_start();
uint8_t* addr = start; // current top of the allocations
for (size_t page = 0; page < pages; page++, addr += MI_HUGE_OS_PAGE_SIZE ) {
// allocate a page
void* p = NULL;
bool is_large = true;
#ifdef _WIN32
if (page==0) { mi_win_enable_large_os_pages(); }
p = mi_win_virtual_alloc(addr, MI_HUGE_OS_PAGE_SIZE, 0, MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE, true, true, &is_large);
#elif defined(MI_OS_USE_MMAP)
p = mi_unix_mmap(addr, MI_HUGE_OS_PAGE_SIZE, 0, PROT_READ | PROT_WRITE, true, true, &is_large);
#else
// always fail
#endif
// Did we succeed at a contiguous address?
if (p != addr) {
// no success, issue a warning and return with an error
if (p != NULL) {
_mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr);
_mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main );
SIZE_T psize = size;
void* base = NULL;
NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
if (err == 0) {
return base;
}
else {
#ifdef _WIN32
int err = GetLastError();
#else
int err = errno;
// fall back to regular huge pages
_mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (error 0x%lx)\n", err);
}
}
// on modern Windows try use VirtualAlloc2 for aligned large OS page allocation
if (pVirtualAlloc2 != NULL) {
params[0].Type = MemExtendedParameterAddressRequirements;
params[0].Pointer = &reqs;
size_t param_count = 1;
if (numa_node >= 0) {
param_count++;
params[1].Type = MemExtendedParameterNumaNode;
params[1].ULong = (unsigned)numa_node;
}
return (*pVirtualAlloc2)(GetCurrentProcess(), NULL, size, flags, PAGE_READWRITE, params, param_count);
}
#endif
_mi_warning_message("could not allocate huge page %zu at 0x%p, error: %i\n", page, addr, err);
return NULL; // give up on older Windows..
}
return ENOMEM;
#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8)
#ifdef MI_HAS_NUMA
#include <numaif.h> // mbind, and use -lnuma
#endif
static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
mi_assert_internal(size%GiB == 0);
bool is_large = true;
void* p = mi_unix_mmap(NULL, MI_HUGE_OS_PAGE_SIZE, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
if (p == NULL) return NULL;
#ifdef MI_HAS_NUMA
if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) {
uintptr_t numa_mask = (1UL << numa_node);
long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
if (err != 0) {
_mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno));
}
// success, record it
if (page==0) {
*pstart = addr;
}
*psize += MI_HUGE_OS_PAGE_SIZE;
*pages_reserved += 1;
_mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
_mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
#endif
return p;
}
#else
static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
return NULL;
}
#endif
// check for timeout
double elapsed = _mi_clock_end(start_t);
if (elapsed > max_secs) return ETIMEDOUT;
if (page >= 1) {
double estimate = ((elapsed / (double)(page+1)) * (double)pages);
if (estimate > 1.5*max_secs) return ETIMEDOUT; // seems like we are going to timeout
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) {
if (psize != NULL) *psize = 0;
size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
void* p = mi_os_alloc_huge_os_pagesx(size, numa_node);
if (p==NULL) return NULL;
if (psize != NULL) *psize = size;
_mi_stat_increase(&_mi_stats_main.committed, size);
_mi_stat_increase(&_mi_stats_main.reserved, size);
return p;
}
#ifdef WIN32
static int mi_os_numa_nodex(void) {
PROCESSOR_NUMBER pnum;
USHORT numa_node = 0;
GetCurrentProcessorNumberEx(&pnum);
GetNumaProcessorNodeEx(&pnum,&numa_node);
return (int)numa_node;
}
mi_assert_internal(*psize == size);
static int mi_os_numa_node_countx(void) {
ULONG numa_max = 0;
GetNumaHighestNodeNumber(&numa_max);
return (int)(numa_max + 1);
}
#elif MI_HAS_NUMA
#include <numa.h>
static int mi_os_numa_nodex(void) {
return numa_preferred();
}
static int mi_os_numa_node_countx(void) {
return (numa_max_node() + 1);
}
#else
static int mi_os_numa_nodex(void) {
return 0;
}
static int mi_os_numa_node_countx(void) {
return 1;
}
#endif
int _mi_os_numa_node_count(void) {
long ncount = mi_os_numa_node_countx();
// never more than max numa node and at least 1
long nmax = 1 + mi_option_get(mi_option_max_numa_node);
if (ncount > nmax) ncount = nmax;
if (ncount <= 0) ncount = 1;
return ncount;
}
int _mi_os_numa_node(void) {
int nnode = mi_os_numa_nodex();
// never more than the node count
int ncount = _mi_os_numa_node_count();
if (nnode >= ncount) { nnode = nnode % ncount; }
return nnode;
}
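
A small standalone sketch of the clamping done by _mi_os_numa_node_count and _mi_os_numa_node above; the function name and its inputs are illustrative stand-ins, but the logic mirrors the code:

// illustrative: reported_node/reported_count stand in for mi_os_numa_nodex()/mi_os_numa_node_countx()
static int mi_numa_node_clamped(int reported_node, int reported_count, long max_numa_node_option) {
  long ncount = reported_count;
  long nmax = 1 + max_numa_node_option;              // the option defaults to 256
  if (ncount > nmax) ncount = nmax;
  if (ncount <= 0) ncount = 1;
  int nnode = reported_node;
  if (nnode >= ncount) nnode = nnode % (int)ncount;  // e.g. node 10 with 8 nodes maps to 2
  return nnode;
}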