diff --git a/.gitattributes b/.gitattributes
index 0332e031..f083b107 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -10,3 +10,4 @@
 *.dll binary
 *.lib binary
 *.exe binary
+bin export-ignore
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1387e0db..0cc7e575 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(MI_DEBUG_UBSAN       "Build with undefined-behavior sanitizer (needs clan
 option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF)
 option(MI_NO_PADDING        "Force no use of padding even in DEBUG mode etc." OFF)
 option(MI_INSTALL_TOPLEVEL  "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF)
+option(MI_NO_THP            "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF)
 
 # deprecated options
 option(MI_CHECK_FULL        "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
@@ -128,7 +129,7 @@ endif()
 
 if(MI_SECURE)
   message(STATUS "Set full secure build (MI_SECURE=ON)")
-  list(APPEND mi_defines MI_SECURE=4)  
+  list(APPEND mi_defines MI_SECURE=4)
 endif()
 
 if(MI_TRACK_VALGRIND)
@@ -247,7 +248,7 @@ if(MI_DEBUG_UBSAN)
       message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)")
     endif()
   else()
-    message(WARNING "Can only use thread sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})")
+    message(WARNING "Can only use undefined-behavior sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})")
   endif()
 endif()
 
@@ -263,10 +264,18 @@ if(MI_USE_CXX)
   endif()
 endif()
 
-if(CMAKE_SYSTEM_NAME MATCHES "Haiku")
-   SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib)
-   SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers)
- endif()
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android")
+  if(MI_NO_THP)
+    message(STATUS "Disable transparent huge pages support (MI_NO_THP=ON)")
+    list(APPEND mi_defines MI_NO_THP=1)
+  endif()
+endif()
+
+# On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788
+# if(CMAKE_SYSTEM_NAME MATCHES "Haiku")
+#   SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib)
+#   SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers)
+# endif()
 
 # Compiler flags
 if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
@@ -468,7 +477,7 @@ if (MI_BUILD_OBJECT)
     set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}")
     set(mimalloc-obj-out    "${CMAKE_CURRENT_BINARY_DIR}/${mi_basename}${CMAKE_C_OUTPUT_EXTENSION}")
     add_custom_command(OUTPUT ${mimalloc-obj-out} DEPENDS mimalloc-obj COMMAND "${CMAKE_COMMAND}" -E copy "${mimalloc-obj-static}" "${mimalloc-obj-out}")
-    add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out})      
+    add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out})
   endif()
 
   # the following seems to lead to cmake warnings/errors on some systems, disable for now :-(
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index ae19cfb3..96f3922e 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -30,14 +30,17 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_decl_noinline        __declspec(noinline)
 #define mi_decl_thread          __declspec(thread)
 #define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
+#define mi_decl_weak            
 #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
 #define mi_decl_noinline        __attribute__((noinline))
 #define mi_decl_thread          __thread
 #define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
+#define mi_decl_weak            __attribute__((weak))
 #else
 #define mi_decl_noinline
 #define mi_decl_thread          __thread        // hope for the best :-)
 #define mi_decl_cache_align
+#define mi_decl_weak           
 #endif
 
 #if defined(__EMSCRIPTEN__) && !defined(__wasi__)
@@ -309,6 +312,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
   }
 }
 
+// Align a pointer upwards
+static inline void* mi_align_up_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_up((uintptr_t)p, alignment);
+}
+
+
 // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
 static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
   mi_assert_internal(divider != 0);
diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 9e560696..d14b885b 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -35,10 +35,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config );
 
 // Free OS memory
 int _mi_prim_free(void* addr, size_t size );
-  
+
 // Allocate OS memory. Return NULL on error.
 // The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
-// If `commit` is false, the virtual memory range only needs to be reserved (with no access) 
+// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
 // which will later be committed explicitly using `_mi_prim_commit`.
 // `is_zero` is set to true if the memory was zero initialized (as on most OS's)
 // pre: !commit => !allow_large
@@ -82,11 +82,11 @@ mi_msecs_t _mi_prim_clock_now(void);
 typedef struct mi_process_info_s {
   mi_msecs_t  elapsed;
   mi_msecs_t  utime;
-  mi_msecs_t  stime; 
-  size_t      current_rss; 
-  size_t      peak_rss;  
+  mi_msecs_t  stime;
+  size_t      current_rss;
+  size_t      peak_rss;
   size_t      current_commit;
-  size_t      peak_commit; 
+  size_t      peak_commit;
   size_t      page_faults;
 } mi_process_info_t;
 
@@ -117,7 +117,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 
 //-------------------------------------------------------------------
 // Thread id: `_mi_prim_thread_id()`
-// 
+//
 // Getting the thread id should be performant as it is called in the
 // fast path of `_mi_free` and we specialize for various platforms as
 // inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
@@ -125,33 +125,23 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 // for each thread (unequal to zero).
 //-------------------------------------------------------------------
 
-// defined in `init.c`; do not use these directly
-extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
-extern bool _mi_process_is_initialized;             // has mi_process_init been called?
-
-static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
-
-#if defined(_WIN32)
-
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
-  // Windows: works on Intel and ARM in both 32- and 64-bit
-  return (uintptr_t)NtCurrentTeb();
-}
-
-// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
-// both the OS and libc implementation so we use specific tests for each main platform.
+// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot.
+// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform.
 // If you test on another platform and it works please send a PR :-)
 // see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
-#elif defined(__GNUC__) && ( \
+//
+// Note: on most platforms this is not actually used anymore as we prefer `__builtin_thread_pointer()` nowadays.
+// However, we do still use it with older clang compilers and Apple OS (as we use TLS slot for the default heap there).
+#if defined(__GNUC__) && ( \
            (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
-        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
+        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \
         || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
         || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
         || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
       )
 
+#define MI_HAS_TLS_SLOT
+
 static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
   void* res;
   const size_t ofs = (slot*sizeof(void*));
@@ -175,6 +165,9 @@ static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
     __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
     #endif
     res = tcb[slot];
+  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
+    MI_UNUSED(ofs);
+    res = pthread_getspecific(slot);
   #endif
   return res;
 }
@@ -202,9 +195,40 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
     __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
     #endif
     tcb[slot] = value;
+  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
+    MI_UNUSED(ofs);
+    pthread_setspecific(slot, value);    
   #endif
 }
 
+#endif
+
+// defined in `init.c`; do not use these directly
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+extern bool _mi_process_is_initialized;             // has mi_process_init been called?
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
+
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  // Windows: works on Intel and ARM in both 32- and 64-bit
+  return (uintptr_t)NtCurrentTeb();
+}
+
+#elif defined(__has_builtin) && __has_builtin(__builtin_thread_pointer) && \
+      (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly (<https://github.com/microsoft/mimalloc/issues/343#issuecomment-763272369>)*/ \
+      (!defined(__clang_major__) || __clang_major__ >= 14)  // older clang versions emit bad code; fall back to using the TLS slot (<https://lore.kernel.org/linux-arm-kernel/202110280952.352F66D8@keescook/T/>)
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  // Works on most Unix based platforms
+  return (uintptr_t)__builtin_thread_pointer();  
+}
+
+#elif defined(MI_HAS_TLS_SLOT)
+
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
   #if defined(__BIONIC__)
     // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
@@ -251,7 +275,6 @@ static inline mi_heap_t* mi_prim_get_default_heap(void);
 #if defined(MI_MALLOC_OVERRIDE)
 #if defined(__APPLE__) // macOS
   #define MI_TLS_SLOT               89  // seems unused?
-  // #define MI_TLS_RECURSE_GUARD 1
   // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
   // see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
 #elif defined(__OpenBSD__)
@@ -269,6 +292,9 @@ static inline mi_heap_t* mi_prim_get_default_heap(void);
 
 
 #if defined(MI_TLS_SLOT)
+# if !defined(MI_HAS_TLS_SLOT)
+#  error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined"
+# endif
 
 static inline mi_heap_t* mi_prim_get_default_heap(void) {
   mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
diff --git a/src/alloc-override.c b/src/alloc-override.c
index 873065dc..7cf0bf2c 100644
--- a/src/alloc-override.c
+++ b/src/alloc-override.c
@@ -259,10 +259,11 @@ extern "C" {
 // no forwarding here due to aliasing/name mangling issues
 void  cfree(void* p)                                    { mi_free(p); }
 void* pvalloc(size_t size)                              { return mi_pvalloc(size); }
-void* reallocarray(void* p, size_t count, size_t size)  { return mi_reallocarray(p, count, size); }
-int   reallocarr(void* p, size_t count, size_t size)    { return mi_reallocarr(p, count, size); }
 void* memalign(size_t alignment, size_t size)           { return mi_memalign(alignment, size); }
 void* _aligned_malloc(size_t alignment, size_t size)    { return mi_aligned_alloc(alignment, size); }
+void* reallocarray(void* p, size_t count, size_t size)  { return mi_reallocarray(p, count, size); }
+// some systems define reallocarr so mark it as a weak symbol (#751)
+mi_decl_weak int reallocarr(void* p, size_t count, size_t size)    { return mi_reallocarr(p, count, size); }
 
 #if defined(__wasi__)
   // forward __libc interface (see PR #667)
diff --git a/src/alloc.c b/src/alloc.c
index e2273d28..8a76d3d3 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -922,9 +922,13 @@ static bool mi_try_new_handler(bool nothrow) {
   #endif
   if (h==NULL) {
     _mi_error_message(ENOMEM, "out of memory in 'new'");
+    #if defined(_CPPUNWIND) || defined(__cpp_exceptions)  // exceptions are not always enabled
     if (!nothrow) {
       throw std::bad_alloc();
     }
+    #else
+    MI_UNUSED(nothrow);
+    #endif
     return false;
   }
   else {
diff --git a/src/arena.c b/src/arena.c
index e08ea22a..fc8a79c6 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -163,6 +163,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m
 
   // success
   *memid = _mi_memid_create(MI_MEM_STATIC);
+  memid->initially_zero = true;
   const size_t start = _mi_align_up(oldtop, alignment);
   uint8_t* const p = &mi_arena_static[start];
   _mi_memzero(p, size);
@@ -180,8 +181,10 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st
   p = _mi_os_alloc(size, memid, stats);
   if (p == NULL) return NULL;
 
+  // zero the OS memory if needed
   if (!memid->initially_zero) {
     _mi_memzero_aligned(p, size);
+    memid->initially_zero = true;
   }
   return p;
 }
@@ -480,7 +483,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
     // schedule decommit
     mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
     if (expire != 0) {
-      mi_atomic_addi64_acq_rel(&arena->purge_expire, delay/10);  // add smallish extra delay
+      mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10));  // add smallish extra delay
     }
     else {
       mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay);
@@ -524,7 +527,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
   if (!force && expire > now) return false;
 
   // reset expire (if not already set concurrently)
-  mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0);
+  mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0);
 
   // potential purges scheduled, walk through the bitmap
   bool any_purged = false;
diff --git a/src/init.c b/src/init.c
index fe885a5b..fda17d70 100644
--- a/src/init.c
+++ b/src/init.c
@@ -425,7 +425,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap)  {
   #if defined(MI_TLS_SLOT)
   mi_prim_tls_slot_set(MI_TLS_SLOT,heap);
   #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-  *mi_tls_pthread_heap_slot() = heap;
+  *mi_prim_tls_pthread_heap_slot() = heap;
   #elif defined(MI_TLS_PTHREAD)
   // we use _mi_heap_default_key
   #else
diff --git a/src/os.c b/src/os.c
index 69ad2bf9..b98950a4 100644
--- a/src/os.c
+++ b/src/os.c
@@ -73,10 +73,6 @@ void _mi_os_init(void) {
 bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
 bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
 
-static void* mi_align_up_ptr(void* p, size_t alignment) {
-  return (void*)_mi_align_up((uintptr_t)p, alignment);
-}
-
 static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
   mi_assert_internal(alignment != 0);
   uintptr_t mask = alignment - 1;
diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c
new file mode 100644
index 00000000..c0fa0f4a
--- /dev/null
+++ b/src/prim/emscripten/prim.c
@@ -0,0 +1,251 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
+
+// Design
+// ======
+//
+// mimalloc is built on top of emmalloc. emmalloc is a minimal allocator on top
+// of sbrk. The reason for having three layers here is that we want mimalloc to
+// be able to allocate and release system memory properly, the same way it would
+// when using VirtualAlloc on Windows or mmap on POSIX, and sbrk is too limited.
+// Specifically, sbrk can only go up and down, and not "skip" over regions, and
+// so we end up either never freeing memory to the system, or we can get stuck
+// with holes.
+//
+// Atm wasm generally does *not* free memory back the system: once grown, we do
+// not shrink back down (https://github.com/WebAssembly/design/issues/1397).
+// However, that is expected to improve
+// (https://github.com/WebAssembly/memory-control/blob/main/proposals/memory-control/Overview.md)
+// and so we do not want to bake those limitations in here.
+//
+// Even without that issue, we want our system allocator to handle holes, that
+// is, it should merge freed regions and allow allocating new content there of
+// the full size, etc., so that we do not waste space. That means that the
+// system allocator really does need to handle the general problem of allocating
+// and freeing variable-sized chunks of memory in a random order, like malloc/
+// free do. And so it makes sense to layer mimalloc on top of such an
+// implementation.
+//
+// emmalloc makes sense for the lower level because it is small and simple while
+// still fully handling merging of holes etc. It is not the most efficient
+// allocator, but our assumption is that mimalloc needs to be fast while the
+// system allocator underneath it is called much less frequently.
+//
+
+//---------------------------------------------
+// init
+//---------------------------------------------
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config) {
+  config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+  config->alloc_granularity = 16;
+  config->has_overcommit = false;
+  config->must_free_whole = true;
+  config->has_virtual_reserve = false;
+}
+
+extern void emmalloc_free(void*);
+
+int _mi_prim_free(void* addr, size_t size) {
+  MI_UNUSED(size);
+  emmalloc_free(addr);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Allocation
+//---------------------------------------------
+
+extern void* emmalloc_memalign(size_t, size_t);
+
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit);
+  *is_large = false;
+  // TODO: Track the highest address ever seen; first uses of it are zeroes.
+  //       That assumes no one else uses sbrk but us (they could go up,
+  //       scribble, and then down), but we could assert on that perhaps.
+  *is_zero = false;
+  // emmalloc has some limitations on alignment size.
+  // TODO: Why does mimalloc ask for an align of 4MB? that ends up allocating
+  //       8, which wastes quite a lot for us in wasm. If that is unavoidable,
+  //       we may want to improve emmalloc to support such alignment. See also
+  //       https://github.com/emscripten-core/emscripten/issues/20645
+  #define MIN_EMMALLOC_ALIGN           8
+  #define MAX_EMMALLOC_ALIGN (1024*1024)
+  if (try_alignment < MIN_EMMALLOC_ALIGN) {
+    try_alignment = MIN_EMMALLOC_ALIGN;
+  } else if (try_alignment > MAX_EMMALLOC_ALIGN) {
+    try_alignment = MAX_EMMALLOC_ALIGN;
+  }
+  void* p = emmalloc_memalign(try_alignment, size);
+  *addr = p;
+  if (p == 0) {
+    return ENOMEM;
+  }
+  return 0;
+}
+
+
+//---------------------------------------------
+// Commit/Reset
+//---------------------------------------------
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  // See TODO above.
+  *is_zero = false;
+  return 0;
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  *needs_recommit = false;
+  return 0;
+}
+
+int _mi_prim_reset(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Huge pages and NUMA nodes
+//---------------------------------------------
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  *is_zero = true;
+  *addr = NULL;
+  return ENOSYS;
+}
+
+size_t _mi_prim_numa_node(void) {
+  return 0;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  return 1;
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+#include <emscripten/html5.h>
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  return emscripten_date_now();
+}
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  // use defaults
+  MI_UNUSED(pinfo);
+}
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+#include <emscripten/console.h>
+
+void _mi_prim_out_stderr( const char* msg) {
+  emscripten_console_error(msg);
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  // For code size reasons, do not support environ customization for now.
+  MI_UNUSED(name);
+  MI_UNUSED(result);
+  MI_UNUSED(result_size);
+  return false;
+}
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  int err = getentropy(buf, buf_len);
+  return !err;
+}
+
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+#ifdef __EMSCRIPTEN_SHARED_MEMORY__
+
+// use pthread local storage keys to detect thread ending
+// (and used with MI_TLS_PTHREADS for the default heap)
+pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+
+static void mi_pthread_done(void* value) {
+  if (value!=NULL) {
+    _mi_thread_done((mi_heap_t*)value);
+  }
+}
+
+void _mi_prim_thread_init_auto_done(void) {
+  mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+  pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing to do
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, heap);
+  }
+}
+
+#else
+
+void _mi_prim_thread_init_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  MI_UNUSED(heap);
+
+}
+#endif
diff --git a/src/prim/prim.c b/src/prim/prim.c
index 9a597d8e..3b7d3736 100644
--- a/src/prim/prim.c
+++ b/src/prim/prim.c
@@ -18,6 +18,9 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_USE_SBRK
 #include "wasi/prim.c"     // memory-grow or sbrk (Wasm)
 
+#elif defined(__EMSCRIPTEN__)
+#include "emscripten/prim.c" // emmalloc_*, + pthread support
+
 #else
 #include "unix/prim.c"     // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.)
 
diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 54bf57b2..2035e1a4 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -27,16 +27,20 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #include <sys/mman.h>  // mmap
 #include <unistd.h>    // sysconf
-
+#include <fcntl.h>     // open, close, read, access
+  
 #if defined(__linux__)
   #include <features.h>
-  #include <fcntl.h>
+  #if defined(MI_NO_THP)
+  #include <sys/prctl.h>
+  #endif
   #if defined(__GLIBC__)
   #include <linux/mman.h> // linux mmap flags
   #else
   #include <sys/mman.h>
   #endif
 #elif defined(__APPLE__)
+  #include <AvailabilityMacros.h>
   #include <TargetConditionals.h>
   #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
   #include <mach/vm_statistics.h>
@@ -50,17 +54,19 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <sys/sysctl.h>
 #endif
 
-#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__)
+#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__) && !defined(__sun)
   #define MI_HAS_SYSCALL_H
   #include <sys/syscall.h>
 #endif
 
+
 //------------------------------------------------------------------------------------
 // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
 // and do allocation themselves; using syscalls prevents recursion when mimalloc is 
 // still initializing (issue #713)
 //------------------------------------------------------------------------------------
 
+
 #if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access)
 
 static int mi_prim_open(const char* fpath, int open_flags) {
@@ -76,7 +82,7 @@ static int mi_prim_access(const char *fpath, int mode) {
   return syscall(SYS_access,fpath,mode);
 }
 
-#elif !defined(__APPLE__)  // avoid unused warnings
+#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) && !defined(__sun) // avoid unused warnings on macOS and Solaris
 
 static int mi_prim_open(const char* fpath, int open_flags) {
   return open(fpath,open_flags);
@@ -125,7 +131,8 @@ static bool unix_detect_overcommit(void) {
   return os_overcommit;
 }
 
-void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
+void _mi_prim_mem_init( mi_os_mem_config_t* config ) 
+{
   long psize = sysconf(_SC_PAGESIZE);
   if (psize > 0) {
     config->page_size = (size_t)psize;
@@ -135,6 +142,17 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
   config->has_overcommit = unix_detect_overcommit();
   config->must_free_whole = false;    // mmap can free in parts
   config->has_virtual_reserve = true; // todo: check if this true for NetBSD?  (for anonymous mmap with PROT_NONE)
+
+  // disable transparent huge pages for this process?
+  #if defined(MI_NO_THP) && (defined(__linux__) || defined(__ANDROID__))
+  int val = 0;
+  if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) {
+    // Most likely since distros often come with always/madvise settings.
+    val = 1;
+    // Disabling only for mimalloc process rather than touching system wide settings
+    (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0);
+  }
+  #endif
 }
 
 
@@ -276,7 +294,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
         *is_large = true;
         p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
         #ifdef MAP_HUGE_1GB
-        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+        if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) {
           mi_huge_pages_available = false; // don't try huge 1GiB pages again
           _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno);
           lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
@@ -310,7 +328,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
       #elif defined(__sun)
       if (allow_large && _mi_os_use_large_page(size, try_alignment)) {
         struct memcntl_mha cmd = {0};
-        cmd.mha_pagesize = large_os_page_size;
+        cmd.mha_pagesize = _mi_os_large_page_size();
         cmd.mha_cmd = MHA_MAPSIZE_VA;
         if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
           *is_large = true;
@@ -731,28 +749,20 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
 // Random
 //----------------------------------------------------------------
 
-#if defined(__APPLE__)
-
-#include <AvailabilityMacros.h>
-#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
+#if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070
 #include <CommonCrypto/CommonCryptoError.h>
 #include <CommonCrypto/CommonRandom.h>
-#endif
+
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
-  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
-    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
-    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
-    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
-  #else
-    // fall back on older macOS
-    arc4random_buf(buf, buf_len);
-    return true;
-  #endif
+  // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
+  // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+  return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);  
 }
 
 #elif defined(__ANDROID__) || defined(__DragonFly__) || \
       defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
-      defined(__sun) 
+      defined(__sun) || \
+      (defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070)
 
 #include <stdlib.h>
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
@@ -760,11 +770,10 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
   return true;
 }
 
-#elif defined(__linux__) || defined(__HAIKU__)
+#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__)   // for old apple versions < 1070 (issue #829)
 
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <fcntl.h>
 #include <errno.h>
 
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
@@ -835,7 +844,9 @@ void _mi_prim_thread_init_auto_done(void) {
 }
 
 void _mi_prim_thread_done_auto_done(void) {
-  // nothing to do
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // do not leak the key, see issue #809
+    pthread_key_delete(_mi_heap_default_key);
+  }
 }
 
 void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c
index 50511f0b..f74acd2a 100644
--- a/src/prim/wasi/prim.c
+++ b/src/prim/wasi/prim.c
@@ -12,6 +12,9 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc/atomic.h"
 #include "mimalloc/prim.h"
 
+#include <stdio.h>   // fputs
+#include <stdlib.h>  // getenv
+
 //---------------------------------------------
 // Initialize
 //---------------------------------------------
@@ -40,6 +43,8 @@ int _mi_prim_free(void* addr, size_t size ) {
 //---------------------------------------------
 
 #if defined(MI_USE_SBRK)
+  #include <unistd.h>  // for sbrk
+
   static void* mi_memory_grow( size_t size ) {
     void* p = sbrk(size);
     if (p == (void*)(-1)) return NULL;
diff --git a/src/segment-map.c b/src/segment-map.c
index 4c2104bd..a306ec67 100644
--- a/src/segment-map.c
+++ b/src/segment-map.c
@@ -29,6 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
 
 static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
+  // note: segment can be invalid or NULL.
   mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
   if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
     *bitidx = 0;
@@ -70,8 +71,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment) {
 // Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
 static mi_segment_t* _mi_segment_of(const void* p) {
   if (p == NULL) return NULL;
-  mi_segment_t* segment = _mi_ptr_segment(p);
-  mi_assert_internal(segment != NULL);
+  mi_segment_t* segment = _mi_ptr_segment(p);  // segment can be NULL  
   size_t bitidx;
   size_t index = mi_segment_map_index_of(segment, &bitidx);
   // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
diff --git a/test/main-override.cpp b/test/main-override.cpp
index f9ac7327..64ea178b 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -100,7 +100,7 @@ static void various_tests() {
   auto tbuf = new unsigned char[sizeof(Test)];
   t = new (tbuf) Test(42);
   t->~Test();
-  delete tbuf;
+  delete[] tbuf;
 }
 
 class Static {