// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for EnvToInt64 (see flag note below)
#include "base/spinlock.h"              // for SpinLockHolder
#include "getenv_safe.h"                // for TCMallocGetenvSafe
#include "central_freelist.h"           // for CentralFreeListPadded

using std::min;
using std::max;

// Note: this is initialized manually in InitModule to ensure that
// it's configured at the right time.
//
// DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
//              EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
//                         kDefaultOverallThreadCacheSize),
//              "Bound on the total amount of bytes allocated to "
//              "thread caches. This bound is not strict, so it is possible "
//              "for the cache to go over this bound in certain circumstances. "
//              "Maximum value of this flag is capped to 1 GB.");

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache::ThreadLocalData ThreadCache::threadlocal_data_
    ATTR_INITIAL_EXEC CACHELINE_ALIGNED;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;
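
// Set up a freshly created thread cache: claim an initial byte budget from
// the global pool, size each per-class free list, and seed allocation
// sampling from the thread id.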
void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    SetMaxSize(kMinThreadCacheSize);

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_ = tid;
  in_setspecific_ = false;
  for (uint32 cl = 0; cl < Static::num_size_classes(); ++cl) {
    list_[cl].Init(Static::sizemap()->class_to_size(cl));
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(uint64_t{sampler_seed} ^ reinterpret_cast<uintptr_t>(&sampler_seed));
}
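
// Return every cached object to the central free lists; called when a
// thread's cache is being torn down.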
void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (uint32 cl = 0; cl < Static::num_size_classes(); ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return
// the result of calling oom_handler.
void* ThreadCache::FetchFromCentralCache(uint32 cl, int32_t byte_size,
                                         void *(*oom_handler)(size_t size)) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  if (fetch_count == 0) {
    ASSERT(start == NULL);
    return oom_handler(byte_size);
  }
  ASSERT(start != NULL);

  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    list->PushRange(fetch_count, SLL_Next(start), end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}
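
// Called from the deallocation path when a per-class free list grows past
// its max_length: credit size_ for the object that triggered the call,
// release one batch back to the central cache, adapt max_length so it
// converges on the transfer batch size, and scavenge if the cache is now
// over its byte budget.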
void ThreadCache::ListTooLong(FreeList* list, uint32 cl) {
  size_ += list->object_size();

  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }

  if (PREDICT_FALSE(size_ > max_size_)) {
    Scavenge();
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, uint32 cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  for (int cl = 0; cl < Static::num_size_classes(); cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}
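
// Grow this thread's max_size_ by kStealAmount: take the space from the
// unclaimed pool if any remains, otherwise steal it from another thread's
// budget, scanning round-robin via next_memory_steal_.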
void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    SetMaxSize(max_size_ + kStealAmount);
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->SetMaxSize(next_memory_steal_->max_size_ - kStealAmount);
    SetMaxSize(max_size_ + kStealAmount);

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return Sampler::GetSamplePeriod();
}
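
// One-time global initialization: honor TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES,
// set up the static allocator state under pageheap_lock, then run the parts
// of initialization that may themselves call malloc outside the lock.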
void ThreadCache::InitModule() {
  {
    SpinLockHolder h(Static::pageheap_lock());
    if (phinited) {
      return;
    }
    const char *tcb = TCMallocGetenvSafe("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES");
    if (tcb) {
      set_overall_thread_cache_size(strtoll(tcb, NULL, 10));
    }
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = true;
  }

  // We do the "late" part of initialization without holding the lock since
  // there is a chance it'll recurse into malloc.
  Static::InitLateMaybeRecursive();

#ifndef NDEBUG
  // pthread_atfork above may malloc sometimes. Let's ensure we test
  // that malloc works from here.
  (operator delete)((operator new)(1));
#endif
}
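
// Create the pthread TSD key whose destructor (DestroyThreadCache) tears
// down a thread's cache when that thread exits.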
void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}
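
// Find the calling thread's ThreadCache, creating and registering one if it
// does not exist yet.  Written to survive recursive malloc calls made by
// pthread_setspecific() itself.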
ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  if (!tsd_inited_) {
#ifndef NDEBUG
    // Tests that freeing nullptr very early is working.
    free(NULL);
#endif

    InitModule();
  }

  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;

  bool search_condition = true;
#ifdef HAVE_TLS
  static __thread ThreadCache** current_heap_ptr ATTR_INITIAL_EXEC;
  if (tsd_inited_) {
    // In the most common case we avoid the expensive linear search
    // through all heaps (see below).  Working TLS also enables faster
    // protection from malloc recursion in pthread_setspecific.
    search_condition = false;

    if (current_heap_ptr != NULL) {
      // We're being recursively called by pthread_setspecific below.
      return *current_heap_ptr;
    }
    current_heap_ptr = &heap;
  }
#endif

  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    if (search_condition) {
      for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
        if (h->tid_ == me) {
          heap = h;
          break;
        }
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_data_.heap = heap;
    threadlocal_data_.fast_path_heap = heap;
#endif
    heap->in_setspecific_ = false;
  }
#ifdef HAVE_TLS
  current_heap_ptr = NULL;
#endif
  return heap;
}

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}
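
// Drop the calling thread's cache so its memory and byte budget return to
// the global pools; used when a thread is marked idle.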
void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;              // No thread cache to remove
  if (heap->in_setspecific_) return;     // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_data_.heap = NULL;
  threadlocal_data_.fast_path_heap = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}
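
// pthread TSD destructor registered in InitTSD(); runs when a thread that
// owns a cache exits.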
void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_data_.heap = NULL;
  threadlocal_data_.fast_path_heap = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}
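
// Common teardown used by BecomeIdle() and the TSD destructor: flush the
// cache to the central lists, unlink it from the global list, and return its
// byte budget to the unclaimed pool.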
void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}
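
// Re-divide overall_thread_cache_size_ across the live thread heaps after
// the thread count or the overall limit changes.  Per-thread limits are
// shrunk proportionally when the budget drops, but are never inflated here,
// so the slow-start growth of max_size_ is preserved.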
void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->SetMaxSize(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}
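
// Sum cached bytes (and, optionally, per-class free list lengths) across all
// thread heaps.  Walks thread_heaps_ without taking a lock, so the caller is
// presumably expected to hold pageheap_lock.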
void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < Static::num_size_classes(); ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}
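
// Set the global thread-cache byte budget (clamped to
// [kMinThreadCacheSize, 1GB]) and rebalance the per-thread limits.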
void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc