728 lines
24 KiB
C++
728 lines
24 KiB
C++
// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
|
|
/* Copyright (c) 2005-2007, Google Inc.
|
|
* Copyright (c) 2023, gperftools Contributors
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are
|
|
* met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following disclaimer
|
|
* in the documentation and/or other materials provided with the
|
|
* distribution.
|
|
* * Neither the name of Google Inc. nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* ---
|
|
* Author: Markus Gutschke
|
|
*
|
|
* Substantial upgrades by Aliaksey Kandratsenka. All bugs are mine.
|
|
*/
|
|
#ifndef _GNU_SOURCE
|
|
#define _GNU_SOURCE
|
|
#endif
|
|
|
|
#include "base/linuxthreads.h"
|
|
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <limits.h>
|
|
#include <sched.h>
|
|
#include <signal.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/ptrace.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/wait.h>
|
|
#include <unistd.h>
|
|
|
|
#include <atomic>
|
|
|
|
#include "base/basictypes.h"
|
|
#include "base/logging.h"
|
|
|
|
#ifndef CLONE_UNTRACED
|
|
#define CLONE_UNTRACED 0x00800000
|
|
#endif
|
|
|
|
#ifndef PR_SET_PTRACER
|
|
#define PR_SET_PTRACER 0x59616d61
|
|
#endif
|
|
|
|
namespace {
|
|
|
|
// Scoped wrapper around prctl(PR_SET_PTRACER, ...).
//
// Prepare() asks the kernel to accept the given pid as a ptracer of the
// calling process. If that request succeeded, the destructor issues
// prctl(PR_SET_PTRACER, 0, ...) to withdraw it again.
class SetPTracerSetup {
public:
  // Registers `clone_pid` via PR_SET_PTRACER. A failing prctl is ignored:
  // in that case there is simply nothing to undo later.
  void Prepare(int clone_pid) {
    const bool accepted = (prctl(PR_SET_PTRACER, clone_pid, 0, 0, 0) == 0);
    if (accepted) {
      armed_ = true;
    }
  }

  ~SetPTracerSetup() {
    if (!armed_) {
      return;
    }
    prctl(PR_SET_PTRACER, 0, 0, 0, 0);
  }

private:
  // True once a PR_SET_PTRACER request has succeeded.
  bool armed_ = false;
};
|
|
|
|
// Owns a file descriptor and close()s it on destruction.
//
// A negative descriptor means "nothing owned". The class is deliberately
// non-copyable: two copies holding the same fd would both close it, and
// the second close could hit an unrelated descriptor that reused the
// number in the meantime.
class UniqueFD {
public:
  explicit UniqueFD(int fd) : fd_(fd) {}

  UniqueFD(const UniqueFD&) = delete;
  UniqueFD& operator=(const UniqueFD&) = delete;

  // Gives up ownership: returns the descriptor and leaves this object
  // empty so the destructor will not close it.
  int ReleaseFD() {
    int retval = fd_;
    fd_ = -1;
    return retval;
  }

  ~UniqueFD() {
    if (fd_ < 0) {
      return;
    }
    // Nothing useful can be done about a close() failure here.
    (void)close(fd_);
  }

private:
  int fd_;
};
|
|
|
|
// Minimal scope guard: stores a copy of a callable and invokes it from
// the destructor, i.e. when the guard object goes out of scope.
template <typename Body>
struct SimpleCleanup {
  // The stored callable; invoked exactly once per destroyed guard object.
  const Body body;

  explicit SimpleCleanup(const Body& fn) : body(fn) {}

  ~SimpleCleanup() { body(); }
};
|
|
|
|
template <typename Body>
|
|
SimpleCleanup<Body> MakeSimpleCleanup(const Body& body) {
|
|
return SimpleCleanup<Body>{body};
|
|
};
|
|
|
|
} // namespace
|
|
|
|
/* Synchronous signals that should not be blocked while in the lister thread.
|
|
*/
|
|
static const int sync_signals[] = {
  SIGABRT,
  SIGILL,
  SIGFPE,
  SIGSEGV,
  SIGBUS,
#ifdef SIGEMT
  SIGEMT,  /* only present on platforms that define it */
#endif
  SIGSYS,
  SIGTRAP,
  SIGXCPU,
  SIGXFSZ,
};
|
|
|
|
/* Starts `fn(arg)` in a new task created with clone().
 *
 * The new task gets no stack of its own: it runs on the caller's stack,
 * a fixed gap beyond the address of this function's `arg` parameter (in
 * the direction of stack growth). The function must therefore stay
 * out-of-line, and the caller must use very little stack until the
 * clone has terminated.
 *
 * Returns whatever clone() returns: the new task id in the caller, or a
 * negative value on failure.
 */
ATTRIBUTE_NOINLINE
static int local_clone (int (*fn)(void *), void *arg) {
#ifdef __PPC64__
  /* To keep the gap from crossing page boundaries, increase it to the
   * large page size that most PowerPC systems use. */

  // FIXME(alk): I don't really understand why ppc needs this and why
  // 64k pages matter. I.e. some other architectures have 64k pages,
  // so should we do the same there?
  uintptr_t clone_stack_size = 64 << 10;
#else
  uintptr_t clone_stack_size = 4 << 10;
#endif

  // `arg` points into the caller's frame while `&arg` lives in ours, so
  // comparing the two addresses reveals the direction of stack growth.
  bool grows_to_low = (&arg < arg);
  if (grows_to_low) {
    // Negate clone_stack_size if stack grows to lower addresses
    // (common for arch-es that matter).
    clone_stack_size = ~clone_stack_size + 1;
  }

#if defined(__i386__) || defined(__x86_64__) || defined(__riscv) || defined(__arm__) || defined(__aarch64__)
  // Sanity check code above. We know that those arch-es grow stack to
  // lower addresses.
  CHECK(grows_to_low);
#endif

  /* Leave 4kB of gap (64kB on ppc64) between the callers stack and the
   * new clone. This should be more than sufficient for the caller to
   * call waitpid() until the cloned thread terminates.
   *
   * It is important that we set the CLONE_UNTRACED flag, because newer
   * versions of "gdb" otherwise attempt to attach to our thread, and will
   * attempt to reap its status codes. This subsequently results in the
   * caller hanging indefinitely in waitpid(), waiting for a change in
   * status that will never happen. By setting the CLONE_UNTRACED flag, we
   * prevent "gdb" from stealing events, but we still expect the thread
   * lister to fail, because it cannot PTRACE_ATTACH to the process that
   * is being debugged. This is OK and the error code will be reported
   * correctly.
   *
   * CLONE_FILES is deliberately NOT passed. The start-up handshake has
   * the child close its write end of the start pipe and then block in
   * read() until the parent closes the other write end. With a shared
   * descriptor table the child's close() would close the parent's
   * descriptor as well: read() would see EOF immediately (before the
   * parent has granted ptrace permission), and the parent's later
   * close() would act on a stale, possibly reused, descriptor number.
   * Without CLONE_FILES the child works on its own copy of the table,
   * which still contains every descriptor that was open at clone time.
   */
  uintptr_t stack_addr = reinterpret_cast<uintptr_t>(&arg) + clone_stack_size;
  stack_addr &= ~63; // align stack address on 64 bytes (x86 needs 16, but lets be generous)
  return clone(fn, reinterpret_cast<void*>(stack_addr),
               CLONE_VM|CLONE_FS|CLONE_UNTRACED,
               arg, 0, 0, 0);
}
|
|
|
|
|
|
/* Local substitute for the atoi() function, which is not necessarily safe
|
|
* to call once threads are suspended (depending on whether libc looks up
|
|
* locale information, when executing atoi()).
|
|
*/
|
|
/* Parses an optional leading '-' followed by decimal digits and returns
 * the value. Parsing stops at the first non-digit; if there are no
 * digits the result is 0. Leading whitespace and '+' are not accepted.
 */
static int local_atoi(const char *s) {
  const int negative = (*s == '-');
  const char *cursor = negative ? s + 1 : s;
  int value = 0;

  for (; *cursor >= '0' && *cursor <= '9'; ++cursor) {
    value = 10 * value + (*cursor - '0');
  }

  if (negative) {
    return -value;
  }
  return value;
}
|
|
|
|
/* Issues PTRACE_DETACH for `pid` and returns the ptrace() result
 * narrowed to int (negative on failure).
 */
static int ptrace_detach(pid_t pid) {
  const long rc = ptrace(PTRACE_DETACH, pid, nullptr, nullptr);
  return static_cast<int>(rc);
}
|
|
|
|
/* Re-runs fn until it doesn't cause EINTR
|
|
*/
|
|
/* Note: `fn` is an expression, not a function; it is evaluated again on
 * every retry, and the loop body is intentionally empty. */
#define NO_INTR(fn) do {} while ((fn) < 0 && errno == EINTR)
|
|
|
|
/* abort() is not safely reentrant, and changes its behavior each time
|
|
* it is called. This means, if the main application ever called abort()
|
|
* we cannot safely call it again. This would happen if we were called
|
|
* from a SIGABRT signal handler in the main application. So, document
|
|
* that calling SIGABRT from the thread lister makes it not signal safe
|
|
* (and vice-versa).
|
|
* Also, since we share address space with the main application, we
|
|
* cannot call abort() from the callback and expect the main application
|
|
* to behave correctly afterwards. In fact, the only thing we can do, is
|
|
* to terminate the main application with extreme prejudice (aka
|
|
* PTRACE_KILL).
|
|
* We set up our own SIGABRT handler to do this.
|
|
* In order to find the main application from the signal handler, we
|
|
* need to store information about it in global variables. This is
|
|
* safe, because the main application should be suspended at this
|
|
* time. If the callback ever called TCMalloc_ResumeAllProcessThreads(), then
|
|
* we are running a higher risk, though. So, try to avoid calling
|
|
* abort() after calling TCMalloc_ResumeAllProcessThreads.
|
|
*/
|
|
/* State shared with SignalHandler: sig_pids points at the lister's pid
 * array (it is a plain pointer to volatile int; NULL when no array is
 * published) and sig_num_threads is the number of entries in use. */
static volatile int *sig_pids, sig_num_threads;
|
|
|
|
|
|
/* Signal handler to help us recover from dying while we are attached to
|
|
* other threads.
|
|
*/
|
|
static void SignalHandler(int signum, siginfo_t *si, void *data) {
|
|
RAW_LOG(ERROR, "Got fatal signal %d inside ListerThread", signum);
|
|
|
|
if (sig_pids != NULL) {
|
|
if (signum == SIGABRT) {
|
|
prctl(PR_SET_PDEATHSIG, 0);
|
|
while (sig_num_threads-- > 0) {
|
|
/* Not sure if sched_yield is really necessary here, but it does not */
|
|
/* hurt, and it might be necessary for the same reasons that we have */
|
|
/* to do so in ptrace_detach(). */
|
|
sched_yield();
|
|
ptrace(PTRACE_KILL, sig_pids[sig_num_threads], 0, 0);
|
|
}
|
|
} else if (sig_num_threads > 0) {
|
|
TCMalloc_ResumeAllProcessThreads(sig_num_threads, (int *)sig_pids);
|
|
}
|
|
}
|
|
sig_pids = NULL;
|
|
|
|
syscall(SYS_exit, signum == SIGABRT ? 1 : 2);
|
|
}
|
|
|
|
|
|
/* Try to dirty the stack, and hope that the compiler is not smart enough
|
|
* to optimize this function away. Or worse, the compiler could inline the
|
|
* function and permanently allocate the data on the stack.
|
|
*/
|
|
/* Touches `amount` bytes of stack below the caller.
 *
 * The buffer is zeroed and then handed to read() on an invalid
 * descriptor. That call cannot succeed; it is there only so that the
 * buffer escapes to a system call.
 */
static void DirtyStack(size_t amount) {
  char scratch[amount];
  memset(scratch, 0, amount);
  read(-1, scratch, amount);
}
|
|
|
|
|
|
/* Data structure for passing arguments to the lister thread.
|
|
*/
|
|
/* Size in bytes of the alternate signal stack buffer: allocated in
 * TCMalloc_ListAllProcessThreads and installed with sigaltstack() in
 * ListerThread. */
#define ALT_STACKSIZE (MINSIGSTKSZ + 4096)
|
|
|
|
struct ListerParams {
|
|
int result, err;
|
|
pid_t ppid;
|
|
int start_pipe_rd;
|
|
int start_pipe_wr;
|
|
char *altstack_mem;
|
|
ListAllProcessThreadsCallBack callback;
|
|
void *parameter;
|
|
va_list ap;
|
|
int proc_fd;
|
|
};
|
|
|
|
/* Record layout produced by the getdents64 system call (see man 2
 * getdents). Records are variable-length: d_reclen gives the distance
 * in bytes to the next record.
 */
struct kernel_dirent64 {
  int64_t        d_ino;     /* 64-bit inode number */
  int64_t        d_off;     /* 64-bit offset to next structure */
  unsigned short d_reclen;  /* Size of this dirent */
  unsigned char  d_type;    /* File type */
  char           d_name[];  /* Filename (null-terminated) */
};
|
|
|
|
/* Returns `ptr` advanced by `by_bytes` bytes (not by whole elements). */
static const kernel_dirent64 *BumpDirentPtr(const kernel_dirent64 *ptr, uintptr_t by_bytes) {
  const uintptr_t advanced = reinterpret_cast<uintptr_t>(ptr) + by_bytes;
  return reinterpret_cast<const kernel_dirent64*>(advanced);
}
|
|
|
|
static int ListerThread(struct ListerParams *args) {
|
|
int found_parent = 0;
|
|
pid_t clone_pid = syscall(SYS_gettid);
|
|
int proc = args->proc_fd, num_threads = 0;
|
|
int max_threads = 0, sig;
|
|
struct stat proc_sb;
|
|
stack_t altstack;
|
|
|
|
/* Wait for parent thread to set appropriate permissions to allow
|
|
* ptrace activity. Note we using pipe pair, so which ensures we
|
|
* don't sleep past parent's death.
|
|
*/
|
|
(void)close(args->start_pipe_wr);
|
|
{
|
|
char tmp;
|
|
read(args->start_pipe_rd, &tmp, sizeof(tmp));
|
|
}
|
|
|
|
// No point in continuing if parent dies before/during ptracing.
|
|
prctl(PR_SET_PDEATHSIG, SIGKILL);
|
|
|
|
/* Catch signals on an alternate pre-allocated stack. This way, we can
|
|
* safely execute the signal handler even if we ran out of memory.
|
|
*/
|
|
memset(&altstack, 0, sizeof(altstack));
|
|
altstack.ss_sp = args->altstack_mem;
|
|
altstack.ss_flags = 0;
|
|
altstack.ss_size = ALT_STACKSIZE;
|
|
sigaltstack(&altstack, nullptr);
|
|
|
|
/* Some kernels forget to wake up traced processes, when the
|
|
* tracer dies. So, intercept synchronous signals and make sure
|
|
* that we wake up our tracees before dying. It is the caller's
|
|
* responsibility to ensure that asynchronous signals do not
|
|
* interfere with this function.
|
|
*/
|
|
for (sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) {
|
|
struct sigaction sa;
|
|
memset(&sa, 0, sizeof(sa));
|
|
sa.sa_sigaction = SignalHandler;
|
|
sigfillset(&sa.sa_mask);
|
|
sa.sa_flags = SA_ONSTACK|SA_SIGINFO|SA_RESETHAND;
|
|
sigaction(sync_signals[sig], &sa, nullptr);
|
|
}
|
|
|
|
/* Read process directories in /proc/... */
|
|
for (;;) {
|
|
if (lseek(proc, 0, SEEK_SET) < 0) {
|
|
goto failure;
|
|
}
|
|
if (fstat(proc, &proc_sb) < 0) {
|
|
goto failure;
|
|
}
|
|
|
|
/* Since we are suspending threads, we cannot call any libc
|
|
* functions that might acquire locks. Most notably, we cannot
|
|
* call malloc(). So, we have to allocate memory on the stack,
|
|
* instead. Since we do not know how much memory we need, we
|
|
* make a best guess. And if we guessed incorrectly we retry on
|
|
* a second iteration (by jumping to "detach_threads").
|
|
*
|
|
* Unless the number of threads is increasing very rapidly, we
|
|
* should never need to do so, though, as our guestimate is very
|
|
* conservative.
|
|
*/
|
|
if (max_threads < proc_sb.st_nlink + 100) {
|
|
max_threads = proc_sb.st_nlink + 100;
|
|
}
|
|
|
|
/* scope */ {
|
|
pid_t pids[max_threads];
|
|
int added_entries = 0;
|
|
sig_num_threads = num_threads;
|
|
sig_pids = pids;
|
|
for (;;) {
|
|
// lets make sure to align buf to store kernel_dirent64-s properly.
|
|
int64_t buf[4096 / sizeof(int64_t)];
|
|
|
|
ssize_t nbytes = syscall(SYS_getdents64, proc, buf, sizeof(buf));
|
|
// fprintf(stderr, "nbytes = %zd\n", nbytes);
|
|
|
|
if (nbytes < 0) {
|
|
goto failure;
|
|
}
|
|
|
|
if (nbytes == 0) {
|
|
if (added_entries) {
|
|
/* Need to keep iterating over "/proc" in multiple
|
|
* passes until we no longer find any more threads. This
|
|
* algorithm eventually completes, when all threads have
|
|
* been suspended.
|
|
*/
|
|
added_entries = 0;
|
|
lseek(proc, 0, SEEK_SET);
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
|
|
const kernel_dirent64 *entry = reinterpret_cast<kernel_dirent64*>(buf);
|
|
const kernel_dirent64 *end = BumpDirentPtr(entry, nbytes);
|
|
|
|
for (;entry < end; entry = BumpDirentPtr(entry, entry->d_reclen)) {
|
|
if (entry->d_ino == 0) {
|
|
continue;
|
|
}
|
|
|
|
const char *ptr = entry->d_name;
|
|
// fprintf(stderr, "name: %s\n", ptr);
|
|
pid_t pid;
|
|
|
|
/* Some kernels hide threads by preceding the pid with a '.' */
|
|
if (*ptr == '.')
|
|
ptr++;
|
|
|
|
/* If the directory is not numeric, it cannot be a
|
|
* process/thread
|
|
*/
|
|
if (*ptr < '0' || *ptr > '9')
|
|
continue;
|
|
pid = local_atoi(ptr);
|
|
// fprintf(stderr, "pid = %d (%d)\n", pid, getpid());
|
|
|
|
if (!pid || pid == clone_pid) {
|
|
continue;
|
|
}
|
|
|
|
/* Attach (and suspend) all threads */
|
|
long i, j;
|
|
|
|
/* Found one of our threads, make sure it is no duplicate */
|
|
for (i = 0; i < num_threads; i++) {
|
|
/* Linear search is slow, but should not matter much for
|
|
* the typically small number of threads.
|
|
*/
|
|
if (pids[i] == pid) {
|
|
/* Found a duplicate; most likely on second pass */
|
|
goto next_entry;
|
|
}
|
|
}
|
|
|
|
/* Check whether data structure needs growing */
|
|
if (num_threads >= max_threads) {
|
|
/* Back to square one, this time with more memory */
|
|
goto detach_threads;
|
|
}
|
|
|
|
/* Attaching to thread suspends it */
|
|
pids[num_threads++] = pid;
|
|
sig_num_threads = num_threads;
|
|
|
|
if (ptrace(PTRACE_ATTACH, pid, (void *)0,
|
|
(void *)0) < 0) {
|
|
/* If operation failed, ignore thread. Maybe it
|
|
* just died? There might also be a race
|
|
* condition with a concurrent core dumper or
|
|
* with a debugger. In that case, we will just
|
|
* make a best effort, rather than failing
|
|
* entirely.
|
|
*/
|
|
num_threads--;
|
|
sig_num_threads = num_threads;
|
|
goto next_entry;
|
|
}
|
|
while (waitpid(pid, (int *)0, __WALL) < 0) {
|
|
if (errno != EINTR) {
|
|
ptrace_detach(pid);
|
|
num_threads--;
|
|
sig_num_threads = num_threads;
|
|
goto next_entry;
|
|
}
|
|
}
|
|
|
|
if (syscall(SYS_ptrace, PTRACE_PEEKDATA, pid, &i, &j) || i++ != j ||
|
|
syscall(SYS_ptrace, PTRACE_PEEKDATA, pid, &i, &j) || i != j) {
|
|
/* Address spaces are distinct. This is probably
|
|
* a forked child process rather than a thread.
|
|
*/
|
|
ptrace_detach(pid);
|
|
num_threads--;
|
|
sig_num_threads = num_threads;
|
|
goto next_entry;
|
|
}
|
|
|
|
found_parent |= pid == args->ppid;
|
|
added_entries++;
|
|
|
|
next_entry:;
|
|
} // entries iterations loop
|
|
} // getdents loop
|
|
|
|
/* If we never found the parent process, something is very wrong.
|
|
* Most likely, we are running in debugger. Any attempt to operate
|
|
* on the threads would be very incomplete. Let's just report an
|
|
* error to the caller.
|
|
*/
|
|
if (!found_parent) {
|
|
TCMalloc_ResumeAllProcessThreads(num_threads, pids);
|
|
return 3;
|
|
}
|
|
|
|
/* Now we are ready to call the callback,
|
|
* which takes care of resuming the threads for us.
|
|
*/
|
|
args->result = args->callback(args->parameter, num_threads,
|
|
pids, args->ap);
|
|
args->err = errno;
|
|
|
|
/* Callback should have resumed threads, but better safe than sorry */
|
|
if (TCMalloc_ResumeAllProcessThreads(num_threads, pids)) {
|
|
/* Callback forgot to resume at least one thread, report error */
|
|
args->err = EINVAL;
|
|
args->result = -1;
|
|
}
|
|
|
|
return 0;
|
|
|
|
detach_threads:
|
|
/* Resume all threads prior to retrying the operation */
|
|
TCMalloc_ResumeAllProcessThreads(num_threads, pids);
|
|
sig_pids = NULL;
|
|
num_threads = 0;
|
|
sig_num_threads = num_threads;
|
|
max_threads += 100;
|
|
} // pids[max_threads] scope
|
|
} // for (;;)
|
|
|
|
failure:
|
|
args->result = -1;
|
|
args->err = errno;
|
|
return 1;
|
|
}
|
|
|
|
/* This function gets the list of all linux threads of the current process
|
|
* passes them to the 'callback' along with the 'parameter' pointer; at the
|
|
* call back call time all the threads are paused via
|
|
* PTRACE_ATTACH.
|
|
* The callback is executed from a separate thread which shares only the
|
|
* address space, the filesystem, and the filehandles with the caller. Most
|
|
* notably, it does not share the same pid and ppid; and if it terminates,
|
|
* the rest of the application is still there. 'callback' is supposed to do
|
|
* or arrange for TCMalloc_ResumeAllProcessThreads. This happens automatically, if
|
|
* the thread raises a synchronous signal (e.g. SIGSEGV); asynchronous
|
|
* signals are blocked. If the 'callback' decides to unblock them, it must
|
|
* ensure that they cannot terminate the application, or that
|
|
* TCMalloc_ResumeAllProcessThreads will get called.
|
|
* It is an error for the 'callback' to make any library calls that could
|
|
* acquire locks. Most notably, this means that most system calls have to
|
|
* avoid going through libc. Also, this means that it is not legal to call
|
|
* exit() or abort().
|
|
* We return -1 on error and the return value of 'callback' on success.
|
|
*/
|
|
int TCMalloc_ListAllProcessThreads(void *parameter,
|
|
ListAllProcessThreadsCallBack callback, ...) {
|
|
char altstack_mem[ALT_STACKSIZE];
|
|
struct ListerParams args;
|
|
pid_t clone_pid;
|
|
int dumpable = 1;
|
|
int need_sigprocmask = 0;
|
|
sigset_t sig_blocked, sig_old;
|
|
int status, rc;
|
|
|
|
SetPTracerSetup ptracer_setup;
|
|
|
|
auto cleanup = MakeSimpleCleanup([&] () {
|
|
int old_errno = errno;
|
|
|
|
if (need_sigprocmask) {
|
|
sigprocmask(SIG_SETMASK, &sig_old, nullptr);
|
|
}
|
|
|
|
if (!dumpable) {
|
|
prctl(PR_SET_DUMPABLE, dumpable);
|
|
}
|
|
|
|
errno = old_errno;
|
|
});
|
|
|
|
va_start(args.ap, callback);
|
|
|
|
/* If we are short on virtual memory, initializing the alternate stack
|
|
* might trigger a SIGSEGV. Let's do this early, before it could get us
|
|
* into more trouble (i.e. before signal handlers try to use the alternate
|
|
* stack, and before we attach to other threads).
|
|
*/
|
|
memset(altstack_mem, 0, sizeof(altstack_mem));
|
|
|
|
/* Some of our cleanup functions could conceivable use more stack space.
|
|
* Try to touch the stack right now. This could be defeated by the compiler
|
|
* being too smart for it's own good, so try really hard.
|
|
*/
|
|
DirtyStack(32768);
|
|
|
|
/* Make this process "dumpable". This is necessary in order to ptrace()
|
|
* after having called setuid().
|
|
*/
|
|
dumpable = prctl(PR_GET_DUMPABLE, 0);
|
|
if (!dumpable) {
|
|
prctl(PR_SET_DUMPABLE, 1);
|
|
}
|
|
|
|
/* Fill in argument block for dumper thread */
|
|
args.result = -1;
|
|
args.err = 0;
|
|
args.ppid = getpid();
|
|
args.altstack_mem = altstack_mem;
|
|
args.parameter = parameter;
|
|
args.callback = callback;
|
|
|
|
NO_INTR(args.proc_fd = open("/proc/self/task/", O_RDONLY|O_DIRECTORY|O_CLOEXEC));
|
|
UniqueFD proc_closer{args.proc_fd};
|
|
|
|
if (args.proc_fd < 0) {
|
|
return -1;
|
|
}
|
|
|
|
int pipefds[2];
|
|
if (pipe2(pipefds, O_CLOEXEC)) {
|
|
return -1;
|
|
}
|
|
|
|
UniqueFD pipe_rd_closer{pipefds[0]};
|
|
UniqueFD pipe_wr_closer{pipefds[1]};
|
|
|
|
args.start_pipe_rd = pipefds[0];
|
|
args.start_pipe_wr = pipefds[1];
|
|
|
|
/* Before cloning the thread lister, block all asynchronous signals, as we */
|
|
/* are not prepared to handle them. */
|
|
sigfillset(&sig_blocked);
|
|
for (int sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) {
|
|
sigdelset(&sig_blocked, sync_signals[sig]);
|
|
}
|
|
if (sigprocmask(SIG_BLOCK, &sig_blocked, &sig_old)) {
|
|
return -1;
|
|
}
|
|
need_sigprocmask = 1;
|
|
|
|
// make sure all functions used by parent from local_clone to after
|
|
// waitpid have plt entries fully initialized. We cannot afford
|
|
// dynamic linker running relocations and messing with errno (see
|
|
// comment just below)
|
|
(void)prctl(PR_GET_PDEATHSIG, 0);
|
|
(void)close(-1);
|
|
(void)waitpid(INT_MIN, nullptr, 0);
|
|
|
|
/* After cloning, both the parent and the child share the same
|
|
* instance of errno. We deal with this by being very
|
|
* careful. Specifically, child immediately calls into sem_wait
|
|
* which never fails (cannot even EINTR), so doesn't touch errno.
|
|
*
|
|
* Parent sets up PR_SET_PTRACER prctl (if it fails, which usually
|
|
* doesn't happen, we ignore that failure). Then parent does close
|
|
* on write side of start pipe. After that child runs complex code,
|
|
* including arbitrary callback. So parent avoids screwing with
|
|
* errno by immediately calling waitpid with async signals disabled.
|
|
*
|
|
* I.e. errno is parent's up until close below. Then errno belongs
|
|
* to child up until it exits.
|
|
*/
|
|
clone_pid = local_clone((int (*)(void *))ListerThread, &args);
|
|
if (clone_pid < 0) {
|
|
return -1;
|
|
}
|
|
|
|
/* Most Linux kernels in the wild have Yama LSM enabled, so
|
|
* requires us to explicitly give permission for child to ptrace
|
|
* us. See man 2 ptrace for details. This then requires us to
|
|
* synchronize with the child (see close on start pipe
|
|
* below). I.e. so that child doesn't start ptracing before we've
|
|
* completed this prctl call.
|
|
*/
|
|
ptracer_setup.Prepare(clone_pid);
|
|
|
|
/* Closing write side of pipe works like releasing the lock. It
|
|
* allows the ListerThread to run past read() call on read side of
|
|
* pipe and ptrace us.
|
|
*/
|
|
close(pipe_wr_closer.ReleaseFD());
|
|
|
|
/* So here child runs (see ListerThread), it finds and ptraces all
|
|
* threads, runs whatever callback is setup and then
|
|
* detaches/resumes everything. In any case we wait for child's
|
|
* completion to gather status and synchronize everything. */
|
|
|
|
rc = waitpid(clone_pid, &status, __WALL);
|
|
|
|
if (rc < 0) {
|
|
if (errno == EINTR) {
|
|
RAW_LOG(FATAL, "BUG: EINTR from waitpid shouldn't be possible!");
|
|
}
|
|
// Any error waiting for child is sign of some bug, so abort
|
|
// asap. Continuing is unsafe anyways with child potentially writing to our
|
|
// stack.
|
|
RAW_LOG(FATAL, "BUG: waitpid inside TCMalloc_ListAllProcessThreads cannot fail, but it did. Raw errno: %d\n", errno);
|
|
} else if (WIFEXITED(status)) {
|
|
errno = args.err;
|
|
switch (WEXITSTATUS(status)) {
|
|
case 0: break; /* Normal process termination */
|
|
case 2: args.err = EFAULT; /* Some fault (e.g. SIGSEGV) detected */
|
|
args.result = -1;
|
|
break;
|
|
case 3: args.err = EPERM; /* Process is already being traced */
|
|
args.result = -1;
|
|
break;
|
|
default:args.err = ECHILD; /* Child died unexpectedly */
|
|
args.result = -1;
|
|
break;
|
|
}
|
|
} else if (!WIFEXITED(status)) {
|
|
args.err = EFAULT; /* Terminated due to an unhandled signal*/
|
|
args.result = -1;
|
|
}
|
|
|
|
errno = args.err;
|
|
return args.result;
|
|
}
|
|
|
|
/* This function resumes the list of all linux threads that
|
|
* TCMalloc_ListAllProcessThreads pauses before giving to its callback.
|
|
* The function returns non-zero if at least one thread was
|
|
* suspended and has now been resumed.
|
|
*/
|
|
int TCMalloc_ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
|
|
int detached_at_least_one = 0;
|
|
while (num_threads-- > 0) {
|
|
detached_at_least_one |= (ptrace_detach(thread_pids[num_threads]) >= 0);
|
|
}
|
|
return detached_at_least_one;
|
|
}
|