// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
/* Copyright (c) 2005-2007, Google Inc.
 * Copyright (c) 2023, gperftools Contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ---
 * Author: Markus Gutschke
 *
 * Substantial upgrades by Aliaksey Kandratsenka. All bugs are mine.
*/ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include "base/linuxthreads.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "base/basictypes.h" #include "base/logging.h" #ifndef CLONE_UNTRACED #define CLONE_UNTRACED 0x00800000 #endif #ifndef PR_SET_PTRACER #define PR_SET_PTRACER 0x59616d61 #endif namespace { class SetPTracerSetup { public: ~SetPTracerSetup() { if (need_cleanup_) { prctl(PR_SET_PTRACER, 0, 0, 0, 0); } } void Prepare(int clone_pid) { if (prctl(PR_SET_PTRACER, clone_pid, 0, 0, 0) == 0) { need_cleanup_ = true; } } private: bool need_cleanup_ = false; }; class UniqueFD { public: explicit UniqueFD(int fd) : fd_(fd) {} int ReleaseFD() { int retval = fd_; fd_ = -1; return retval; } ~UniqueFD() { if (fd_ < 0) { return; } (void)close(fd_); } private: int fd_; }; template struct SimpleCleanup { const Body body; explicit SimpleCleanup(const Body& body) : body(body) {} ~SimpleCleanup() { body(); } }; template SimpleCleanup MakeSimpleCleanup(const Body& body) { return SimpleCleanup{body}; }; } // namespace /* Synchronous signals that should not be blocked while in the lister thread. */ static const int sync_signals[] = { SIGABRT, SIGILL, SIGFPE, SIGSEGV, SIGBUS, #ifdef SIGEMT SIGEMT, #endif SIGSYS, SIGTRAP, SIGXCPU, SIGXFSZ }; ATTRIBUTE_NOINLINE static int local_clone (int (*fn)(void *), void *arg) { #ifdef __PPC64__ /* To avoid the gap cross page boundaries, increase by the large parge * size mostly PowerPC system uses. */ // FIXME(alk): I don't really understand why ppc needs this and why // 64k pages matter. I.e. some other architectures have 64k pages, // so should we do the same there? uintptr_t clone_stack_size = 64 << 10; #else uintptr_t clone_stack_size = 4 << 10; #endif bool grows_to_low = (&arg < arg); if (grows_to_low) { // Negate clone_stack_size if stack grows to lower addresses // (common for arch-es that matter). 
clone_stack_size = ~clone_stack_size + 1; } #if defined(__i386__) || defined(__x86_64__) || defined(__riscv) || defined(__arm__) || defined(__aarch64__) // Sanity check code above. We know that those arch-es grow stack to // lower addresses. CHECK(grows_to_low); #endif /* Leave 4kB of gap between the callers stack and the new clone. This * should be more than sufficient for the caller to call waitpid() until * the cloned thread terminates. * * It is important that we set the CLONE_UNTRACED flag, because newer * versions of "gdb" otherwise attempt to attach to our thread, and will * attempt to reap its status codes. This subsequently results in the * caller hanging indefinitely in waitpid(), waiting for a change in * status that will never happen. By setting the CLONE_UNTRACED flag, we * prevent "gdb" from stealing events, but we still expect the thread * lister to fail, because it cannot PTRACE_ATTACH to the process that * is being debugged. This is OK and the error code will be reported * correctly. */ uintptr_t stack_addr = reinterpret_cast(&arg) + clone_stack_size; stack_addr &= ~63; // align stack address on 64 bytes (x86 needs 16, but lets be generous) return clone(fn, reinterpret_cast(stack_addr), CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_UNTRACED, arg, 0, 0, 0); } /* Local substitute for the atoi() function, which is not necessarily safe * to call once threads are suspended (depending on whether libc looks up * locale information, when executing atoi()). */ static int local_atoi(const char *s) { int n = 0; int neg = *s == '-'; if (neg) s++; while (*s >= '0' && *s <= '9') n = 10*n + (*s++ - '0'); return neg ? -n : n; } static int ptrace_detach(pid_t pid) { return ptrace(PTRACE_DETACH, pid, nullptr, nullptr); } /* Re-runs fn until it doesn't cause EINTR */ #define NO_INTR(fn) do {} while ((fn) < 0 && errno == EINTR) /* abort() is not safely reentrant, and changes it's behavior each time * it is called. 
This means, if the main application ever called abort() * we cannot safely call it again. This would happen if we were called * from a SIGABRT signal handler in the main application. So, document * that calling SIGABRT from the thread lister makes it not signal safe * (and vice-versa). * Also, since we share address space with the main application, we * cannot call abort() from the callback and expect the main application * to behave correctly afterwards. In fact, the only thing we can do, is * to terminate the main application with extreme prejudice (aka * PTRACE_KILL). * We set up our own SIGABRT handler to do this. * In order to find the main application from the signal handler, we * need to store information about it in global variables. This is * safe, because the main application should be suspended at this * time. If the callback ever called TCMalloc_ResumeAllProcessThreads(), then * we are running a higher risk, though. So, try to avoid calling * abort() after calling TCMalloc_ResumeAllProcessThreads. */ static volatile int *sig_pids, sig_num_threads; /* Signal handler to help us recover from dying while we are attached to * other threads. */ static void SignalHandler(int signum, siginfo_t *si, void *data) { RAW_LOG(ERROR, "Got fatal signal %d inside ListerThread", signum); if (sig_pids != NULL) { if (signum == SIGABRT) { prctl(PR_SET_PDEATHSIG, 0); while (sig_num_threads-- > 0) { /* Not sure if sched_yield is really necessary here, but it does not */ /* hurt, and it might be necessary for the same reasons that we have */ /* to do so in ptrace_detach(). */ sched_yield(); ptrace(PTRACE_KILL, sig_pids[sig_num_threads], 0, 0); } } else if (sig_num_threads > 0) { TCMalloc_ResumeAllProcessThreads(sig_num_threads, (int *)sig_pids); } } sig_pids = NULL; syscall(SYS_exit, signum == SIGABRT ? 1 : 2); } /* Try to dirty the stack, and hope that the compiler is not smart enough * to optimize this function away. 
Or worse, the compiler could inline the * function and permanently allocate the data on the stack. */ static void DirtyStack(size_t amount) { char buf[amount]; memset(buf, 0, amount); read(-1, buf, amount); } /* Data structure for passing arguments to the lister thread. */ #define ALT_STACKSIZE (MINSIGSTKSZ + 4096) struct ListerParams { int result, err; pid_t ppid; int start_pipe_rd; int start_pipe_wr; char *altstack_mem; ListAllProcessThreadsCallBack callback; void *parameter; va_list ap; int proc_fd; }; struct kernel_dirent64 { // see man 2 getdents int64_t d_ino; /* 64-bit inode number */ int64_t d_off; /* 64-bit offset to next structure */ unsigned short d_reclen; /* Size of this dirent */ unsigned char d_type; /* File type */ char d_name[]; /* Filename (null-terminated) */ }; static const kernel_dirent64 *BumpDirentPtr(const kernel_dirent64 *ptr, uintptr_t by_bytes) { return reinterpret_cast(reinterpret_cast(ptr) + by_bytes); } static int ListerThread(struct ListerParams *args) { int found_parent = 0; pid_t clone_pid = syscall(SYS_gettid); int proc = args->proc_fd, num_threads = 0; int max_threads = 0, sig; struct stat proc_sb; stack_t altstack; /* Wait for parent thread to set appropriate permissions to allow * ptrace activity. Note we using pipe pair, so which ensures we * don't sleep past parent's death. */ (void)close(args->start_pipe_wr); { char tmp; read(args->start_pipe_rd, &tmp, sizeof(tmp)); } // No point in continuing if parent dies before/during ptracing. prctl(PR_SET_PDEATHSIG, SIGKILL); /* Catch signals on an alternate pre-allocated stack. This way, we can * safely execute the signal handler even if we ran out of memory. */ memset(&altstack, 0, sizeof(altstack)); altstack.ss_sp = args->altstack_mem; altstack.ss_flags = 0; altstack.ss_size = ALT_STACKSIZE; sigaltstack(&altstack, nullptr); /* Some kernels forget to wake up traced processes, when the * tracer dies. 
So, intercept synchronous signals and make sure * that we wake up our tracees before dying. It is the caller's * responsibility to ensure that asynchronous signals do not * interfere with this function. */ for (sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) { struct sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_sigaction = SignalHandler; sigfillset(&sa.sa_mask); sa.sa_flags = SA_ONSTACK|SA_SIGINFO|SA_RESETHAND; sigaction(sync_signals[sig], &sa, nullptr); } /* Read process directories in /proc/... */ for (;;) { if (lseek(proc, 0, SEEK_SET) < 0) { goto failure; } if (fstat(proc, &proc_sb) < 0) { goto failure; } /* Since we are suspending threads, we cannot call any libc * functions that might acquire locks. Most notably, we cannot * call malloc(). So, we have to allocate memory on the stack, * instead. Since we do not know how much memory we need, we * make a best guess. And if we guessed incorrectly we retry on * a second iteration (by jumping to "detach_threads"). * * Unless the number of threads is increasing very rapidly, we * should never need to do so, though, as our guestimate is very * conservative. */ if (max_threads < proc_sb.st_nlink + 100) { max_threads = proc_sb.st_nlink + 100; } /* scope */ { pid_t pids[max_threads]; int added_entries = 0; sig_num_threads = num_threads; sig_pids = pids; for (;;) { // lets make sure to align buf to store kernel_dirent64-s properly. int64_t buf[4096 / sizeof(int64_t)]; ssize_t nbytes = syscall(SYS_getdents64, proc, buf, sizeof(buf)); // fprintf(stderr, "nbytes = %zd\n", nbytes); if (nbytes < 0) { goto failure; } if (nbytes == 0) { if (added_entries) { /* Need to keep iterating over "/proc" in multiple * passes until we no longer find any more threads. This * algorithm eventually completes, when all threads have * been suspended. 
*/ added_entries = 0; lseek(proc, 0, SEEK_SET); continue; } break; } const kernel_dirent64 *entry = reinterpret_cast(buf); const kernel_dirent64 *end = BumpDirentPtr(entry, nbytes); for (;entry < end; entry = BumpDirentPtr(entry, entry->d_reclen)) { if (entry->d_ino == 0) { continue; } const char *ptr = entry->d_name; // fprintf(stderr, "name: %s\n", ptr); pid_t pid; /* Some kernels hide threads by preceding the pid with a '.' */ if (*ptr == '.') ptr++; /* If the directory is not numeric, it cannot be a * process/thread */ if (*ptr < '0' || *ptr > '9') continue; pid = local_atoi(ptr); // fprintf(stderr, "pid = %d (%d)\n", pid, getpid()); if (!pid || pid == clone_pid) { continue; } /* Attach (and suspend) all threads */ long i, j; /* Found one of our threads, make sure it is no duplicate */ for (i = 0; i < num_threads; i++) { /* Linear search is slow, but should not matter much for * the typically small number of threads. */ if (pids[i] == pid) { /* Found a duplicate; most likely on second pass */ goto next_entry; } } /* Check whether data structure needs growing */ if (num_threads >= max_threads) { /* Back to square one, this time with more memory */ goto detach_threads; } /* Attaching to thread suspends it */ pids[num_threads++] = pid; sig_num_threads = num_threads; if (ptrace(PTRACE_ATTACH, pid, (void *)0, (void *)0) < 0) { /* If operation failed, ignore thread. Maybe it * just died? There might also be a race * condition with a concurrent core dumper or * with a debugger. In that case, we will just * make a best effort, rather than failing * entirely. */ num_threads--; sig_num_threads = num_threads; goto next_entry; } while (waitpid(pid, (int *)0, __WALL) < 0) { if (errno != EINTR) { ptrace_detach(pid); num_threads--; sig_num_threads = num_threads; goto next_entry; } } if (syscall(SYS_ptrace, PTRACE_PEEKDATA, pid, &i, &j) || i++ != j || syscall(SYS_ptrace, PTRACE_PEEKDATA, pid, &i, &j) || i != j) { /* Address spaces are distinct. 
This is probably * a forked child process rather than a thread. */ ptrace_detach(pid); num_threads--; sig_num_threads = num_threads; goto next_entry; } found_parent |= pid == args->ppid; added_entries++; next_entry:; } // entries iterations loop } // getdents loop /* If we never found the parent process, something is very wrong. * Most likely, we are running in debugger. Any attempt to operate * on the threads would be very incomplete. Let's just report an * error to the caller. */ if (!found_parent) { TCMalloc_ResumeAllProcessThreads(num_threads, pids); return 3; } /* Now we are ready to call the callback, * which takes care of resuming the threads for us. */ args->result = args->callback(args->parameter, num_threads, pids, args->ap); args->err = errno; /* Callback should have resumed threads, but better safe than sorry */ if (TCMalloc_ResumeAllProcessThreads(num_threads, pids)) { /* Callback forgot to resume at least one thread, report error */ args->err = EINVAL; args->result = -1; } return 0; detach_threads: /* Resume all threads prior to retrying the operation */ TCMalloc_ResumeAllProcessThreads(num_threads, pids); sig_pids = NULL; num_threads = 0; sig_num_threads = num_threads; max_threads += 100; } // pids[max_threads] scope } // for (;;) failure: args->result = -1; args->err = errno; return 1; } /* This function gets the list of all linux threads of the current process * passes them to the 'callback' along with the 'parameter' pointer; at the * call back call time all the threads are paused via * PTRACE_ATTACH. * The callback is executed from a separate thread which shares only the * address space, the filesystem, and the filehandles with the caller. Most * notably, it does not share the same pid and ppid; and if it terminates, * the rest of the application is still there. 'callback' is supposed to do * or arrange for TCMalloc_ResumeAllProcessThreads. This happens automatically, if * the thread raises a synchronous signal (e.g. 
SIGSEGV); asynchronous * signals are blocked. If the 'callback' decides to unblock them, it must * ensure that they cannot terminate the application, or that * TCMalloc_ResumeAllProcessThreads will get called. * It is an error for the 'callback' to make any library calls that could * acquire locks. Most notably, this means that most system calls have to * avoid going through libc. Also, this means that it is not legal to call * exit() or abort(). * We return -1 on error and the return value of 'callback' on success. */ int TCMalloc_ListAllProcessThreads(void *parameter, ListAllProcessThreadsCallBack callback, ...) { char altstack_mem[ALT_STACKSIZE]; struct ListerParams args; pid_t clone_pid; int dumpable = 1; int need_sigprocmask = 0; sigset_t sig_blocked, sig_old; int status, rc; SetPTracerSetup ptracer_setup; auto cleanup = MakeSimpleCleanup([&] () { int old_errno = errno; if (need_sigprocmask) { sigprocmask(SIG_SETMASK, &sig_old, nullptr); } if (!dumpable) { prctl(PR_SET_DUMPABLE, dumpable); } errno = old_errno; }); va_start(args.ap, callback); /* If we are short on virtual memory, initializing the alternate stack * might trigger a SIGSEGV. Let's do this early, before it could get us * into more trouble (i.e. before signal handlers try to use the alternate * stack, and before we attach to other threads). */ memset(altstack_mem, 0, sizeof(altstack_mem)); /* Some of our cleanup functions could conceivable use more stack space. * Try to touch the stack right now. This could be defeated by the compiler * being too smart for it's own good, so try really hard. */ DirtyStack(32768); /* Make this process "dumpable". This is necessary in order to ptrace() * after having called setuid(). 
*/ dumpable = prctl(PR_GET_DUMPABLE, 0); if (!dumpable) { prctl(PR_SET_DUMPABLE, 1); } /* Fill in argument block for dumper thread */ args.result = -1; args.err = 0; args.ppid = getpid(); args.altstack_mem = altstack_mem; args.parameter = parameter; args.callback = callback; NO_INTR(args.proc_fd = open("/proc/self/task/", O_RDONLY|O_DIRECTORY|O_CLOEXEC)); UniqueFD proc_closer{args.proc_fd}; if (args.proc_fd < 0) { return -1; } int pipefds[2]; if (pipe2(pipefds, O_CLOEXEC)) { return -1; } UniqueFD pipe_rd_closer{pipefds[0]}; UniqueFD pipe_wr_closer{pipefds[1]}; args.start_pipe_rd = pipefds[0]; args.start_pipe_wr = pipefds[1]; /* Before cloning the thread lister, block all asynchronous signals, as we */ /* are not prepared to handle them. */ sigfillset(&sig_blocked); for (int sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) { sigdelset(&sig_blocked, sync_signals[sig]); } if (sigprocmask(SIG_BLOCK, &sig_blocked, &sig_old)) { return -1; } need_sigprocmask = 1; // make sure all functions used by parent from local_clone to after // waitpid have plt entries fully initialized. We cannot afford // dynamic linker running relocations and messing with errno (see // comment just below) (void)prctl(PR_GET_PDEATHSIG, 0); (void)close(-1); (void)waitpid(INT_MIN, nullptr, 0); /* After cloning, both the parent and the child share the same * instance of errno. We deal with this by being very * careful. Specifically, child immediately calls into sem_wait * which never fails (cannot even EINTR), so doesn't touch errno. * * Parent sets up PR_SET_PTRACER prctl (if it fails, which usually * doesn't happen, we ignore that failure). Then parent does close * on write side of start pipe. After that child runs complex code, * including arbitrary callback. So parent avoids screwing with * errno by immediately calling waitpid with async signals disabled. * * I.e. errno is parent's up until close below. Then errno belongs * to child up until it exits. 
*/ clone_pid = local_clone((int (*)(void *))ListerThread, &args); if (clone_pid < 0) { return -1; } /* Most Linux kernels in the wild have Yama LSM enabled, so * requires us to explicitly give permission for child to ptrace * us. See man 2 ptrace for details. This then requires us to * synchronize with the child (see close on start pipe * below). I.e. so that child doesn't start ptracing before we've * completed this prctl call. */ ptracer_setup.Prepare(clone_pid); /* Closing write side of pipe works like releasing the lock. It * allows the ListerThread to run past read() call on read side of * pipe and ptrace us. */ close(pipe_wr_closer.ReleaseFD()); /* So here child runs (see ListerThread), it finds and ptraces all * threads, runs whatever callback is setup and then * detaches/resumes everything. In any case we wait for child's * completion to gather status and synchronize everything. */ rc = waitpid(clone_pid, &status, __WALL); if (rc < 0) { if (errno == EINTR) { RAW_LOG(FATAL, "BUG: EINTR from waitpid shouldn't be possible!"); } // Any error waiting for child is sign of some bug, so abort // asap. Continuing is unsafe anyways with child potentially writing to our // stack. RAW_LOG(FATAL, "BUG: waitpid inside TCMalloc_ListAllProcessThreads cannot fail, but it did. Raw errno: %d\n", errno); } else if (WIFEXITED(status)) { errno = args.err; switch (WEXITSTATUS(status)) { case 0: break; /* Normal process termination */ case 2: args.err = EFAULT; /* Some fault (e.g. SIGSEGV) detected */ args.result = -1; break; case 3: args.err = EPERM; /* Process is already being traced */ args.result = -1; break; default:args.err = ECHILD; /* Child died unexpectedly */ args.result = -1; break; } } else if (!WIFEXITED(status)) { args.err = EFAULT; /* Terminated due to an unhandled signal*/ args.result = -1; } errno = args.err; return args.result; } /* This function resumes the list of all linux threads that * TCMalloc_ListAllProcessThreads pauses before giving to its callback. 
* The function returns non-zero if at least one thread was
 * suspended and has now been resumed.
 *
 * 'thread_pids' holds 'num_threads' thread ids. They are detached one by
 * one, starting with the last entry of the list.
 */
int TCMalloc_ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
  int any_detached = 0;

  for (int idx = num_threads - 1; idx >= 0; --idx) {
    /* A detach that fails (e.g. the thread was resumed earlier) does
     * not count; keep going with the remaining threads. */
    if (ptrace_detach(thread_pids[idx]) >= 0) {
      any_detached = 1;
    }
  }

  return any_detached;
}