// Software Name : cppuprofile // SPDX-FileCopyrightText: Copyright (c) 2023 Orange // SPDX-License-Identifier: BSD-3-Clause // // This software is distributed under the BSD License; // see the LICENSE file for more details. // // Author: Cédric CHEDALEUX et al. #include "nvidiamonitor.h" #include #include #include #include #include #include #if defined(__linux__) #include #include #endif const string errorMsg = "Failed to monitor nvidia-smi process"; #if defined(__linux__) int read_nvidia_smi_stdout(int fd, string& gpuUsage, string& usedMem, string& totalMem) { string line; while (line.find('\n') == string::npos) { // full line read char buffer[4096]; ssize_t count = read(fd, buffer, sizeof(buffer)); // if child process crashes, we gonna be blocked here forever if (count == -1) { return errno; } else if (count > 0) { // there is something to read line += string(buffer, count); } } // Remove colon to have only spaces and use istringstream auto noSpaceEnd = remove(line.begin(), line.end(), ','); if (noSpaceEnd == line.end()) { // output trace does not have comma so something went wrong with the command return ENODATA; } line.erase(noSpaceEnd, line.end()); std::istringstream ss(line); ss >> gpuUsage >> usedMem >> totalMem; return 0; } #endif uprofile::NvidiaMonitor::NvidiaMonitor() { } uprofile::NvidiaMonitor::~NvidiaMonitor() { stop(); } void uprofile::NvidiaMonitor::start(int period) { watchGPU(period); } void uprofile::NvidiaMonitor::stop() { abortWatchGPU(); } float uprofile::NvidiaMonitor::getUsage() const { std::lock_guard lock(m_mutex); return m_gpuUsage; } void uprofile::NvidiaMonitor::getMemory(int& usedMem, int& totalMem) const { std::lock_guard lock(m_mutex); usedMem = m_usedMem; totalMem = m_totalMem; } void uprofile::NvidiaMonitor::watchGPU(int period) { if (m_watching) { return; } #if defined(__linux__) char* args[5]; args[0] = (char*)"/usr/bin/nvidia-smi"; string period_arg = "-lms=" + to_string(period); // lms stands for continuous watching args[1] = (char*)period_arg.c_str(); args[2] = (char*)"--query-gpu=utilization.gpu,memory.used,memory.total"; args[3] = (char*)"--format=csv,noheader,nounits"; args[4] = NULL; string output; int pipes[2]; // Create the pipe if (pipe(pipes) == -1) { cerr << errorMsg << ": pipe creation failed" << endl; return; } // Create a child process for calling nvidia-smi pid_t pid = fork(); switch (pid) { case -1: /* Error */ cerr << errorMsg << ": process fork failed" << endl; return; case 0: /* We are in the child process */ while ((dup2(pipes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) { } close(pipes[1]); close(pipes[0]); execv(args[0], args); cerr << "Failed to execute '" << args[0] << "': " << strerror(errno) << endl; /* execl doesn't return unless there's an error */ exit(1); default: /* We are in the parent process */ int stdout_fd = pipes[0]; // Start a thread to retrieve the child process stdout m_watching = true; m_watcherThread = unique_ptr(new thread([stdout_fd, pid, this]() { while (watching()) { string gpuUsage, usedMem, totalMem; // if the child process crashes, an error is raised here and threads ends up int err = read_nvidia_smi_stdout(stdout_fd, gpuUsage, usedMem, totalMem); if (err != 0) { cerr << errorMsg << ": read_error = " << strerror(err) << endl; m_mutex.lock(); m_watching = false; m_mutex.unlock(); break; } m_mutex.lock(); m_gpuUsage = !gpuUsage.empty() ? stof(gpuUsage) : 0.f; m_usedMem = !usedMem.empty() ? stoi(usedMem) * 1024 : 0; // MiB to KiB m_totalMem = !totalMem.empty() ? stoi(totalMem) * 1024 : 0; // MiB to KiB m_mutex.unlock(); } })); } #else cerr << errorMsg << endl; #endif } void uprofile::NvidiaMonitor::abortWatchGPU() { #if defined(__linux__) if (m_watcherThread) { m_mutex.lock(); m_watching = false; m_mutex.unlock(); m_watcherThread->join(); m_watcherThread.reset(); } #endif } bool uprofile::NvidiaMonitor::watching() const { std::lock_guard lock(m_mutex); return m_watching; }