#include "../server/tracy_robin_hood.h" #include "TracyProfiler.hpp" #include "TracyThread.hpp" #include "tracy/TracyC.h" #include #include #include #include #include #include #include #include #include #include #define ROCPROFILER_CALL( result, msg ) \ { \ rocprofiler_status_t CHECKSTATUS = result; \ if( CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS ) \ { \ std::string status_msg = rocprofiler_get_status_string( CHECKSTATUS ); \ std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg << " failed with error code " \ << CHECKSTATUS << ": " << status_msg << std::endl; \ std::stringstream errmsg{}; \ errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" << status_msg \ << ")"; \ throw std::runtime_error( errmsg.str() ); \ } \ } namespace { using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; struct DispatchData { int64_t launch_start; int64_t launch_end; uint32_t thread_id; uint16_t query_id; }; struct ToolData { uint32_t version; const char* runtime_version; uint32_t priority; rocprofiler_client_id_t client_id; uint8_t context_id; bool init; uint64_t query_id; int64_t previous_cpu_time; tracy::unordered_map client_kernels; tracy::unordered_map dispatch_data; tracy::unordered_set counter_names = { "SQ_WAVES", "GL2C_MISS", "GL2C_HIT" }; std::unique_ptr cal_thread; std::mutex mut{}; }; using namespace tracy; rocprofiler_context_id_t& get_client_ctx() { static rocprofiler_context_id_t ctx{ 0 }; return ctx; } const char* CTX_NAME = "rocprofv3"; uint8_t gpu_context_allocate( ToolData* data ) { timespec ts; clock_gettime( CLOCK_BOOTTIME, &ts ); uint64_t cpu_timestamp = Profiler::GetTime(); uint64_t gpu_timestamp = ( (uint64_t)ts.tv_sec * 1000000000 ) + ts.tv_nsec; float timestamp_period = 1.0f; data->previous_cpu_time = cpu_timestamp; // Allocate the process-unique GPU context ID. There's a max of 255 available; // if we are recreating devices a lot we may exceed that. Don't do that, or // wrap around and get weird (but probably still usable) numbers. uint8_t context_id = tracy::GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ); if( context_id >= 255 ) { context_id %= 255; } uint8_t context_flags = 0; #ifdef TRACY_ROCPROF_CALIBRATION // Tell tracy we'll be passing calibrated timestamps and not to mess with // the times. We'll periodically send GpuCalibration events in case the // times drift. context_flags |= tracy::GpuContextCalibration; #endif { auto* item = tracy::Profiler::QueueSerial(); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuNewContext ); tracy::MemWrite( &item->gpuNewContext.cpuTime, cpu_timestamp ); tracy::MemWrite( &item->gpuNewContext.gpuTime, gpu_timestamp ); memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); tracy::MemWrite( &item->gpuNewContext.period, timestamp_period ); tracy::MemWrite( &item->gpuNewContext.context, context_id ); tracy::MemWrite( &item->gpuNewContext.flags, context_flags ); tracy::MemWrite( &item->gpuNewContext.type, tracy::GpuContextType::Rocprof ); tracy::Profiler::QueueSerialFinish(); } // Send the name of the context along. // NOTE: Tracy will unconditionally free the name so we must clone it here. // Since internally Tracy will use its own rpmalloc implementation we must // make sure we allocate from the same source. 
    size_t name_length = strlen( CTX_NAME );
    char* cloned_name = (char*)tracy::tracy_malloc( name_length );
    memcpy( cloned_name, CTX_NAME, name_length );
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuContextName );
        tracy::MemWrite( &item->gpuContextNameFat.context, context_id );
        tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)cloned_name );
        tracy::MemWrite( &item->gpuContextNameFat.size, name_length );
        tracy::Profiler::QueueSerialFinish();
    }

    return context_id;
}

uint64_t kernel_src_loc( ToolData* data, uint64_t kernel_id )
{
    uint64_t src_loc = 0;
    auto _lk = std::unique_lock{ data->mut };
    rocprofiler_kernel_id_t kid = kernel_id;
    if( data->client_kernels.count( kid ) )
    {
        auto& sym_data = data->client_kernels[kid];
        const char* name = sym_data.kernel_name;
        size_t name_len = strlen( name );
        uint32_t line = 0;
        src_loc = tracy::Profiler::AllocSourceLocation( line, NULL, 0, name, name_len, NULL, 0 );
    }
    return src_loc;
}

void record_interval( ToolData* data, rocprofiler_timestamp_t start_timestamp, rocprofiler_timestamp_t end_timestamp,
                      uint64_t src_loc, rocprofiler_dispatch_id_t dispatch_id )
{
    uint16_t query_id = 0;
    uint8_t context_id = data->context_id;
    {
        auto _lk = std::unique_lock{ data->mut };
        query_id = data->query_id;
        data->query_id++;
        if( dispatch_id != UINT64_MAX )
        {
            DispatchData& dispatch_data = data->dispatch_data[dispatch_id];
            dispatch_data.query_id = query_id;
            dispatch_data.thread_id = tracy::GetThreadHandle();
        }
    }

    uint64_t cpu_start_time = 0, cpu_end_time = 0;
    if( dispatch_id == UINT64_MAX )
    {
        cpu_start_time = tracy::Profiler::GetTime();
        cpu_end_time = tracy::Profiler::GetTime();
    }
    else
    {
        auto _lk = std::unique_lock{ data->mut };
        DispatchData& dispatch_data = data->dispatch_data[dispatch_id];
        cpu_start_time = dispatch_data.launch_start;
        cpu_end_time = dispatch_data.launch_end;
    }

    if( src_loc != 0 )
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial );
        tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time );
        tracy::MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)src_loc );
        tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
        tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id );
        tracy::MemWrite( &item->gpuZoneBegin.context, context_id );
        tracy::Profiler::QueueSerialFinish();
    }
    else
    {
        static const ___tracy_source_location_data src_loc = { NULL, NULL, NULL, 0, 0 };
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginSerial );
        tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time );
        tracy::MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)&src_loc );
        tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
        tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id );
        tracy::MemWrite( &item->gpuZoneBegin.context, context_id );
        tracy::Profiler::QueueSerialFinish();
    }

    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime );
        tracy::MemWrite( &item->gpuTime.gpuTime, start_timestamp );
        tracy::MemWrite( &item->gpuTime.queryId, query_id );
        tracy::MemWrite( &item->gpuTime.context, context_id );
        tracy::Profiler::QueueSerialFinish();
    }
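    // Close the zone. As with the begin event, the CPU-side time goes into the zone-end event
    // itself, while the GPU end timestamp follows in a separate GpuTime event that is matched
    // up by query id and context.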
&item->gpuZoneEnd.queryId, query_id ); tracy::MemWrite( &item->gpuZoneEnd.context, context_id ); tracy::Profiler::QueueSerialFinish(); } { auto* item = tracy::Profiler::QueueSerial(); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime ); tracy::MemWrite( &item->gpuTime.gpuTime, end_timestamp ); tracy::MemWrite( &item->gpuTime.queryId, query_id ); tracy::MemWrite( &item->gpuTime.context, context_id ); tracy::Profiler::QueueSerialFinish(); } } void record_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data, rocprofiler_record_counter_t* record_data, size_t record_count, rocprofiler_user_data_t /*user_data*/, void* callback_data ) { assert( callback_data != nullptr ); ToolData* data = static_cast( callback_data ); if( !data->init ) return; std::unordered_map sums; for( size_t i = 0; i < record_count; ++i ) { auto _counter_id = rocprofiler_counter_id_t{}; ROCPROFILER_CALL( rocprofiler_query_record_counter_id( record_data[i].id, &_counter_id ), "query record counter id" ); sums[_counter_id.handle] += record_data[i].counter_value; } uint16_t query_id = 0; uint32_t thread_id = 0; { auto _lk = std::unique_lock{ data->mut }; // An assumption is made here that the counter values are supplied after the dispatch // complete callback. assert( data->dispatch_data.count( dispatch_data.dispatch_info.dispatch_id ) ); DispatchData& ddata = data->dispatch_data[dispatch_data.dispatch_info.dispatch_id]; query_id = ddata.query_id; thread_id = ddata.thread_id; } for( auto& p : sums ) { auto* item = tracy::Profiler::QueueSerial(); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneAnnotation ); tracy::MemWrite( &item->zoneAnnotation.noteId, p.first ); tracy::MemWrite( &item->zoneAnnotation.queryId, query_id ); tracy::MemWrite( &item->zoneAnnotation.thread, thread_id ); tracy::MemWrite( &item->zoneAnnotation.value, p.second ); tracy::MemWrite( &item->zoneAnnotation.context, data->context_id ); tracy::Profiler::QueueSerialFinish(); } } /** * Callback from rocprofiler when an kernel dispatch is enqueued into the HSA queue. * rocprofiler_counter_config_id_t* is a return to specify what counters to collect * for this dispatch (dispatch_packet). */ void dispatch_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data, rocprofiler_profile_config_id_t* config, rocprofiler_user_data_t* /*user_data*/, void* callback_data ) { assert( callback_data != nullptr ); ToolData* data = static_cast( callback_data ); if( !data->init ) return; /** * This simple example uses the same profile counter set for all agents. * We store this in a cache to prevent constructing many identical profile counter * sets. We first check the cache to see if we have already constructed a counter" * set for the agent. If we have, return it. Otherwise, construct a new profile counter * set. 
    static std::shared_mutex m_mutex = {};
    static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};

    auto search_cache = [&]() {
        if( auto pos = profile_cache.find( dispatch_data.dispatch_info.agent_id.handle ); pos != profile_cache.end() )
        {
            *config = pos->second;
            return true;
        }
        return false;
    };

    {
        auto rlock = std::shared_lock{ m_mutex };
        if( search_cache() ) return;
    }

    auto wlock = std::unique_lock{ m_mutex };
    if( search_cache() ) return;

    // GPU Counter IDs
    std::vector<rocprofiler_counter_id_t> gpu_counters;

    // Iterate through the agents and get the counters available on that agent
    ROCPROFILER_CALL( rocprofiler_iterate_agent_supported_counters(
                          dispatch_data.dispatch_info.agent_id,
                          []( rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters,
                              void* user_data ) {
                              auto* vec = static_cast<std::vector<rocprofiler_counter_id_t>*>( user_data );
                              for( size_t i = 0; i < num_counters; i++ )
                              {
                                  vec->push_back( counters[i] );
                              }
                              return ROCPROFILER_STATUS_SUCCESS;
                          },
                          static_cast<void*>( &gpu_counters ) ),
                      "Could not fetch supported counters" );

    std::vector<rocprofiler_counter_id_t> collect_counters;
    collect_counters.reserve( data->counter_names.size() );

    // Look for the counters contained in counters_to_collect in gpu_counters
    for( auto& counter : gpu_counters )
    {
        rocprofiler_counter_info_v0_t info;
        ROCPROFILER_CALL(
            rocprofiler_query_counter_info( counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>( &info ) ),
            "Could not query info" );
        if( data->counter_names.count( std::string( info.name ) ) > 0 )
        {
            collect_counters.push_back( counter );

            size_t name_length = strlen( info.name );
            char* cloned_name = (char*)tracy::tracy_malloc( name_length );
            memcpy( cloned_name, info.name, name_length );
            {
                auto* item = tracy::Profiler::QueueSerial();
                tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuAnnotationName );
                tracy::MemWrite( &item->gpuAnnotationNameFat.context, data->context_id );
                tracy::MemWrite( &item->gpuAnnotationNameFat.noteId, counter.handle );
                tracy::MemWrite( &item->gpuAnnotationNameFat.ptr, (uint64_t)cloned_name );
                tracy::MemWrite( &item->gpuAnnotationNameFat.size, name_length );
                tracy::Profiler::QueueSerialFinish();
            }
        }
    }

    // Create a collection profile for the counters
    rocprofiler_profile_config_id_t profile = { .handle = 0 };
    ROCPROFILER_CALL( rocprofiler_create_profile_config( dispatch_data.dispatch_info.agent_id, collect_counters.data(),
                                                         collect_counters.size(), &profile ),
                      "Could not construct profile cfg" );
    profile_cache.emplace( dispatch_data.dispatch_info.agent_id.handle, profile );

    // Return the profile to collect those counters for this dispatch
    *config = profile;
}

void tool_callback_tracing_callback( rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data,
                                     void* callback_data )
{
    assert( callback_data != nullptr );
    ToolData* data = static_cast<ToolData*>( callback_data );
    if( !data->init ) return;

    if( record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
        record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER )
    {
        auto* sym_data = static_cast<kernel_symbol_data_t*>( record.payload );
        if( record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD )
        {
            auto _lk = std::unique_lock{ data->mut };
            data->client_kernels.emplace( sym_data->kernel_id, *sym_data );
        }
        else if( record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD )
        {
            auto _lk = std::unique_lock{ data->mut };
            data->client_kernels.erase( sym_data->kernel_id );
        }
    }
    else if( record.kind == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH )
    {
        auto* rdata = static_cast<rocprofiler_callback_tracing_kernel_dispatch_data_t*>( record.payload );
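        // ENQUEUE enter/exit capture the CPU-side launch window; the GPU start/end timestamps
        // arrive later with the COMPLETE record and are paired with it via the dispatch id.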
        if( record.operation == ROCPROFILER_KERNEL_DISPATCH_ENQUEUE )
        {
            if( record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER )
            {
                auto _lk = std::unique_lock{ data->mut };
                data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_start = tracy::Profiler::GetTime();
            }
            else if( record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT )
            {
                auto _lk = std::unique_lock{ data->mut };
                data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_end = tracy::Profiler::GetTime();
            }
        }
        else if( record.operation == ROCPROFILER_KERNEL_DISPATCH_COMPLETE )
        {
            uint64_t src_loc = kernel_src_loc( data, rdata->dispatch_info.kernel_id );
            record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc,
                             rdata->dispatch_info.dispatch_id );
        }
    }
    else if( record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY && record.operation != ROCPROFILER_MEMORY_COPY_NONE &&
             record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT )
    {
        auto* rdata = static_cast<rocprofiler_callback_tracing_memory_copy_data_t*>( record.payload );
        const char* name = nullptr;
        switch( record.operation )
        {
        case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE:
            name = "DeviceToDeviceCopy";
            break;
        case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST:
            name = "DeviceToHostCopy";
            break;
        case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE:
            name = "HostToDeviceCopy";
            break;
        case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST:
            name = "HostToHostCopy";
            break;
        }
        size_t name_len = strlen( name );
        uint64_t src_loc = tracy::Profiler::AllocSourceLocation( 0, NULL, 0, name, name_len, NULL, 0 );
        record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc, UINT64_MAX );
    }
}

void calibration_thread( void* ptr )
{
    while( !TracyIsStarted )
        ;

    ToolData* data = static_cast<ToolData*>( ptr );
    data->context_id = gpu_context_allocate( data );

    const char* user_counters = GetEnvVar( "TRACY_ROCPROF_COUNTERS" );
    if( user_counters )
    {
        data->counter_names.clear();
        std::stringstream ss( user_counters );
        std::string counter;
        while( std::getline( ss, counter, ',' ) ) data->counter_names.insert( counter );
    }

    data->init = true;

#ifdef TRACY_ROCPROF_CALIBRATION
    while( data->init )
    {
        sleep( 1 );
        timespec ts;
        // HSA performs a linear interpolation of GPU time to CLOCK_BOOTTIME. However, this is
        // subject to network time updates and can drift relative to tracy's clock.
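        // Sample both clocks back to back and emit a GpuCalibration event so the server can
        // correct for that drift. Samples where the CPU clock has not advanced are skipped,
        // keeping cpuDelta strictly positive.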
        clock_gettime( CLOCK_BOOTTIME, &ts );
        int64_t cpu_timestamp = Profiler::GetTime();
        int64_t gpu_timestamp = ( (int64_t)ts.tv_sec * 1000000000 ) + ts.tv_nsec;
        if( cpu_timestamp > data->previous_cpu_time )
        {
            auto* item = tracy::Profiler::QueueSerial();
            tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration );
            tracy::MemWrite( &item->gpuCalibration.gpuTime, gpu_timestamp );
            tracy::MemWrite( &item->gpuCalibration.cpuTime, cpu_timestamp );
            tracy::MemWrite( &item->gpuCalibration.cpuDelta, cpu_timestamp - data->previous_cpu_time );
            tracy::MemWrite( &item->gpuCalibration.context, data->context_id );
            tracy::Profiler::QueueSerialFinish();
            data->previous_cpu_time = cpu_timestamp;
        }
    }
#endif
}

int tool_init( rocprofiler_client_finalize_t fini_func, void* user_data )
{
    ToolData* data = static_cast<ToolData*>( user_data );

    data->cal_thread = std::make_unique<tracy::Thread>( calibration_thread, data );

    ROCPROFILER_CALL( rocprofiler_create_context( &get_client_ctx() ), "context creation failed" );

    ROCPROFILER_CALL( rocprofiler_configure_callback_dispatch_counting_service( get_client_ctx(), dispatch_callback,
                                                                                user_data, record_callback, user_data ),
                      "Could not setup counting service" );

    rocprofiler_tracing_operation_t ops[] = { ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER };
    ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(),
                                                                      ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ops, 1,
                                                                      tool_callback_tracing_callback, user_data ),
                      "callback tracing service failed to configure" );

    rocprofiler_tracing_operation_t ops2[] = { ROCPROFILER_KERNEL_DISPATCH_COMPLETE, ROCPROFILER_KERNEL_DISPATCH_ENQUEUE };
    ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(),
                                                                      ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ops2, 2,
                                                                      tool_callback_tracing_callback, user_data ),
                      "callback tracing service failed to configure" );

    ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(),
                                                                      ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, nullptr, 0,
                                                                      tool_callback_tracing_callback, user_data ),
                      "callback tracing service failed to configure" );

    ROCPROFILER_CALL( rocprofiler_start_context( get_client_ctx() ), "start context" );

    return 0;
}

void tool_fini( void* tool_data_v )
{
    rocprofiler_stop_context( get_client_ctx() );

    ToolData* data = static_cast<ToolData*>( tool_data_v );
    data->init = false;
    data->cal_thread.reset();
}

}

extern "C" {

rocprofiler_tool_configure_result_t* rocprofiler_configure( uint32_t version, const char* runtime_version, uint32_t priority,
                                                            rocprofiler_client_id_t* client_id )
{
    // If not the first tool to register, indicate that the tool doesn't want to do anything
    if( priority > 0 ) return nullptr;

    // (optional) Provide a name for this tool to rocprofiler
    client_id->name = "Tracy";

    // (optional) create configure data
    static ToolData data = ToolData{ version, runtime_version, priority, *client_id, 0, false, 0, 0 };

    // construct configure result
    static auto cfg = rocprofiler_tool_configure_result_t{ sizeof( rocprofiler_tool_configure_result_t ), &tool_init,
                                                           &tool_fini, static_cast<void*>( &data ) };

    return &cfg;
}

}