/** Lightweight profiler library for c++ Copyright(C) 2016-2017 Sergey Yagovtsev, Victor Zarubkin Licensed under either of * MIT license (LICENSE.MIT or http://opensource.org/licenses/MIT) * Apache License, Version 2.0, (LICENSE.APACHE or http://www.apache.org/licenses/LICENSE-2.0) at your option. The MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. The Apache License, Version 2.0 (the "License"); You may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. **/ #ifndef EASY_PROFILER_MANAGER_H #define EASY_PROFILER_MANAGER_H #include #include #include "spin_lock.h" #include "outstream.h" #include "hashed_cstr.h" #include #include #include #include #include #include ////////////////////////////////////////////////////////////////////////// #ifdef _WIN32 #include #elif defined(__APPLE__) #include #include #else #include #include #include #include #include #endif #ifdef max #undef max #endif inline profiler::thread_id_t getCurrentThreadId() { #ifdef _WIN32 return (profiler::thread_id_t)::GetCurrentThreadId(); #elif defined(__APPLE__) # if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_6) || \ (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0) EASY_THREAD_LOCAL static uint64_t _id = 0; if (!_id) pthread_threadid_np(NULL, &_id); return (profiler::thread_id_t)_id; # else return (profiler::thread_id_t)pthread_self(); # endif #else EASY_THREAD_LOCAL static const profiler::thread_id_t _id = (profiler::thread_id_t)syscall(__NR_gettid); return _id; #endif } namespace profiler { class SerializedBlock; struct do_not_calc_hash { template inline size_t operator()(T _value) const { return static_cast(_value); } }; } ////////////////////////////////////////////////////////////////////////// #ifndef EASY_ENABLE_BLOCK_STATUS # define EASY_ENABLE_BLOCK_STATUS 1 #endif #ifndef EASY_ENABLE_ALIGNMENT # define EASY_ENABLE_ALIGNMENT 0 #endif #ifndef EASY_ALIGNMENT_SIZE # define EASY_ALIGNMENT_SIZE 64 #endif #if EASY_ENABLE_ALIGNMENT == 0 # define EASY_ALIGNED(TYPE, VAR, A) TYPE VAR # define EASY_MALLOC(MEMSIZE, A) malloc(MEMSIZE) # define EASY_FREE(MEMPTR) free(MEMPTR) #else # if defined(_MSC_VER) # define EASY_ALIGNED(TYPE, VAR, A) __declspec(align(A)) TYPE VAR # define EASY_MALLOC(MEMSIZE, A) _aligned_malloc(MEMSIZE, A) # define EASY_FREE(MEMPTR) _aligned_free(MEMPTR) # elif defined(__GNUC__) # define EASY_ALIGNED(TYPE, VAR, A) TYPE VAR __attribute__(aligned(A)) # else # define EASY_ALIGNED(TYPE, VAR, A) TYPE VAR # endif #endif template class chunk_allocator { struct chunk { EASY_ALIGNED(int8_t, data[N], EASY_ALIGNMENT_SIZE); chunk* prev = nullptr; }; struct chunk_list { chunk* last = nullptr; ~chunk_list() { clear(); } void clear() { do { auto p = last; last = last->prev; EASY_FREE(p); } while (last != nullptr); } chunk& back() { return *last; } void emplace_back() { auto prev = last; last = ::new (EASY_MALLOC(sizeof(chunk), EASY_ALIGNMENT_SIZE)) chunk(); last->prev = prev; *(uint16_t*)last->data = 0; } /** Invert current chunks list to enable to iterate over chunks list in direct order. This method is used by serialize(). */ void invert() { chunk* next = nullptr; while (last->prev != nullptr) { auto p = last->prev; last->prev = next; next = last; last = p; } last->prev = next; } }; //typedef std::list chunk_list; chunk_list m_chunks; uint32_t m_size; uint16_t m_shift; public: chunk_allocator() : m_size(0), m_shift(0) { m_chunks.emplace_back(); } /** Allocate n bytes. Automatically checks if there is enough preserved memory to store additional n bytes and allocates additional buffer if needed. */ void* allocate(uint16_t n) { ++m_size; if (!need_expand(n)) { int8_t* data = m_chunks.back().data + m_shift; m_shift += n + sizeof(uint16_t); *(uint16_t*)data = n; data = data + sizeof(uint16_t); if (m_shift + 1 < N) *(uint16_t*)(data + n) = 0; return data; } m_shift = n + sizeof(uint16_t); m_chunks.emplace_back(); auto data = m_chunks.back().data; *(uint16_t*)data = n; data = data + sizeof(uint16_t); *(uint16_t*)(data + n) = 0; return data; } /** Check if current storage is not enough to store additional n bytes. */ inline bool need_expand(uint16_t n) const { return (m_shift + n + sizeof(uint16_t)) > N; } inline uint32_t size() const { return m_size; } inline bool empty() const { return m_size == 0; } void clear() { m_size = 0; m_shift = 0; m_chunks.clear(); m_chunks.emplace_back(); } /** Serialize data to stream. \warning Data will be cleared after serialization. */ void serialize(profiler::OStream& _outputStream) { // Chunks are stored in reversed order (stack). // To be able to iterate them in direct order we have to invert chunks list. m_chunks.invert(); // Iterate over chunks and perform blocks serialization auto current = m_chunks.last; do { const int8_t* data = current->data; uint16_t i = 0; while (i + 1 < N && *(uint16_t*)data != 0) { const uint16_t size = sizeof(uint16_t) + *(uint16_t*)data; _outputStream.write((const char*)data, size); data = data + size; i += size; } current = current->prev; } while (current != nullptr); clear(); } }; // END of class chunk_allocator. ////////////////////////////////////////////////////////////////////////// class NonscopedBlock : public profiler::Block { std::string m_runtimeName; ///< a copy of _runtimeName to make it safe to begin block in one function and end it in another NonscopedBlock() = delete; NonscopedBlock(const NonscopedBlock&) = delete; NonscopedBlock(NonscopedBlock&&) = delete; NonscopedBlock& operator = (const NonscopedBlock&) = delete; NonscopedBlock& operator = (NonscopedBlock&&) = delete; public: NonscopedBlock(const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName, bool = false); ~NonscopedBlock(); /** Copy string from m_name to m_runtimeName to make it safe to end block in another function. Performs any work if block is ON and m_name != "" */ void copyname(); void destroy(); }; // END of class NonscopedBlock. ////////////////////////////////////////////////////////////////////////// template inline void destroy_elem(T*) { } inline void destroy_elem(NonscopedBlock* _elem) { _elem->destroy(); } template class StackBuffer { struct chunk { int8_t data[sizeof(T)]; }; std::list m_overflow; ///< List of additional stack elements if current capacity of buffer is not enough T* m_buffer; ///< Contiguous buffer used for stack uint32_t m_size; ///< Current size of stack uint32_t m_capacity; ///< Current capacity of m_buffer uint32_t m_maxcapacity; ///< Maximum used capacity including m_buffer and m_overflow public: StackBuffer(uint32_t N) : m_buffer((T*)malloc(N * sizeof(T))), m_size(0), m_capacity(N), m_maxcapacity(N) { } ~StackBuffer() { for (uint32_t i = 0; i < m_size; ++i) destroy_elem(m_buffer + i); free(m_buffer); for (auto& elem : m_overflow) destroy_elem(reinterpret_cast(elem.data + 0)); } template T& push(TArgs ... _args) { if (m_size < m_capacity) return *(::new (m_buffer + m_size++) T(_args...)); m_overflow.emplace_back(); const uint32_t cap = m_capacity + (uint32_t)m_overflow.size(); if (m_maxcapacity < cap) m_maxcapacity = cap; return *(::new (m_overflow.back().data + 0) T(_args...)); } void pop() { if (m_overflow.empty()) { // m_size should not be equal to 0 here because ProfileManager behavior does not allow such situation destroy_elem(m_buffer + --m_size); if (m_size == 0 && m_maxcapacity > m_capacity) { // When stack gone empty we can resize buffer to use enough space in the future free(m_buffer); m_maxcapacity = m_capacity = std::max(m_maxcapacity, m_capacity << 1); m_buffer = (T*)malloc(m_capacity * sizeof(T)); } return; } destroy_elem(reinterpret_cast(m_overflow.back().data + 0)); m_overflow.pop_back(); } }; // END of class StackBuffer. ////////////////////////////////////////////////////////////////////////// template struct BlocksList { BlocksList() = default; std::vector openedList; chunk_allocator closedList; uint64_t usedMemorySize = 0; void clearClosed() { //closedList.clear(); usedMemorySize = 0; } }; // END of struct BlocksList. ////////////////////////////////////////////////////////////////////////// class CSwitchBlock : public ::profiler::Block { ::profiler::thread_id_t m_thread_id; public: CSwitchBlock(::profiler::timestamp_t _begin_time, ::profiler::thread_id_t _tid, const char* _runtimeName); CSwitchBlock(CSwitchBlock&& _that); inline ::profiler::thread_id_t tid() const { return m_thread_id; } private: CSwitchBlock(const CSwitchBlock&) = delete; CSwitchBlock& operator = (const CSwitchBlock&) = delete; }; ////////////////////////////////////////////////////////////////////////// const uint16_t SIZEOF_BLOCK = sizeof(profiler::BaseBlockData) + 1 + sizeof(uint16_t); // SerializedBlock stores BaseBlockData + at least 1 character for name ('\0') + 2 bytes for size of serialized data const uint16_t SIZEOF_CSWITCH = SIZEOF_BLOCK + 4; // SerializedCSwitch also stores additional 4 bytes to be able to save 64-bit thread_id struct ThreadStorage { StackBuffer nonscopedBlocks; BlocksList, SIZEOF_BLOCK * (uint16_t)128U> blocks; BlocksList sync; std::string name; ///< Thread name #ifndef _WIN32 const pthread_t pthread_id; ///< Thread pointer #endif const profiler::thread_id_t id; ///< Thread ID std::atomic expired; ///< Is thread expired std::atomic_bool frame; ///< Is new frame opened bool allowChildren; ///< False if one of previously opened blocks has OFF_RECURSIVE or ON_WITHOUT_CHILDREN status bool named; ///< True if thread name was set bool guarded; ///< True if thread has been registered using ThreadGuard void storeBlock(const profiler::Block& _block); void storeCSwitch(const CSwitchBlock& _block); void clearClosed(); void popSilent(); ThreadStorage(); }; // END of struct ThreadStorage. ////////////////////////////////////////////////////////////////////////// typedef uint64_t processid_t; class BlockDescriptor; class ProfileManager { #ifndef EASY_MAGIC_STATIC_CPP11 friend class ProfileManagerInstance; #endif ProfileManager(); ProfileManager(const ProfileManager& p) = delete; ProfileManager& operator=(const ProfileManager&) = delete; typedef profiler::guard_lock guard_lock_t; typedef std::map map_of_threads_stacks; typedef std::vector block_descriptors_t; #ifdef EASY_PROFILER_HASHED_CSTR_DEFINED typedef std::unordered_map descriptors_map_t; #else typedef std::unordered_map descriptors_map_t; #endif const processid_t m_processId; map_of_threads_stacks m_threads; block_descriptors_t m_descriptors; descriptors_map_t m_descriptorsMap; uint64_t m_usedMemorySize; profiler::timestamp_t m_beginTime; profiler::timestamp_t m_endTime; std::atomic m_frameMax; std::atomic m_frameAvg; std::atomic m_frameCur; profiler::spin_lock m_spin; profiler::spin_lock m_storedSpin; profiler::spin_lock m_dumpSpin; std::atomic m_mainThreadId; std::atomic m_profilerStatus; std::atomic_bool m_isEventTracingEnabled; std::atomic_bool m_isAlreadyListening; std::atomic_bool m_frameMaxReset; std::atomic_bool m_frameAvgReset; std::string m_csInfoFilename = "/tmp/cs_profiling_info.log"; uint32_t dumpBlocksToStream(profiler::OStream& _outputStream, bool _lockSpin); void setBlockStatus(profiler::block_id_t _id, profiler::EasyBlockStatus _status); std::thread m_listenThread; void listen(uint16_t _port); std::atomic_bool m_stopListen; public: static ProfileManager& instance(); ~ProfileManager(); const profiler::BaseBlockDescriptor* addBlockDescriptor(profiler::EasyBlockStatus _defaultStatus, const char* _autogenUniqueId, const char* _name, const char* _filename, int _line, profiler::block_type_t _block_type, profiler::color_t _color, bool _copyName = false); bool storeBlock(const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName); bool storeBlock(const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName, profiler::timestamp_t _beginTime, profiler::timestamp_t _endTime); void beginBlock(profiler::Block& _block); void beginNonScopedBlock(const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName); void endBlock(); profiler::timestamp_t maxFrameDuration(); profiler::timestamp_t avgFrameDuration(); profiler::timestamp_t curFrameDuration() const; void setEnabled(bool isEnable); bool isEnabled() const; void setEventTracingEnabled(bool _isEnable); bool isEventTracingEnabled() const; uint32_t dumpBlocksToFile(const char* filename); const char* registerThread(const char* name, profiler::ThreadGuard& threadGuard); const char* registerThread(const char* name); void setContextSwitchLogFilename(const char* name) { m_csInfoFilename = name; } const char* getContextSwitchLogFilename() const { return m_csInfoFilename.c_str(); } void beginContextSwitch(profiler::thread_id_t _thread_id, profiler::timestamp_t _time, profiler::thread_id_t _target_thread_id, const char* _target_process, bool _lockSpin = true); void endContextSwitch(profiler::thread_id_t _thread_id, processid_t _process_id, profiler::timestamp_t _endtime, bool _lockSpin = true); void startListen(uint16_t _port); void stopListen(); bool isListening() const; private: void beginFrame(); void endFrame(); void enableEventTracer(); void disableEventTracer(); char checkThreadExpired(ThreadStorage& _registeredThread); void storeBlockForce(const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName, ::profiler::timestamp_t& _timestamp); void storeBlockForce2(const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName, ::profiler::timestamp_t _timestamp); void storeBlockForce2(ThreadStorage& _registeredThread, const profiler::BaseBlockDescriptor* _desc, const char* _runtimeName, ::profiler::timestamp_t _timestamp); ThreadStorage& _threadStorage(profiler::thread_id_t _thread_id); ThreadStorage* _findThreadStorage(profiler::thread_id_t _thread_id); inline ThreadStorage& threadStorage(profiler::thread_id_t _thread_id) { guard_lock_t lock(m_spin); return _threadStorage(_thread_id); } inline ThreadStorage* findThreadStorage(profiler::thread_id_t _thread_id) { guard_lock_t lock(m_spin); return _findThreadStorage(_thread_id); } }; // END of class ProfileManager. ////////////////////////////////////////////////////////////////////////// #endif // EASY_PROFILER_MANAGER_H