//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "xray_profiling_flags.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;

struct ThreadTrie {
  tid_t TId;
  FunctionCallTrie *Trie;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

// These need to be pointers that point to heap/internal-allocator-allocated
// objects because these are accessed even at program exit.
Vector<ThreadTrie> *ThreadTries = nullptr;
Vector<ProfileBuffer> *ProfileBuffers = nullptr;
FunctionCallTrie::Allocators *GlobalAllocators = nullptr;

} // namespace

void post(const FunctionCallTrie &T, tid_t TId) {
  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  pthread_once(&Once, +[] {
    SpinMutexLock Lock(&GlobalMutex);
    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
    new (GlobalAllocators) FunctionCallTrie::Allocators();
    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
        profilingFlags()->global_allocator_max);
    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
        InternalAlloc(sizeof(Vector<ThreadTrie>)));
    new (ThreadTries) Vector<ThreadTrie>();
    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
    new (ProfileBuffers) Vector<ProfileBuffer>();
  });
  DCHECK_NE(GlobalAllocators, nullptr);
  DCHECK_NE(ThreadTries, nullptr);
  DCHECK_NE(ProfileBuffers, nullptr);

  ThreadTrie *Item = nullptr;
  {
    SpinMutexLock Lock(&GlobalMutex);
    if (GlobalAllocators == nullptr)
      return;

    Item = ThreadTries->PushBack();
    Item->TId = TId;

    // Here we're using the internal allocator instead of the managed allocator
    // because:
    //
    // 1) We're not using the segmented array data structure to host
    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
    //    which works like a std::vector<...>, keeping elements contiguous in
    //    memory. The segmented array data structure assumes that elements are
    //    trivially destructible, which FunctionCallTrie isn't.
    //
    // 2) Using a managed allocator means we need to manage that separately,
    //    which complicates this code. To get around that, we use the internal
    //    allocator instead, which has its own global state and is decoupled
    //    from the lifetime management required by the managed allocators we
    //    have in XRay.
    //
    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
    DCHECK_NE(Item->Trie, nullptr);
    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
  }

  T.deepCopyInto(*Item->Trie);
}

// A PathArray represents the function ids of a stack trace. In this context a
// path is almost always represented from the leaf function in a call stack to
// a root of the call trie.
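//
// For illustration only (the function ids below are made up): if a thread's
// call stack is main (FId 1) -> f (FId 2) -> g (FId 3), then the record for
// g's node carries the path {3, 2, 1}, i.e. the leaf's id first and the
// root's id last, matching the order in which populateRecords() below walks
// the Node->Parent chain.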
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function ids from the leaf to the root of
  // the function call stack as represented from a FunctionCallTrie.
  PathArray *Path = nullptr;
  const FunctionCallTrie::Node *Node = nullptr;

  // Constructor for in-place construction.
  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
      : Path([&] {
          auto P =
              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
          new (P) PathArray(A);
          return P;
        }()),
        Node(N) {}
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to
// generate the path(s) and the data associated with each path.
static void populateRecords(ProfileRecordArray &PRs,
                            ProfileRecord::PathAllocator &PA,
                            const FunctionCallTrie &Trie) {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto Node = DFSStack.back();
      DFSStack.trim(1);
      auto Record = PRs.AppendEmplace(PA, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and, as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path->Append(N->FId);
      DCHECK(!Record->Path->empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords) {
  auto NextPtr = static_cast<char *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : *Record.Path)
      NextPtr =
          static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<char *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
                                            sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<char *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
}

} // namespace
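// Illustration only, not a normative format description: a block produced by
// serializeRecords() above consists of a 16-byte BlockHeader (u32 BlockSize,
// u32 BlockNum, u64 ThreadId) followed by one entry per ProfileRecord, where
// each entry is the path's function ids (4 bytes each, leaf to root), a
// 4-byte zero sentinel, then the node's call count and cumulative local time
// (8 bytes each). For example, a record whose path is {3, 2, 1} occupies
// 3 * 4 + 4 + 8 + 8 = 32 bytes.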
void serialize() {
  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers.
  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
    InternalFree((*ProfileBuffers)[I].Data);
  ProfileBuffers->Reset();

  if (ThreadTries->Size() == 0)
    return;

  // Then repopulate the global ProfileBuffers.
  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We
    // use a local allocator and an __xray::Array<...> to store the
    // intermediary data, computing the size as we go along. Then we allocate
    // the contiguous space to contain the thread buffer data.
    const auto &Trie = *(*ThreadTries)[I].Trie;
    if (Trie.getRoots().empty())
      continue;
    populateRecords(ProfileRecords, PathAlloc, Trie);
    DCHECK(!Trie.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    //
    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path->size());

    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
    auto Buffer = ProfileBuffers->PushBack();
    Buffer->Size = sizeof(Header) + CumulativeSizes;
    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
    DCHECK_NE(Buffer->Data, nullptr);
    serializeRecords(Buffer, Header, ProfileRecords);

    // Now clean up the ProfileRecords array, one at a time.
    for (auto &Record : ProfileRecords) {
      Record.Path->~PathArray();
      InternalFree(Record.Path);
    }
  }
}

void reset() {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
      InternalFree((*ProfileBuffers)[I].Data);
    ProfileBuffers->Reset();
    InternalFree(ProfileBuffers);
    ProfileBuffers = nullptr;
  }

  if (ThreadTries != nullptr) {
    // Clear out the function call tries per thread.
    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
      auto &T = (*ThreadTries)[I];
      T.Trie->~FunctionCallTrie();
      InternalFree(T.Trie);
    }
    ThreadTries->Reset();
    InternalFree(ThreadTries);
    ThreadTries = nullptr;
  }

  // Reset the global allocators.
  if (GlobalAllocators != nullptr) {
    GlobalAllocators->~Allocators();
    InternalFree(GlobalAllocators);
    GlobalAllocators = nullptr;
  }
  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
  new (GlobalAllocators) FunctionCallTrie::Allocators();
  *GlobalAllocators = FunctionCallTrie::InitAllocators();
  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
      InternalAlloc(sizeof(Vector<ThreadTrie>)));
  new (ThreadTries) Vector<ThreadTrie>();
  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
  new (ProfileBuffers) Vector<ProfileBuffer>();
}

XRayBuffer nextBuffer(XRayBuffer B) {
  SpinMutexLock Lock(&GlobalMutex);
  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
    return {nullptr, 0};

  if (B.Data == nullptr)
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->Size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray
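// Illustration only, a minimal sketch of how a consumer might drive
// nextBuffer() above (this is not code that ships in this file): starting
// from an empty XRayBuffer, keep calling nextBuffer() until it returns
// {nullptr, 0}, treating each returned buffer as one BlockHeader followed by
// its serialized records.
//
//   XRayBuffer B{nullptr, 0};
//   for (B = __xray::profileCollectorService::nextBuffer(B); B.Data != nullptr;
//        B = __xray::profileCollectorService::nextBuffer(B)) {
//     // B.Data points at B.Size bytes: a BlockHeader, then the records.
//   }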