author     Dean Michael Berris <dberris@google.com>    2018-07-18 01:53:39 +0000
committer  Dean Michael Berris <dberris@google.com>    2018-07-18 01:53:39 +0000
commit     1d40550d23737645acb8d9f4fd90472b52328906 (patch)
tree       8a46145b7b54ff6ec7d8cf75963a36465b2da312 /lib
parent     f8e9626b112c22014212116b8d53dcc0d2c657d1 (diff)
[XRay][compiler-rt] Simplify Allocator Implementation
Summary:
This change simplifies the XRay Allocator implementation to self-manage an
mmap'ed memory segment instead of using the internal allocator implementation
in sanitizer_common.

We've found through the benchmarks and profiling in D48879 that using the
internal allocator in sanitizer_common introduces a bottleneck on memory
allocation through a central spinlock. This change allows thread-local
allocators to eliminate contention on the centralized allocator.

To get the most benefit from this approach, we also use a managed allocator
for the chunk elements used by the segmented array implementation. This gives
us the chance to amortize the cost of allocating memory when creating these
internal segmented array data structures.

We also took the opportunity to remove the preallocation argument from the
allocator API, simplifying the usage of the allocator throughout the
profiling implementation.

In this change we also tweak some of the flag values to reduce the maximum
amount of memory we use/need for each thread when requesting memory through
mmap.

Depends on D48956.

Reviewers: kpw, eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D49217

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@337342 91177308-0d34-0410-b5e6-96231b3b80d8
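The gist of the new allocator, sketched below as a stand-alone C++ program, is
a lazily mmap'ed backing segment with a bump pointer that hands out fixed-size,
cache-line-aligned blocks. This is a simplified illustration only: the names
(BumpAllocator, nearestBoundary) are invented for the sketch, it omits the
SpinMutex that guards the real Alloc() path, and it assumes plain POSIX mmap
semantics rather than internal_mmap with MAP_NORESERVE. The actual
implementation is in lib/xray/xray_allocator.h in the diff below.

    #include <sys/mman.h>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    constexpr size_t kCacheLineSize = 64;

    // Round N up to the nearest multiple of Multiple (mirrors the intent of
    // __xray::nearest_boundary).
    constexpr size_t nearestBoundary(size_t N, size_t Multiple) {
      return N + (N % Multiple == 0 ? 0 : Multiple - (N % Multiple));
    }

    template <size_t N> class BumpAllocator {
      const size_t MaxMemory;
      char *BackingStore = nullptr; // lazily mmap'ed segment
      char *NextBlock = nullptr;    // bump pointer into the segment
      size_t AllocatedBlocks = 0;

    public:
      // Each block is a cache-line-size multiple large enough to hold N bytes.
      static constexpr size_t BlockSize = nearestBoundary(N, kCacheLineSize);

      explicit BumpAllocator(size_t M)
          : MaxMemory(nearestBoundary(M, kCacheLineSize)) {}

      void *allocate() {
        if (BackingStore == nullptr) {
          // Acquire the whole segment up front; mmap returns page-aligned
          // memory, so the first block is already cache-line aligned.
          void *P = mmap(nullptr, MaxMemory, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (P == MAP_FAILED)
            return nullptr;
          BackingStore = static_cast<char *>(P);
          NextBlock = BackingStore;
        }
        // Refuse to hand out more blocks than the segment can hold.
        if ((AllocatedBlocks + 1) * BlockSize > MaxMemory)
          return nullptr;
        void *Result = NextBlock;
        NextBlock += BlockSize;
        ++AllocatedBlocks;
        return Result;
      }

      ~BumpAllocator() {
        if (BackingStore != nullptr)
          munmap(BackingStore, MaxMemory);
      }
    };

    int main() {
      // One allocator instance per thread (e.g. held in thread_local storage)
      // avoids the central spinlock of the sanitizer_common internal allocator.
      BumpAllocator<sizeof(long long) * 2> A(1 << 16);
      void *B = A.allocate();
      std::printf("allocated block at %p\n", B);
    }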
Diffstat (limited to 'lib')
-rw-r--r--  lib/xray/tests/unit/allocator_test.cc             6
-rw-r--r--  lib/xray/tests/unit/function_call_trie_test.cc   11
-rw-r--r--  lib/xray/tests/unit/segmented_array_test.cc      57
-rw-r--r--  lib/xray/xray_allocator.h                       170
-rw-r--r--  lib/xray/xray_function_call_trie.h               80
-rw-r--r--  lib/xray/xray_profile_collector.cc               26
-rw-r--r--  lib/xray/xray_profiling.cc                       37
-rw-r--r--  lib/xray/xray_profiling_flags.inc                 2
-rw-r--r--  lib/xray/xray_segmented_array.h                  62
-rw-r--r--  lib/xray/xray_utils.h                            10
10 files changed, 245 insertions, 216 deletions
diff --git a/lib/xray/tests/unit/allocator_test.cc b/lib/xray/tests/unit/allocator_test.cc
index 7636d296a..be404160e 100644
--- a/lib/xray/tests/unit/allocator_test.cc
+++ b/lib/xray/tests/unit/allocator_test.cc
@@ -22,16 +22,16 @@ struct TestData {
s64 Second;
};
-TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11, 0); }
+TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11); }
TEST(AllocatorTest, Allocate) {
- Allocator<sizeof(TestData)> A(2 << 11, 0);
+ Allocator<sizeof(TestData)> A(2 << 11);
auto B = A.Allocate();
ASSERT_NE(B.Data, nullptr);
}
TEST(AllocatorTest, OverAllocate) {
- Allocator<sizeof(TestData)> A(sizeof(TestData), 0);
+ Allocator<sizeof(TestData)> A(sizeof(TestData));
auto B1 = A.Allocate();
(void)B1;
auto B2 = A.Allocate();
diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc
index 6ad51aa14..049ecfb07 100644
--- a/lib/xray/tests/unit/function_call_trie_test.cc
+++ b/lib/xray/tests/unit/function_call_trie_test.cc
@@ -19,8 +19,6 @@ namespace __xray {
namespace {
TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
- // FIXME: Support passing in configuration for allocators in the allocator
- // constructors.
profilingFlags()->setDefaults();
FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(Allocators);
@@ -47,6 +45,7 @@ TEST(FunctionCallTrieTest, EnterAndExitFunction) {
}
TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+ profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
Trie.exitFunction(1, 1);
@@ -56,6 +55,7 @@ TEST(FunctionCallTrieTest, MissingFunctionEntry) {
}
TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+ profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
Trie.enterFunction(2, 1);
@@ -63,16 +63,19 @@ TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
Trie.exitFunction(1, 5);
const auto &R = Trie.getRoots();
- ASSERT_TRUE(R.empty());
+ ASSERT_FALSE(R.empty());
+ EXPECT_EQ(R.size(), size_t{1});
}
TEST(FunctionCallTrieTest, MissingFunctionExit) {
+ profilingFlags()->setDefaults();
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
Trie.enterFunction(1, 1);
const auto &R = Trie.getRoots();
- ASSERT_TRUE(R.empty());
+ ASSERT_FALSE(R.empty());
+ EXPECT_EQ(R.size(), size_t{1});
}
TEST(FunctionCallTrieTest, MultipleRoots) {
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
index 539162d7d..b48d481ff 100644
--- a/lib/xray/tests/unit/segmented_array_test.cc
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -12,27 +12,29 @@ struct TestData {
TestData(s64 F, s64 S) : First(F), Second(S) {}
};
-TEST(SegmentedArrayTest, Construction) {
- Array<TestData> Data;
- (void)Data;
-}
-
-TEST(SegmentedArrayTest, ConstructWithAllocator) {
+TEST(SegmentedArrayTest, ConstructWithAllocators) {
using AllocatorType = typename Array<TestData>::AllocatorType;
- AllocatorType A(1 << 4, 0);
- Array<TestData> Data(A);
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> Data(A, CA);
(void)Data;
}
TEST(SegmentedArrayTest, ConstructAndPopulate) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> data(A, CA);
ASSERT_NE(data.Append(TestData{0, 0}), nullptr);
ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
ASSERT_EQ(data.size(), 2u);
}
TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> data(A, CA);
ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
ASSERT_EQ(data.size(), 1u);
ASSERT_EQ(data[0].First, 0);
@@ -40,7 +42,10 @@ TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
}
TEST(SegmentedArrayTest, PopulateWithMoreElements) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 24);
+ ChunkAllocator CA(1 << 20);
+ Array<TestData> data(A, CA);
static const auto kMaxElements = 100u;
for (auto I = 0u; I < kMaxElements; ++I) {
ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
@@ -53,14 +58,20 @@ TEST(SegmentedArrayTest, PopulateWithMoreElements) {
}
TEST(SegmentedArrayTest, AppendEmplace) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> data(A, CA);
ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
ASSERT_EQ(data[0].First, 1);
ASSERT_EQ(data[0].Second, 1);
}
TEST(SegmentedArrayTest, AppendAndTrim) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> data(A, CA);
ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
ASSERT_EQ(data.size(), 1u);
data.trim(1);
@@ -69,7 +80,10 @@ TEST(SegmentedArrayTest, AppendAndTrim) {
}
TEST(SegmentedArrayTest, IteratorAdvance) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> data(A, CA);
ASSERT_TRUE(data.empty());
ASSERT_EQ(data.begin(), data.end());
auto I0 = data.begin();
@@ -88,7 +102,10 @@ TEST(SegmentedArrayTest, IteratorAdvance) {
}
TEST(SegmentedArrayTest, IteratorRetreat) {
- Array<TestData> data;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 4);
+ ChunkAllocator CA(1 << 4);
+ Array<TestData> data(A, CA);
ASSERT_TRUE(data.empty());
ASSERT_EQ(data.begin(), data.end());
ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
@@ -108,8 +125,9 @@ TEST(SegmentedArrayTest, IteratorRetreat) {
TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
using AllocatorType = typename Array<TestData>::AllocatorType;
- AllocatorType A(1 << 10, 0);
- Array<TestData> Data(A);
+ AllocatorType A(1 << 20);
+ ChunkAllocator CA(1 << 10);
+ Array<TestData> Data(A, CA);
ASSERT_TRUE(Data.empty());
auto I0Begin = Data.begin(), I0End = Data.end();
// Add enough elements in Data to have more than one chunk.
@@ -160,8 +178,9 @@ struct ShadowStackEntry {
TEST(SegmentedArrayTest, SimulateStackBehaviour) {
using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
- AllocatorType A(1 << 10, 0);
- Array<ShadowStackEntry> Data(A);
+ AllocatorType A(1 << 10);
+ ChunkAllocator CA(1 << 10);
+ Array<ShadowStackEntry> Data(A, CA);
static uint64_t Dummy = 0;
constexpr uint64_t Max = 9;
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
index c05c0485c..5d0a62547 100644
--- a/lib/xray/xray_allocator.h
+++ b/lib/xray/xray_allocator.h
@@ -16,10 +16,11 @@
#ifndef XRAY_ALLOCATOR_H
#define XRAY_ALLOCATOR_H
-#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_posix.h"
+#include "sys/mman.h"
#include "xray_utils.h"
#include <cstddef>
#include <cstdint>
@@ -36,130 +37,87 @@ namespace __xray {
/// allocation function. N is used to compute the size of a block, which is
/// cache-line-size multiples worth of memory. We compute the size of a block by
/// determining how many cache lines worth of memory is required to subsume N.
+///
+/// The Allocator instance will manage its own memory acquired through mmap.
+/// This severely constrains the platforms on which this can be used to POSIX
+/// systems where mmap semantics are well-defined.
+///
+/// FIXME: Isolate the lower-level memory management to a different abstraction
+/// that can be platform-specific.
template <size_t N> struct Allocator {
// The Allocator returns memory as Block instances.
struct Block {
/// Compute the minimum cache-line size multiple that is >= N.
static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
- void *Data = nullptr;
+ void *Data;
};
private:
- // A BlockLink will contain a fixed number of blocks, each with an identifier
- // to specify whether it's been handed out or not. We keep track of BlockLink
- // iterators, which are basically a pointer to the link and an offset into
- // the fixed set of blocks associated with a link. The iterators are
- // bidirectional.
- //
- // We're calling it a "link" in the context of seeing these as a chain of
- // block pointer containers (i.e. links in a chain).
- struct BlockLink {
- static_assert(kCacheLineSize % sizeof(void *) == 0,
- "Cache line size is not divisible by size of void*; none of "
- "the assumptions of the BlockLink will hold.");
-
- // We compute the number of pointers to areas in memory where we consider as
- // individual blocks we've allocated. To ensure that instances of the
- // BlockLink object are cache-line sized, we deduct two pointers worth
- // representing the pointer to the previous link and the backing store for
- // the whole block.
- //
- // This structure corresponds to the following layout:
- //
- // Blocks [ 0, 1, 2, .., BlockPtrCount - 2]
- //
- static constexpr auto BlockPtrCount =
- (kCacheLineSize / sizeof(Block *)) - 2;
-
- BlockLink() {
- // Zero out Blocks.
- // FIXME: Use a braced member initializer when we drop support for GCC
- // 4.8.
- internal_memset(Blocks, 0, sizeof(Blocks));
- }
-
- // FIXME: Align this to cache-line address boundaries?
- Block Blocks[BlockPtrCount];
- BlockLink *Prev = nullptr;
- void *BackingStore = nullptr;
- };
-
- static_assert(sizeof(BlockLink) == kCacheLineSize,
- "BlockLink instances must be cache-line-sized.");
+ const size_t MaxMemory{0};
+ void *BackingStore = nullptr;
+ void *AlignedNextBlock = nullptr;
+ size_t AllocatedBlocks = 0;
+ SpinMutex Mutex{};
- static BlockLink NullLink;
+ void *Alloc() {
+ SpinMutexLock Lock(&Mutex);
+ if (UNLIKELY(BackingStore == nullptr)) {
+ BackingStore = reinterpret_cast<void *>(
+ internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0));
+ if (BackingStore == MAP_FAILED) {
+ BackingStore = nullptr;
+ if (Verbosity())
+ Report("XRay Profiling: Failed to allocate memory for allocator.\n");
+ return nullptr;
+ }
+
+ AlignedNextBlock = BackingStore;
+
+ // Ensure that NextBlock is aligned appropriately.
+ auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore);
+ auto AlignedNextBlockNum = nearest_boundary(
+ reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize);
+ if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
+ munmap(BackingStore, MaxMemory);
+ AlignedNextBlock = BackingStore = nullptr;
+ if (Verbosity())
+ Report("XRay Profiling: Cannot obtain enough memory from "
+ "preallocated region.\n");
+ return nullptr;
+ }
+
+ AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+
+ // Assert that AlignedNextBlock is cache-line aligned.
+ DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize,
+ 0);
+ }
- // FIXME: Implement a freelist, in case we actually do intend to return memory
- // to the allocator, as opposed to just de-allocating everything in one go?
+ if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+ return nullptr;
- size_t MaxMemory;
- SpinMutex Mutex{};
- BlockLink *Tail = &NullLink;
- size_t Counter = 0;
-
- BlockLink *NewChainLink(uint64_t Alignment) {
- auto NewChain = reinterpret_cast<BlockLink *>(
- InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
- auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
- (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment));
- size_t Offset = 0;
- DCHECK_NE(NewChain, nullptr);
- DCHECK_NE(BackingStore, nullptr);
- NewChain->BackingStore = BackingStore;
-
- // Here we ensure that the alignment of the pointers we're handing out
- // adhere to the alignment requirements of the call to Allocate().
- for (auto &B : NewChain->Blocks) {
- auto AlignmentAdjustment =
- nearest_boundary(reinterpret_cast<uintptr_t>(BackingStore + Offset),
- Alignment) -
- reinterpret_cast<uintptr_t>(BackingStore + Offset);
- B.Data = BackingStore + AlignmentAdjustment + Offset;
- DCHECK_EQ(reinterpret_cast<uintptr_t>(B.Data) % Alignment, 0);
- Offset += AlignmentAdjustment + Block::Size;
- }
- NewChain->Prev = Tail;
- return NewChain;
+ // Align the pointer we'd like to return to an appropriate alignment, then
+ // advance the pointer from where to start allocations.
+ void *Result = AlignedNextBlock;
+ AlignedNextBlock = reinterpret_cast<void *>(
+ reinterpret_cast<char *>(AlignedNextBlock) + N);
+ ++AllocatedBlocks;
+ return Result;
}
public:
- Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) {
- // FIXME: Implement PreAllocate support!
- }
-
- Block Allocate(uint64_t Alignment = 8) {
- SpinMutexLock Lock(&Mutex);
- // Check whether we're over quota.
- if (Counter * Block::Size >= MaxMemory)
- return {};
+ explicit Allocator(size_t M)
+ : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
- size_t ChainOffset = Counter % BlockLink::BlockPtrCount;
-
- Block B{};
- BlockLink *Link = Tail;
- if (UNLIKELY(Counter == 0 || ChainOffset == 0))
- Tail = Link = NewChainLink(Alignment);
-
- B = Link->Blocks[ChainOffset];
- ++Counter;
- return B;
- }
+ Block Allocate() { return {Alloc()}; }
~Allocator() NOEXCEPT {
- // We need to deallocate all the blocks, including the chain links.
- for (auto *C = Tail; C != &NullLink;) {
- // We know that the data block is a large contiguous page, we deallocate
- // that at once.
- InternalFree(C->BackingStore);
- auto Prev = C->Prev;
- InternalFree(C);
- C = Prev;
+ if (BackingStore != nullptr) {
+ internal_munmap(BackingStore, MaxMemory);
}
}
-}; // namespace __xray
-
-// Storage for the NullLink sentinel.
-template <size_t N> typename Allocator<N>::BlockLink Allocator<N>::NullLink;
+};
} // namespace __xray
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index 6c4710584..e6e31cec2 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -15,6 +15,7 @@
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H
+#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory> // For placement new.
@@ -118,9 +119,9 @@ public:
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
- int32_t F)
- : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+ Node(Node *P, NodeIdPairAllocatorType &A, ChunkAllocator &CA, int64_t CC,
+ int64_t CLT, int32_t F)
+ : Parent(P), Callees(A, CA), CallCount(CC), CumulativeLocalTime(CLT),
FId(F) {}
// TODO: Include the compact histogram.
@@ -152,6 +153,7 @@ public:
RootAllocatorType *RootAllocator = nullptr;
ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+ ChunkAllocator *ChunkAlloc = nullptr;
Allocators() {}
Allocators(const Allocators &) = delete;
@@ -160,11 +162,12 @@ public:
Allocators(Allocators &&O)
: NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
ShadowStackAllocator(O.ShadowStackAllocator),
- NodeIdPairAllocator(O.NodeIdPairAllocator) {
+ NodeIdPairAllocator(O.NodeIdPairAllocator), ChunkAlloc(O.ChunkAlloc) {
O.NodeAllocator = nullptr;
O.RootAllocator = nullptr;
O.ShadowStackAllocator = nullptr;
O.NodeIdPairAllocator = nullptr;
+ O.ChunkAlloc = nullptr;
}
Allocators &operator=(Allocators &&O) {
@@ -188,6 +191,11 @@ public:
O.NodeIdPairAllocator = this->NodeIdPairAllocator;
this->NodeIdPairAllocator = Tmp;
}
+ {
+ auto Tmp = O.ChunkAlloc;
+ O.ChunkAlloc = this->ChunkAlloc;
+ this->ChunkAlloc = Tmp;
+ }
return *this;
}
@@ -198,18 +206,27 @@ public:
if (NodeAllocator != nullptr) {
NodeAllocator->~NodeAllocatorType();
InternalFree(NodeAllocator);
+ NodeAllocator = nullptr;
}
if (RootAllocator != nullptr) {
RootAllocator->~RootAllocatorType();
InternalFree(RootAllocator);
+ RootAllocator = nullptr;
}
if (ShadowStackAllocator != nullptr) {
ShadowStackAllocator->~ShadowStackAllocatorType();
InternalFree(ShadowStackAllocator);
+ ShadowStackAllocator = nullptr;
}
if (NodeIdPairAllocator != nullptr) {
NodeIdPairAllocator->~NodeIdPairAllocatorType();
InternalFree(NodeIdPairAllocator);
+ NodeIdPairAllocator = nullptr;
+ }
+ if (ChunkAlloc != nullptr) {
+ ChunkAlloc->~ChunkAllocator();
+ InternalFree(ChunkAlloc);
+ ChunkAlloc = nullptr;
}
}
};
@@ -223,24 +240,29 @@ public:
Allocators A;
auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
- new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0);
+ new (NodeAllocator) Allocators::NodeAllocatorType(Max);
A.NodeAllocator = NodeAllocator;
auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
InternalAlloc(sizeof(Allocators::RootAllocatorType)));
- new (RootAllocator) Allocators::RootAllocatorType(Max, 0);
+ new (RootAllocator) Allocators::RootAllocatorType(Max);
A.RootAllocator = RootAllocator;
auto ShadowStackAllocator =
reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
- new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0);
+ new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
A.ShadowStackAllocator = ShadowStackAllocator;
auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
InternalAlloc(sizeof(NodeIdPairAllocatorType)));
- new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0);
+ new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
A.NodeIdPairAllocator = NodeIdPairAllocator;
+
+ auto ChunkAlloc = reinterpret_cast<ChunkAllocator *>(
+ InternalAlloc(sizeof(ChunkAllocator)));
+ new (ChunkAlloc) ChunkAllocator(Max);
+ A.ChunkAlloc = ChunkAlloc;
return A;
}
@@ -249,20 +271,22 @@ private:
RootArray Roots;
ShadowStackArray ShadowStack;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+ ChunkAllocator *ChunkAlloc = nullptr;
public:
explicit FunctionCallTrie(const Allocators &A)
- : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
- ShadowStack(*A.ShadowStackAllocator),
- NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+ : Nodes(*A.NodeAllocator, *A.ChunkAlloc),
+ Roots(*A.RootAllocator, *A.ChunkAlloc),
+ ShadowStack(*A.ShadowStackAllocator, *A.ChunkAlloc),
+ NodeIdPairAllocator(A.NodeIdPairAllocator), ChunkAlloc(A.ChunkAlloc) {}
void enterFunction(const int32_t FId, uint64_t TSC) {
DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
- auto NewRoot =
- Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator,
+ *ChunkAlloc, 0, 0, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
@@ -285,8 +309,8 @@ public:
}
// This means we've never seen this stack before, create a new node here.
- auto NewNode =
- Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ auto NewNode = Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator,
+ *ChunkAlloc, 0, 0, FId);
if (UNLIKELY(NewNode == nullptr))
return;
DCHECK_NE(NewNode, nullptr);
@@ -345,13 +369,14 @@ public:
using Stack = Array<NodeAndParent>;
typename Stack::AllocatorType StackAllocator(
- profilingFlags()->stack_allocator_max, 0);
- Stack DFSStack(StackAllocator);
+ profilingFlags()->stack_allocator_max);
+ ChunkAllocator StackChunkAllocator(profilingFlags()->stack_allocator_max);
+ Stack DFSStack(StackAllocator, StackChunkAllocator);
for (const auto Root : getRoots()) {
// Add a node in O for this root.
auto NewRoot = O.Nodes.AppendEmplace(
- nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+ nullptr, *O.NodeIdPairAllocator, *O.ChunkAlloc, Root->CallCount,
Root->CumulativeLocalTime, Root->FId);
// Because we cannot allocate more memory we should bail out right away.
@@ -370,8 +395,9 @@ public:
DFSStack.trim(1);
for (const auto Callee : NP.Node->Callees) {
auto NewNode = O.Nodes.AppendEmplace(
- NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
- Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+ NP.NewNode, *O.NodeIdPairAllocator, *O.ChunkAlloc,
+ Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
+ Callee.FId);
if (UNLIKELY(NewNode == nullptr))
return;
NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
@@ -396,16 +422,17 @@ public:
};
using Stack = Array<NodeAndTarget>;
typename Stack::AllocatorType StackAllocator(
- profilingFlags()->stack_allocator_max, 0);
- Stack DFSStack(StackAllocator);
+ profilingFlags()->stack_allocator_max);
+ ChunkAllocator CA(profilingFlags()->stack_allocator_max);
+ Stack DFSStack(StackAllocator, CA);
for (const auto Root : getRoots()) {
Node *TargetRoot = nullptr;
auto R = O.Roots.find_element(
[&](const Node *Node) { return Node->FId == Root->FId; });
if (R == nullptr) {
- TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
- 0, Root->FId);
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator,
+ *O.ChunkAlloc, 0, 0, Root->FId);
if (UNLIKELY(TargetRoot == nullptr))
return;
@@ -429,8 +456,9 @@ public:
return C.FId == Callee.FId;
});
if (TargetCallee == nullptr) {
- auto NewTargetNode = O.Nodes.AppendEmplace(
- NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+ auto NewTargetNode =
+ O.Nodes.AppendEmplace(NT.TargetNode, *O.NodeIdPairAllocator,
+ *O.ChunkAlloc, 0, 0, Callee.FId);
if (UNLIKELY(NewTargetNode == nullptr))
return;
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
index 972649193..ba347d13b 100644
--- a/lib/xray/xray_profile_collector.cc
+++ b/lib/xray/xray_profile_collector.cc
@@ -13,6 +13,7 @@
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "xray_profiling_flags.h"
@@ -117,11 +118,12 @@ struct ProfileRecord {
const FunctionCallTrie::Node *Node = nullptr;
// Constructor for in-place construction.
- ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+ ProfileRecord(PathAllocator &A, ChunkAllocator &CA,
+ const FunctionCallTrie::Node *N)
: Path([&] {
auto P =
reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
- new (P) PathArray(A);
+ new (P) PathArray(A, CA);
return P;
}()),
Node(N) {}
@@ -135,17 +137,20 @@ using ProfileRecordArray = Array<ProfileRecord>;
// the path(s) and the data associated with the path.
static void populateRecords(ProfileRecordArray &PRs,
ProfileRecord::PathAllocator &PA,
- const FunctionCallTrie &Trie) {
+ ChunkAllocator &CA, const FunctionCallTrie &Trie) {
using StackArray = Array<const FunctionCallTrie::Node *>;
using StackAllocator = typename StackArray::AllocatorType;
- StackAllocator StackAlloc(profilingFlags()->stack_allocator_max, 0);
- StackArray DFSStack(StackAlloc);
+ StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+ ChunkAllocator StackChunkAlloc(profilingFlags()->stack_allocator_max);
+ StackArray DFSStack(StackAlloc, StackChunkAlloc);
for (const auto R : Trie.getRoots()) {
DFSStack.Append(R);
while (!DFSStack.empty()) {
auto Node = DFSStack.back();
DFSStack.trim(1);
- auto Record = PRs.AppendEmplace(PA, Node);
+ auto Record = PRs.AppendEmplace(PA, CA, Node);
+ if (Record == nullptr)
+ return;
DCHECK_NE(Record, nullptr);
// Traverse the Node's parents and as we're doing so, get the FIds in
@@ -208,10 +213,11 @@ void serialize() {
// Then repopulate the global ProfileBuffers.
for (u32 I = 0; I < ThreadTries->Size(); ++I) {
using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
- ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max, 0);
+ ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+ ChunkAllocator CA(profilingFlags()->global_allocator_max);
ProfileRecord::PathAllocator PathAlloc(
- profilingFlags()->global_allocator_max, 0);
- ProfileRecordArray ProfileRecords(PRAlloc);
+ profilingFlags()->global_allocator_max);
+ ProfileRecordArray ProfileRecords(PRAlloc, CA);
// First, we want to compute the amount of space we're going to need. We'll
// use a local allocator and an __xray::Array<...> to store the intermediary
@@ -220,7 +226,7 @@ void serialize() {
const auto &Trie = *(*ThreadTries)[I].Trie;
if (Trie.getRoots().empty())
continue;
- populateRecords(ProfileRecords, PathAlloc, Trie);
+ populateRecords(ProfileRecords, PathAlloc, CA, Trie);
DCHECK(!Trie.getRoots().empty());
DCHECK(!ProfileRecords.empty());
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index 2c5b82959..3a0c0b90e 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -56,8 +56,9 @@ struct alignas(64) ProfilingData {
static pthread_key_t ProfilingKey;
-ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
- thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage;
+thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
+
+static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
if (pthread_getspecific(ProfilingKey) == NULL) {
new (&ThreadStorage) ProfilingData{};
pthread_setspecific(ProfilingKey, &ThreadStorage);
@@ -86,6 +87,18 @@ ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
return TLD;
}
+static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
+ auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+ if (TLD.Allocators != nullptr && TLD.FCT != nullptr) {
+ TLD.FCT->~FunctionCallTrie();
+ TLD.Allocators->~Allocators();
+ InternalFree(TLD.FCT);
+ InternalFree(TLD.Allocators);
+ TLD.FCT = nullptr;
+ TLD.Allocators = nullptr;
+ }
+}
+
} // namespace
const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
@@ -148,6 +161,9 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
profileCollectorService::reset();
+ // Flush the current thread's local data structures as well.
+ cleanupTLD();
+
atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
memory_order_release);
@@ -158,17 +174,12 @@ namespace {
thread_local atomic_uint8_t ReentranceGuard{0};
-void postCurrentThreadFCT(ProfilingData &TLD) {
+static void postCurrentThreadFCT(ProfilingData &TLD) {
if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
return;
profileCollectorService::post(*TLD.FCT, GetTid());
- TLD.FCT->~FunctionCallTrie();
- TLD.Allocators->~Allocators();
- InternalFree(TLD.FCT);
- InternalFree(TLD.Allocators);
- TLD.FCT = nullptr;
- TLD.Allocators = nullptr;
+ cleanupTLD();
}
} // namespace
@@ -294,10 +305,14 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
// ABI functions for registering exit handlers.
Atexit(+[] {
// Finalize and flush.
- if (profilingFinalize() != XRAY_LOG_FINALIZED)
+ if (profilingFinalize() != XRAY_LOG_FINALIZED) {
+ cleanupTLD();
return;
- if (profilingFlush() != XRAY_LOG_FLUSHED)
+ }
+ if (profilingFlush() != XRAY_LOG_FLUSHED) {
+ cleanupTLD();
return;
+ }
if (Verbosity())
Report("XRay Profile flushed at exit.");
});
diff --git a/lib/xray/xray_profiling_flags.inc b/lib/xray/xray_profiling_flags.inc
index 04ccd459d..e9230ae64 100644
--- a/lib/xray/xray_profiling_flags.inc
+++ b/lib/xray/xray_profiling_flags.inc
@@ -18,7 +18,7 @@ XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
"Maximum size of any single per-thread allocator.")
XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
"Maximum size of the global allocator for profile storage.")
-XRAY_FLAG(uptr, stack_allocator_max, 2 << 24,
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 20,
"Maximum size of the traversal stack allocator.")
XRAY_FLAG(int, grace_period_ms, 1,
"Profile collection will wait this much time in milliseconds before "
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
index ddfdfb3c7..f001b230c 100644
--- a/lib/xray/xray_segmented_array.h
+++ b/lib/xray/xray_segmented_array.h
@@ -24,6 +24,13 @@
#include <utility>
namespace __xray {
+struct Chunk {
+ void *Data;
+ Chunk *Prev;
+ Chunk *Next;
+};
+
+using ChunkAllocator = Allocator<next_pow2(sizeof(Chunk)) * 8>;
/// The Array type provides an interface similar to std::vector<...> but does
/// not shrink in size. Once constructed, elements can be appended but cannot be
@@ -51,16 +58,10 @@ private:
// TODO: Consider co-locating the chunk information with the data in the
// Block, as in an intrusive list -- i.e. putting the next and previous
// pointer values inside the Block storage.
- struct Chunk {
- typename AllocatorType::Block Block;
- static constexpr size_t Size = N;
- Chunk *Prev = this;
- Chunk *Next = this;
- };
-
static Chunk SentinelChunk;
AllocatorType *Alloc;
+ ChunkAllocator *ChunkAlloc;
Chunk *Head = &SentinelChunk;
Chunk *Tail = &SentinelChunk;
size_t Size = 0;
@@ -82,30 +83,19 @@ private:
return FreeChunk;
}
- auto Block = Alloc->Allocate(ElementStorageSize);
+ auto Block = Alloc->Allocate();
if (Block.Data == nullptr)
return nullptr;
- // TODO: Maybe use a separate managed allocator for Chunk instances?
- auto C = reinterpret_cast<Chunk *>(
- InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize));
- if (C == nullptr)
+ auto ChunkElement = ChunkAlloc->Allocate();
+ if (ChunkElement.Data == nullptr)
return nullptr;
- C->Block = Block;
- C->Prev = &SentinelChunk;
- C->Next = &SentinelChunk;
- return C;
- }
- static AllocatorType &GetGlobalAllocator() {
- static AllocatorType *const GlobalAllocator = [] {
- AllocatorType *A = reinterpret_cast<AllocatorType *>(
- InternalAlloc(sizeof(AllocatorType)));
- new (A) AllocatorType(2 << 10, 0);
- return A;
- }();
-
- return *GlobalAllocator;
+ // Placement-new the Chunk element at the appropriate location.
+ auto C = reinterpret_cast<Chunk *>(ChunkElement.Data);
+ Chunk LocalC{Block.Data, &SentinelChunk, &SentinelChunk};
+ internal_memcpy(C, &LocalC, sizeof(LocalC));
+ return C;
}
Chunk *InitHeadAndTail() {
@@ -201,21 +191,21 @@ private:
U &operator*() const {
DCHECK_NE(C, &SentinelChunk);
auto RelOff = Offset % N;
- return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+ return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
(RelOff * ElementStorageSize));
}
U *operator->() const {
DCHECK_NE(C, &SentinelChunk);
auto RelOff = Offset % N;
- return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+ return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
(RelOff * ElementStorageSize));
}
};
public:
- explicit Array(AllocatorType &A) : Alloc(&A) {}
- Array() : Array(GetGlobalAllocator()) {}
+ explicit Array(AllocatorType &A, ChunkAllocator &CA)
+ : Alloc(&A), ChunkAlloc(&CA) {}
Array(const Array &) = delete;
Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
@@ -246,9 +236,8 @@ public:
if (AppendNewChunk() == nullptr)
return nullptr;
- auto Position =
- reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Block.Data) +
- (Offset * ElementStorageSize));
+ auto Position = reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Data) +
+ (Offset * ElementStorageSize));
*Position = E;
++Size;
return Position;
@@ -268,7 +257,7 @@ public:
}
DCHECK_NE(Tail, &SentinelChunk);
- auto Position = reinterpret_cast<char *>(LatestChunk->Block.Data) +
+ auto Position = reinterpret_cast<char *>(LatestChunk->Data) +
(Offset * ElementStorageSize);
DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
// In-place construct at Position.
@@ -286,7 +275,7 @@ public:
Offset -= N;
DCHECK_NE(C, &SentinelChunk);
}
- return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Block.Data) +
+ return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Data) +
(Offset * ElementStorageSize));
}
@@ -360,7 +349,8 @@ public:
// ensure that storage for the SentinelChunk is defined and has a single
// address.
template <class T, size_t N>
-typename Array<T, N>::Chunk Array<T, N>::SentinelChunk;
+Chunk Array<T, N>::SentinelChunk{nullptr, &Array<T, N>::SentinelChunk,
+ &Array<T, N>::SentinelChunk};
} // namespace __xray
diff --git a/lib/xray/xray_utils.h b/lib/xray/xray_utils.h
index ce64528f9..eafa16e1a 100644
--- a/lib/xray/xray_utils.h
+++ b/lib/xray/xray_utils.h
@@ -15,6 +15,8 @@
#ifndef XRAY_UTILS_H
#define XRAY_UTILS_H
+#include <cstddef>
+#include <cstdint>
#include <sys/types.h>
#include <utility>
@@ -54,6 +56,14 @@ constexpr size_t next_pow2(size_t number) {
return next_pow2_helper(number, 1);
}
+template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; }
+
+template <class T> constexpr T &min(T &A, T &B) { return A <= B ? A : B; }
+
+constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) {
+ return max(A, B) - min(A, B);
+}
+
} // namespace __xray
#endif // XRAY_UTILS_H
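After this change, constructing a segmented array requires pairing an element
allocator with a ChunkAllocator, as the updated unit tests above show. The
following is a minimal usage sketch, not code from the commit: Record and
example() are made-up names, the sizes are arbitrary, and it assumes an
XRay-internal translation unit where xray_segmented_array.h is on the include
path.

    #include "xray_segmented_array.h"
    #include <cstdint>

    namespace __xray {

    struct Record {
      int64_t First;
      int64_t Second;
      // Constructor so Array<...>::AppendEmplace can in-place construct us.
      Record(int64_t F, int64_t S) : First(F), Second(S) {}
    };

    void example() {
      // The Array owns no memory itself; element storage comes from A and the
      // Chunk bookkeeping nodes come from CA, both of which now self-manage
      // mmap'ed segments.
      using ElementAllocator = typename Array<Record>::AllocatorType;
      ElementAllocator A(1 << 12);
      ChunkAllocator CA(1 << 12);
      Array<Record> Data(A, CA);
      // Returns nullptr if either allocator has exhausted its segment.
      Data.AppendEmplace(1, 2);
    }

    } // namespace __xray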