author     Dean Michael Berris <dberris@google.com>    2018-07-10 08:25:44 +0000
committer  Dean Michael Berris <dberris@google.com>    2018-07-10 08:25:44 +0000
commit     652950651ff6b1748fa97d152e670c2f92902e50 (patch)
tree       38a172c80f4bb603a432b33d83af8bbefb2a18df /lib
parent     a164037c709a816851d82be4643cacfafc4672f9 (diff)
[XRay][compiler-rt] xray::Array Freelist and Iterator Updates
Summary:
We found a bug while working on a benchmark for the profiling mode; it
manifests as a segmentation fault in the profiling handler's implementation.
This change adds unit tests which replicate the issues in isolation.

We have tracked this down to a bug in the implementation of the Freelist in
the `xray::Array` type. When we trim the array by a number of elements, we had
been incorrectly assigning pointers for the links in the freelist of chunk
nodes. We have taken the chance to add more debug-only assertions to this code
path, which let us verify these assumptions in debug builds.

In the process, we also took the opportunity to use iterators to implement
both `front()` and `back()`, which exposed a bug in the iterator decrement
operation. In particular, when we decrement past a chunk-size boundary, we end
up moving too far back and reaching the `SentinelChunk` prematurely.

This change unblocks contributing the non-crashing version of the benchmarks
to the test-suite as well.

Reviewers: kpw

Subscribers: mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D48653

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@336644 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--  lib/xray/tests/CMakeLists.txt                    8
-rw-r--r--  lib/xray/tests/unit/function_call_trie_test.cc  42
-rw-r--r--  lib/xray/tests/unit/segmented_array_test.cc     80
-rw-r--r--  lib/xray/xray_allocator.h                       40
-rw-r--r--  lib/xray/xray_function_call_trie.h              97
-rw-r--r--  lib/xray/xray_profile_collector.cc              10
-rw-r--r--  lib/xray/xray_profiling.cc                       7
-rw-r--r--  lib/xray/xray_segmented_array.h                149
-rw-r--r--  lib/xray/xray_utils.h                           18
9 files changed, 301 insertions, 150 deletions
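
The iterator decrement bug described in the summary is easiest to see in a
minimal standalone model of the chunk/offset arithmetic. This is illustrative
C++, not the xray::Array implementation; every name below is hypothetical.
With end() represented as the pair (Tail, Size), the old retreat logic stepped
to the previous chunk whenever the pre-decrement offset was a chunk multiple,
which walks one chunk too far when the array is exactly full.

#include <cassert>
#include <cstddef>

namespace {

constexpr size_t N = 4; // model chunk size

struct Chunk {
  Chunk *Prev = nullptr;
  Chunk *Next = nullptr;
};

struct ModelIterator {
  Chunk *C;
  size_t Offset;
  size_t Size; // total element count, carried so end() can be recognised
};

// Old behaviour: always retreat to the previous chunk on a boundary. For a
// perfectly full array, decrementing end() leaves the iterator one chunk
// behind the element it should reference.
void retreatOld(ModelIterator &I) {
  if ((I.Offset-- % N) == 0)
    I.C = I.C->Prev;
}

// New behaviour: decrementing from end() stays on the tail chunk, since the
// last element already lives there; only interior boundaries retreat.
void retreatNew(ModelIterator &I) {
  auto PreviousOffset = I.Offset--;
  if (PreviousOffset != I.Size && PreviousOffset % N == 0)
    I.C = I.C->Prev;
}

} // namespace

int main() {
  Chunk First, Second;
  First.Next = &Second;
  Second.Prev = &First;
  const size_t Size = 2 * N; // two completely full chunks

  ModelIterator Old{&Second, Size, Size};
  retreatOld(Old);
  assert(Old.C == &First); // too far back: element Size - 1 lives in Second

  ModelIterator New{&Second, Size, Size};
  retreatNew(New);
  assert(New.C == &Second && New.Offset == Size - 1); // stays on the tail
  return 0;
}

The new back() in the patch performs exactly this decrement from end(), which
is why the old logic could reach SentinelChunk prematurely.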
diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt
index eb17b095d..ea5f2da4e 100644
--- a/lib/xray/tests/CMakeLists.txt
+++ b/lib/xray/tests/CMakeLists.txt
@@ -68,7 +68,11 @@ function(get_xray_lib_for_arch arch lib)
endfunction()
set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
-set(XRAY_UNITTEST_LINK_FLAGS ${CMAKE_THREAD_LIBS_INIT})
+set(XRAY_UNITTEST_LINK_FLAGS
+ ${CMAKE_THREAD_LIBS_INIT}
+ -fxray-instrument
+ -lstdc++ # TODO: Detect the correct standard library used!
+ )
if (NOT APPLE)
append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS)
append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS)
@@ -94,7 +98,7 @@ macro(add_xray_unittest testname)
RUNTIME "${XRAY_RUNTIME_LIBS}"
DEPS gtest xray
CFLAGS ${XRAY_UNITTEST_CFLAGS}
- LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS} -lstdc++)
+ LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS})
set_target_properties(XRayUnitTests
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endforeach()
diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc
index d4a79d902..6ad51aa14 100644
--- a/lib/xray/tests/unit/function_call_trie_test.cc
+++ b/lib/xray/tests/unit/function_call_trie_test.cc
@@ -18,12 +18,6 @@ namespace __xray {
namespace {
-TEST(FunctionCallTrieTest, Construction) {
- // We want to make sure that we can create one of these without the set of
- // allocators we need. This will by default use the global allocators.
- FunctionCallTrie Trie;
-}
-
TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
// FIXME: Support passing in configuration for allocators in the allocator
// constructors.
@@ -61,6 +55,17 @@ TEST(FunctionCallTrieTest, MissingFunctionEntry) {
ASSERT_TRUE(R.empty());
}
+TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.enterFunction(2, 1);
+ Trie.enterFunction(3, 3);
+ Trie.exitFunction(1, 5);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_TRUE(R.empty());
+}
+
TEST(FunctionCallTrieTest, MissingFunctionExit) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
@@ -153,6 +158,31 @@ TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
EXPECT_EQ(F1.CumulativeLocalTime, 100);
}
+TEST(FunctionCallTrieTest, DeepCallStack) {
+ // Simulate a relatively deep call stack (32 levels) and ensure that we can
+ // properly pop all the way up the stack.
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ for (int i = 0; i < 32; ++i)
+ Trie.enterFunction(i + 1, i);
+ Trie.exitFunction(1, 33);
+
+ // Here, validate that we have a 32-level deep function call path from the
+ // root (1) down to the leaf (33).
+ const auto &R = Trie.getRoots();
+ ASSERT_EQ(R.size(), 1u);
+ auto F = R[0];
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(F->FId, i + 1);
+ EXPECT_EQ(F->CallCount, 1);
+ if (F->Callees.empty() && i != 31)
+ FAIL() << "Empty callees for FId " << F->FId;
+ if (i != 31)
+ F = F->Callees[0].NodePtr;
+ }
+}
+
// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
// guaranteed to be synchronised.
TEST(FunctionCallTrieTest, DeepCopy) {
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
index dd07e7f13..539162d7d 100644
--- a/lib/xray/tests/unit/segmented_array_test.cc
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -107,32 +107,84 @@ TEST(SegmentedArrayTest, IteratorRetreat) {
}
TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
- Array<TestData> data;
- ASSERT_TRUE(data.empty());
- auto I0Begin = data.begin(), I0End = data.end();
- // Add enough elements in data to have more than one chunk.
- constexpr auto ChunkX2 = Array<TestData>::ChunkSize * 2;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 10, 0);
+ Array<TestData> Data(A);
+ ASSERT_TRUE(Data.empty());
+ auto I0Begin = Data.begin(), I0End = Data.end();
+ // Add enough elements in Data to have more than one chunk.
+ constexpr auto Chunk = Array<TestData>::ChunkSize;
+ constexpr auto ChunkX2 = Chunk * 2;
for (auto i = ChunkX2; i > 0u; --i) {
- data.Append({static_cast<s64>(i), static_cast<s64>(i)});
+ Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
}
- ASSERT_EQ(data.size(), ChunkX2);
+ ASSERT_EQ(Data.size(), ChunkX2);
+ {
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.First, 1);
+ ASSERT_EQ(Back.Second, 1);
+ }
+
// Trim one chunk's elements worth.
- data.trim(ChunkX2 / 2);
- ASSERT_EQ(data.size(), ChunkX2 / 2);
+ Data.trim(Chunk);
+ ASSERT_EQ(Data.size(), Chunk);
+
+ // Check that we are still able to access 'back' properly.
+ {
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.First, static_cast<s64>(Chunk + 1));
+ ASSERT_EQ(Back.Second, static_cast<s64>(Chunk + 1));
+ }
+
// Then trim until it's empty.
- data.trim(ChunkX2 / 2);
- ASSERT_TRUE(data.empty());
+ Data.trim(Chunk);
+ ASSERT_TRUE(Data.empty());
// Here our iterators should be the same.
- auto I1Begin = data.begin(), I1End = data.end();
+ auto I1Begin = Data.begin(), I1End = Data.end();
EXPECT_EQ(I0Begin, I1Begin);
EXPECT_EQ(I0End, I1End);
// Then we ensure that adding elements back works just fine.
for (auto i = ChunkX2; i > 0u; --i) {
- data.Append({static_cast<s64>(i), static_cast<s64>(i)});
+ Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+ }
+ EXPECT_EQ(Data.size(), ChunkX2);
+}
+
+struct ShadowStackEntry {
+ uint64_t EntryTSC = 0;
+ uint64_t *NodePtr = nullptr;
+ ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {}
+};
+
+TEST(SegmentedArrayTest, SimulateStackBehaviour) {
+ using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
+ AllocatorType A(1 << 10, 0);
+ Array<ShadowStackEntry> Data(A);
+ static uint64_t Dummy = 0;
+ constexpr uint64_t Max = 9;
+
+ for (uint64_t i = 0; i < Max; ++i) {
+ auto P = Data.Append({i, &Dummy});
+ ASSERT_NE(P, nullptr);
+ ASSERT_EQ(P->NodePtr, &Dummy);
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.NodePtr, &Dummy);
+ ASSERT_EQ(Back.EntryTSC, i);
+ }
+
+ // Simulate a stack by checking the data from the end as we're trimming.
+ auto Counter = Max;
+ ASSERT_EQ(Data.size(), size_t(Max));
+ while (!Data.empty()) {
+ const auto &Top = Data.back();
+ uint64_t *TopNode = Top.NodePtr;
+ EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter;
+ Data.trim(1);
+ --Counter;
+ ASSERT_EQ(Data.size(), size_t(Counter));
}
- EXPECT_EQ(data.size(), ChunkX2);
}
} // namespace
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
index d62a6910c..c05c0485c 100644
--- a/lib/xray/xray_allocator.h
+++ b/lib/xray/xray_allocator.h
@@ -18,12 +18,12 @@
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_utils.h"
#include <cstddef>
#include <cstdint>
-#include "sanitizer_common/sanitizer_internal_defs.h"
-
namespace __xray {
/// The Allocator type hands out fixed-sized chunks of memory that are
@@ -40,8 +40,7 @@ template <size_t N> struct Allocator {
// The Allocator returns memory as Block instances.
struct Block {
/// Compute the minimum cache-line size multiple that is >= N.
- static constexpr auto Size =
- kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0));
+ static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
void *Data = nullptr;
};
@@ -61,15 +60,16 @@ private:
// We compute the number of pointers to areas in memory where we consider as
// individual blocks we've allocated. To ensure that instances of the
- // BlockLink object are cache-line sized, we deduct one additional
- // pointers worth representing the pointer to the previous link.
+ // BlockLink object are cache-line sized, we deduct two pointers worth
+ // representing the pointer to the previous link and the backing store for
+ // the whole block.
//
// This structure corresponds to the following layout:
//
- // Blocks [ 0, 1, 2, .., BlockPtrCount - 1]
+ // Blocks [ 0, 1, 2, .., BlockPtrCount - 2]
//
static constexpr auto BlockPtrCount =
- (kCacheLineSize / sizeof(Block *)) - 1;
+ (kCacheLineSize / sizeof(Block *)) - 2;
BlockLink() {
// Zero out Blocks.
@@ -81,6 +81,7 @@ private:
// FIXME: Align this to cache-line address boundaries?
Block Blocks[BlockPtrCount];
BlockLink *Prev = nullptr;
+ void *BackingStore = nullptr;
};
static_assert(sizeof(BlockLink) == kCacheLineSize,
@@ -96,17 +97,26 @@ private:
BlockLink *Tail = &NullLink;
size_t Counter = 0;
- BlockLink *NewChainLink() {
+ BlockLink *NewChainLink(uint64_t Alignment) {
auto NewChain = reinterpret_cast<BlockLink *>(
InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
- BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize));
+ (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment));
size_t Offset = 0;
DCHECK_NE(NewChain, nullptr);
DCHECK_NE(BackingStore, nullptr);
+ NewChain->BackingStore = BackingStore;
+
+ // Here we ensure that the alignment of the pointers we're handing out
+ // adhere to the alignment requirements of the call to Allocate().
for (auto &B : NewChain->Blocks) {
- B.Data = BackingStore + Offset;
- Offset += Block::Size;
+ auto AlignmentAdjustment =
+ nearest_boundary(reinterpret_cast<uintptr_t>(BackingStore + Offset),
+ Alignment) -
+ reinterpret_cast<uintptr_t>(BackingStore + Offset);
+ B.Data = BackingStore + AlignmentAdjustment + Offset;
+ DCHECK_EQ(reinterpret_cast<uintptr_t>(B.Data) % Alignment, 0);
+ Offset += AlignmentAdjustment + Block::Size;
}
NewChain->Prev = Tail;
return NewChain;
@@ -117,7 +127,7 @@ public:
// FIXME: Implement PreAllocate support!
}
- Block Allocate() {
+ Block Allocate(uint64_t Alignment = 8) {
SpinMutexLock Lock(&Mutex);
// Check whether we're over quota.
if (Counter * Block::Size >= MaxMemory)
@@ -128,7 +138,7 @@ public:
Block B{};
BlockLink *Link = Tail;
if (UNLIKELY(Counter == 0 || ChainOffset == 0))
- Tail = Link = NewChainLink();
+ Tail = Link = NewChainLink(Alignment);
B = Link->Blocks[ChainOffset];
++Counter;
@@ -140,7 +150,7 @@ public:
for (auto *C = Tail; C != &NullLink;) {
// We know that the data block is a large contiguous page, we deallocate
// that at once.
- InternalFree(C->Blocks[0].Data);
+ InternalFree(C->BackingStore);
auto Prev = C->Prev;
InternalFree(C);
C = Prev;
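
The alignment handling added to NewChainLink() above boils down to bumping
each handed-out pointer forward to the next Alignment boundary before
advancing by the fixed Block::Size stride, which is also why the backing store
now reserves (BlockPtrCount + 1) * Block::Size bytes of slack. Below is a
standalone sketch of that arithmetic with illustrative names, not the
allocator's API.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Round Number up to the next multiple of Multiple (mirrors the
// nearest_boundary helper this patch adds to xray_utils.h).
constexpr uintptr_t nearestBoundary(uintptr_t Number, uintptr_t Multiple) {
  return Multiple * ((Number / Multiple) + (Number % Multiple ? 1 : 0));
}

int main() {
  constexpr uintptr_t Alignment = 16;
  constexpr size_t BlockSize = 24; // deliberately not a multiple of Alignment
  alignas(64) char BackingStore[256] = {};

  uintptr_t Offset = 0;
  for (int i = 0; i < 4; ++i) {
    auto Raw = reinterpret_cast<uintptr_t>(BackingStore + Offset);
    auto AlignmentAdjustment = nearestBoundary(Raw, Alignment) - Raw;
    char *Data = BackingStore + AlignmentAdjustment + Offset;
    // Every block handed out satisfies the requested alignment...
    assert(reinterpret_cast<uintptr_t>(Data) % Alignment == 0);
    // ...at the cost of some padding, which the extra block of slack absorbs.
    Offset += AlignmentAdjustment + BlockSize;
  }
  assert(Offset <= sizeof(BackingStore));
  return 0;
}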
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index 4bff5daff..5ad822795 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -17,8 +17,8 @@
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
+#include <memory> // For placement new.
#include <utility>
-#include <memory> // For placement new.
namespace __xray {
@@ -101,6 +101,8 @@ public:
NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
};
+ static_assert(sizeof(NodeIdPair) == 16, "Wrong size for NodeIDPair.");
+
using NodeIdPairArray = Array<NodeIdPair>;
using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
@@ -128,15 +130,12 @@ public:
private:
struct ShadowStackEntry {
- int32_t FId; // We're copying the function ID into the stack to avoid having
- // to reach into the node just to get the function ID.
uint64_t EntryTSC;
Node *NodePtr;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- ShadowStackEntry(int32_t F, uint64_t T, Node *N)
- : FId(F), EntryTSC(T), NodePtr(N) {}
+ ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
};
using NodeArray = Array<Node>;
@@ -219,30 +218,30 @@ public:
// TODO: Support configuration of options through the arguments.
static Allocators InitAllocators() {
+ return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
+ }
+
+ static Allocators InitAllocatorsCustom(uptr Max) {
Allocators A;
auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
- new (NodeAllocator) Allocators::NodeAllocatorType(
- profilingFlags()->per_thread_allocator_max, 0);
+ new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0);
A.NodeAllocator = NodeAllocator;
auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
InternalAlloc(sizeof(Allocators::RootAllocatorType)));
- new (RootAllocator) Allocators::RootAllocatorType(
- profilingFlags()->per_thread_allocator_max, 0);
+ new (RootAllocator) Allocators::RootAllocatorType(Max, 0);
A.RootAllocator = RootAllocator;
auto ShadowStackAllocator =
reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
- new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(
- profilingFlags()->per_thread_allocator_max, 0);
+ new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0);
A.ShadowStackAllocator = ShadowStackAllocator;
auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
InternalAlloc(sizeof(NodeIdPairAllocatorType)));
- new (NodeIdPairAllocator)
- NodeIdPairAllocatorType(profilingFlags()->per_thread_allocator_max, 0);
+ new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0);
A.NodeIdPairAllocator = NodeIdPairAllocator;
return A;
}
@@ -253,20 +252,14 @@ private:
ShadowStackArray ShadowStack;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
- const Allocators &GetGlobalAllocators() {
- static const Allocators A = [] { return InitAllocators(); }();
- return A;
- }
-
public:
explicit FunctionCallTrie(const Allocators &A)
: Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
ShadowStack(*A.ShadowStackAllocator),
NodeIdPairAllocator(A.NodeIdPairAllocator) {}
- FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {}
-
- void enterFunction(int32_t FId, uint64_t TSC) {
+ void enterFunction(const int32_t FId, uint64_t TSC) {
+ DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
@@ -275,12 +268,13 @@ public:
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
- ShadowStack.AppendEmplace(FId, TSC, NewRoot);
+ ShadowStack.AppendEmplace(TSC, NewRoot);
return;
}
auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
+ DCHECK_NE(TopNode, nullptr);
// If we've seen this callee before, then we just access that node and place
// that on the top of the stack.
@@ -288,7 +282,7 @@ public:
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
- ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr);
+ ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
return;
}
@@ -297,8 +291,10 @@ public:
Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
if (UNLIKELY(NewNode == nullptr))
return;
+ DCHECK_NE(NewNode, nullptr);
TopNode->Callees.AppendEmplace(NewNode, FId);
- ShadowStack.AppendEmplace(FId, TSC, NewNode);
+ ShadowStack.AppendEmplace(TSC, NewNode);
+ DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
return;
}
@@ -307,14 +303,11 @@ public:
// entered this function before. We do as little processing here as we can,
// since most of the hard work would have already been done at function
// entry.
- if (UNLIKELY(ShadowStack.empty()))
- return;
-
uint64_t CumulativeTreeTime = 0;
while (!ShadowStack.empty()) {
- auto &Top = ShadowStack.back();
+ const auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
- auto TopFId = TopNode->FId;
+ DCHECK_NE(TopNode, nullptr);
auto LocalTime = TSC - Top.EntryTSC;
TopNode->CallCount++;
TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
@@ -322,7 +315,7 @@ public:
ShadowStack.trim(1);
// TODO: Update the histogram for the node.
- if (TopFId == FId)
+ if (TopNode->FId == FId)
break;
}
}
@@ -344,28 +337,34 @@ public:
// This function must *not* be called with a non-empty FunctionCallTrie |O|.
void deepCopyInto(FunctionCallTrie &O) const {
DCHECK(O.getRoots().empty());
+
+ // We then push the root into a stack, to use as the parent marker for new
+ // nodes we push in as we're traversing depth-first down the call tree.
+ struct NodeAndParent {
+ FunctionCallTrie::Node *Node;
+ FunctionCallTrie::Node *NewNode;
+ };
+ using Stack = Array<NodeAndParent>;
+
+ typename Stack::AllocatorType StackAllocator(
+ profilingFlags()->stack_allocator_max, 0);
+ Stack DFSStack(StackAllocator);
+
for (const auto Root : getRoots()) {
// Add a node in O for this root.
auto NewRoot = O.Nodes.AppendEmplace(
nullptr, *O.NodeIdPairAllocator, Root->CallCount,
Root->CumulativeLocalTime, Root->FId);
- O.Roots.Append(NewRoot);
- // We then push the root into a stack, to use as the parent marker for new
- // nodes we push in as we're traversing depth-first down the call tree.
- struct NodeAndParent {
- FunctionCallTrie::Node *Node;
- FunctionCallTrie::Node *NewNode;
- };
- using Stack = Array<NodeAndParent>;
+ // Because we cannot allocate more memory we should bail out right away.
+ if (UNLIKELY(NewRoot == nullptr))
+ return;
- typename Stack::AllocatorType StackAllocator(
- profilingFlags()->stack_allocator_max, 0);
- Stack DFSStack(StackAllocator);
+ O.Roots.Append(NewRoot);
// TODO: Figure out what to do if we fail to allocate any more stack
// space. Maybe warn or report once?
- DFSStack.Append(NodeAndParent{Root, NewRoot});
+ DFSStack.AppendEmplace(Root, NewRoot);
while (!DFSStack.empty()) {
NodeAndParent NP = DFSStack.back();
DCHECK_NE(NP.Node, nullptr);
@@ -375,9 +374,10 @@ public:
auto NewNode = O.Nodes.AppendEmplace(
NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
Callee.NodePtr->CumulativeLocalTime, Callee.FId);
- DCHECK_NE(NewNode, nullptr);
+ if (UNLIKELY(NewNode == nullptr))
+ return;
NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
- DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode});
+ DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
}
}
}
@@ -408,6 +408,9 @@ public:
if (R == nullptr) {
TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
0, Root->FId);
+ if (UNLIKELY(TargetRoot == nullptr))
+ return;
+
O.Roots.Append(TargetRoot);
} else {
TargetRoot = *R;
@@ -430,10 +433,14 @@ public:
if (TargetCallee == nullptr) {
auto NewTargetNode = O.Nodes.AppendEmplace(
NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+
+ if (UNLIKELY(NewTargetNode == nullptr))
+ return;
+
TargetCallee =
NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
}
- DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr});
+ DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
}
}
}
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
index 9d057863e..5da8073f5 100644
--- a/lib/xray/xray_profile_collector.cc
+++ b/lib/xray/xray_profile_collector.cc
@@ -16,8 +16,8 @@
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "xray_profiling_flags.h"
-#include <pthread.h>
#include <memory>
+#include <pthread.h>
#include <utility>
namespace __xray {
@@ -55,7 +55,8 @@ void post(const FunctionCallTrie &T, tid_t TId) {
GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
new (GlobalAllocators) FunctionCallTrie::Allocators();
- *GlobalAllocators = FunctionCallTrie::InitAllocators();
+ *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+ profilingFlags()->global_allocator_max);
});
DCHECK_NE(GlobalAllocators, nullptr);
@@ -83,12 +84,11 @@ void post(const FunctionCallTrie &T, tid_t TId) {
// and is decoupled from the lifetime management required by the managed
// allocator we have in XRay.
//
- Item->Trie = reinterpret_cast<FunctionCallTrie *>(
- InternalAlloc(sizeof(FunctionCallTrie)));
+ Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+ sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
DCHECK_NE(Item->Trie, nullptr);
new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
}
- DCHECK_NE(Item, nullptr);
T.deepCopyInto(*Item->Trie);
}
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index 9a4dd90c6..fa60263c2 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -258,9 +258,9 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
{
SpinMutexLock Lock(&ProfilerOptionsMutex);
FlagParser ConfigParser;
- auto *F = profilingFlags();
- F->setDefaults();
- registerProfilerFlags(&ConfigParser, F);
+ ProfilerFlags Flags;
+ Flags.setDefaults();
+ registerProfilerFlags(&ConfigParser, &Flags);
ConfigParser.ParseString(profilingCompilerDefinedFlags());
const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
if (Env == nullptr)
@@ -271,6 +271,7 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
ConfigParser.ParseString(static_cast<const char *>(Options));
if (Verbosity())
ReportUnrecognizedFlags();
+ *profilingFlags() = Flags;
}
// We need to reset the profile data collection implementation now.
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
index 3205a9697..ddfdfb3c7 100644
--- a/lib/xray/xray_segmented_array.h
+++ b/lib/xray/xray_segmented_array.h
@@ -18,21 +18,13 @@
#include "sanitizer_common/sanitizer_allocator.h"
#include "xray_allocator.h"
+#include "xray_utils.h"
+#include <cassert>
#include <type_traits>
#include <utility>
namespace __xray {
-namespace {
-
-constexpr size_t gcd(size_t a, size_t b) {
- return (b == 0) ? a : gcd(b, a % b);
-}
-
-constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
-
-} // namespace
-
/// The Array type provides an interface similar to std::vector<...> but does
/// not shrink in size. Once constructed, elements can be appended but cannot be
/// removed. The implementation is heavily dependent on the contract provided by
@@ -45,10 +37,12 @@ constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
/// size, to allow us to maximise the number of T objects we can place in
/// cache-line multiple sized blocks. To get back the number of T's, we divide
/// this least common multiple by the size of T.
-template <class T, size_t N = lcm(sizeof(T), kCacheLineSize) / sizeof(T)>
+template <class T,
+ size_t N = lcm(next_pow2(sizeof(T)), kCacheLineSize) / sizeof(T)>
struct Array {
static constexpr size_t ChunkSize = N;
- static constexpr size_t AllocatorChunkSize = sizeof(T) * ChunkSize;
+ static constexpr size_t ElementStorageSize = next_pow2(sizeof(T));
+ static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize;
using AllocatorType = Allocator<AllocatorChunkSize>;
static_assert(std::is_trivially_destructible<T>::value,
"T must be trivially destructible.");
@@ -60,8 +54,8 @@ private:
struct Chunk {
typename AllocatorType::Block Block;
static constexpr size_t Size = N;
- Chunk *Prev = nullptr;
- Chunk *Next = nullptr;
+ Chunk *Prev = this;
+ Chunk *Next = this;
};
static Chunk SentinelChunk;
@@ -70,7 +64,6 @@ private:
Chunk *Head = &SentinelChunk;
Chunk *Tail = &SentinelChunk;
size_t Size = 0;
- size_t FreeElements = 0;
// Here we keep track of chunks in the freelist, to allow us to re-use chunks
// when elements are trimmed off the end.
@@ -85,17 +78,22 @@ private:
auto *FreeChunk = Freelist;
Freelist = FreeChunk->Next;
FreeChunk->Next = &SentinelChunk;
+ Freelist->Prev = &SentinelChunk;
return FreeChunk;
}
- auto Block = Alloc->Allocate();
+ auto Block = Alloc->Allocate(ElementStorageSize);
if (Block.Data == nullptr)
return nullptr;
+
// TODO: Maybe use a separate managed allocator for Chunk instances?
- auto C = reinterpret_cast<Chunk *>(InternalAlloc(sizeof(Chunk)));
+ auto C = reinterpret_cast<Chunk *>(
+ InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize));
if (C == nullptr)
return nullptr;
C->Block = Block;
+ C->Prev = &SentinelChunk;
+ C->Next = &SentinelChunk;
return C;
}
@@ -116,42 +114,52 @@ private:
auto Chunk = NewChunk();
if (Chunk == nullptr)
return nullptr;
- Chunk->Prev = &SentinelChunk;
- Chunk->Next = &SentinelChunk;
- Head = Chunk;
- Tail = Chunk;
- return Chunk;
+ DCHECK_EQ(Chunk->Next, &SentinelChunk);
+ DCHECK_EQ(Chunk->Prev, &SentinelChunk);
+ return Head = Tail = Chunk;
}
Chunk *AppendNewChunk() {
auto Chunk = NewChunk();
if (Chunk == nullptr)
return nullptr;
+ DCHECK_NE(Tail, &SentinelChunk);
+ DCHECK_EQ(Tail->Next, &SentinelChunk);
+ DCHECK_EQ(Chunk->Prev, &SentinelChunk);
+ DCHECK_EQ(Chunk->Next, &SentinelChunk);
Tail->Next = Chunk;
Chunk->Prev = Tail;
- Chunk->Next = &SentinelChunk;
Tail = Chunk;
- return Chunk;
+ return Tail;
}
// This Iterator models a BidirectionalIterator.
template <class U> class Iterator {
- Chunk *C = nullptr;
+ Chunk *C = &SentinelChunk;
size_t Offset = 0;
+ size_t Size = 0;
public:
- Iterator(Chunk *IC, size_t Off) : C(IC), Offset(Off) {}
+ Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {}
+ Iterator(const Iterator &) noexcept = default;
+ Iterator() noexcept = default;
+ Iterator(Iterator &&) noexcept = default;
+ Iterator &operator=(const Iterator &) = default;
+ Iterator &operator=(Iterator &&) = default;
+ ~Iterator() = default;
Iterator &operator++() {
- if (++Offset % N)
+ if (++Offset % N || Offset == Size)
return *this;
- DCHECK_NE(C, &SentinelChunk);
-
// At this point, we know that Offset % N == 0, so we must advance the
// chunk pointer.
DCHECK_EQ(Offset % N, 0);
+ DCHECK_NE(Offset, Size);
+ DCHECK_NE(C, &SentinelChunk);
+ DCHECK_NE(C->Next, &SentinelChunk);
C = C->Next;
+ DCHECK_NE(C, &SentinelChunk);
return *this;
}
@@ -159,10 +167,12 @@ private:
DCHECK_NE(C, &SentinelChunk);
DCHECK_GT(Offset, 0);
- // We check whether the offset was on a boundary before decrement, to see
- // whether we need to retreat to the previous chunk.
- if ((Offset-- % N) == 0)
+ auto PreviousOffset = Offset--;
+ if (PreviousOffset != Size && PreviousOffset % N == 0) {
+ DCHECK_NE(C->Prev, &SentinelChunk);
C = C->Prev;
+ }
+
return *this;
}
@@ -190,12 +200,16 @@ private:
U &operator*() const {
DCHECK_NE(C, &SentinelChunk);
- return reinterpret_cast<U *>(C->Block.Data)[Offset % N];
+ auto RelOff = Offset % N;
+ return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+ (RelOff * ElementStorageSize));
}
U *operator->() const {
DCHECK_NE(C, &SentinelChunk);
- return reinterpret_cast<U *>(C->Block.Data) + (Offset % N);
+ auto RelOff = Offset % N;
+ return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+ (RelOff * ElementStorageSize));
}
};
@@ -232,10 +246,11 @@ public:
if (AppendNewChunk() == nullptr)
return nullptr;
- auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
+ auto Position =
+ reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Block.Data) +
+ (Offset * ElementStorageSize));
*Position = E;
++Size;
- FreeElements -= FreeElements ? 1 : 0;
return Position;
}
@@ -245,16 +260,21 @@ public:
return nullptr;
auto Offset = Size % N;
- if (UNLIKELY(Size != 0 && Offset == 0))
- if (AppendNewChunk() == nullptr)
+ auto *LatestChunk = Tail;
+ if (UNLIKELY(Size != 0 && Offset == 0)) {
+ LatestChunk = AppendNewChunk();
+ if (LatestChunk == nullptr)
return nullptr;
+ }
- auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
+ DCHECK_NE(Tail, &SentinelChunk);
+ auto Position = reinterpret_cast<char *>(LatestChunk->Block.Data) +
+ (Offset * ElementStorageSize);
+ DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
// In-place construct at Position.
- new (Position) T(std::forward<Args>(args)...);
+ new (Position) T{std::forward<Args>(args)...};
++Size;
- FreeElements -= FreeElements ? 1 : 0;
- return Position;
+ return reinterpret_cast<T *>(Position);
}
T &operator[](size_t Offset) const {
@@ -266,20 +286,22 @@ public:
Offset -= N;
DCHECK_NE(C, &SentinelChunk);
}
- auto Position = reinterpret_cast<T *>(C->Block.Data) + Offset;
- return *Position;
+ return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Block.Data) +
+ (Offset * ElementStorageSize));
}
T &front() const {
DCHECK_NE(Head, &SentinelChunk);
DCHECK_NE(Size, 0u);
- return *reinterpret_cast<T *>(Head->Block.Data);
+ return *begin();
}
T &back() const {
DCHECK_NE(Tail, &SentinelChunk);
- auto Offset = (Size - 1) % N;
- return *(reinterpret_cast<T *>(Tail->Block.Data) + Offset);
+ DCHECK_NE(Size, 0u);
+ auto It = end();
+ --It;
+ return *It;
}
template <class Predicate> T *find_element(Predicate P) const {
@@ -298,15 +320,18 @@ public:
/// require allocation of new blocks for new elements added after trimming.
void trim(size_t Elements) {
DCHECK_LE(Elements, Size);
+ DCHECK_GT(Size, 0);
+ auto OldSize = Size;
Size -= Elements;
- FreeElements += Elements;
-
- // Here we need to check whether we've cleared enough elements to warrant
- // putting blocks on to the freelist. We determine whether we need to
- // right-size the internal list, by keeping track of the number of "free"
- // elements still in the array.
- auto ChunksToTrim = FreeElements / N;
- for (size_t i = 0; i < ChunksToTrim; ++i, FreeElements -= N) {
+
+ DCHECK_NE(Head, &SentinelChunk);
+ DCHECK_NE(Tail, &SentinelChunk);
+
+ for (auto ChunksToTrim =
+ (nearest_boundary(OldSize, N) - nearest_boundary(Size, N)) / N;
+ ChunksToTrim > 0; --ChunksToTrim) {
+ DCHECK_NE(Head, &SentinelChunk);
+ DCHECK_NE(Tail, &SentinelChunk);
// Put the tail into the Freelist.
auto *FreeChunk = Tail;
Tail = Tail->Prev;
@@ -314,17 +339,21 @@ public:
Head = Tail;
else
Tail->Next = &SentinelChunk;
+
+ DCHECK_EQ(Tail->Next, &SentinelChunk);
FreeChunk->Next = Freelist;
- FreeChunk->Prev = Freelist->Prev;
+ FreeChunk->Prev = &SentinelChunk;
+ if (Freelist != &SentinelChunk)
+ Freelist->Prev = FreeChunk;
Freelist = FreeChunk;
}
}
// Provide iterators.
- Iterator<T> begin() const { return Iterator<T>(Head, 0); }
- Iterator<T> end() const { return Iterator<T>(Tail, Size); }
- Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0); }
- Iterator<const T> cend() const { return Iterator<const T>(Tail, Size); }
+ Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
+ Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
+ Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
+ Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
};
// We need to have this storage definition out-of-line so that the compiler can
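
The rewritten trim() above derives how many whole chunks move to the freelist
from the difference of the old and new sizes, each rounded up to a chunk
boundary. A quick worked check of that arithmetic, as a sketch with an assumed
chunk size of 4 and a local copy of the nearest_boundary helper the patch adds
to xray_utils.h:

#include <cstddef>

constexpr size_t nearest_boundary(size_t Number, size_t Multiple) {
  return Multiple * ((Number / Multiple) + (Number % Multiple ? 1 : 0));
}

// With N = 4, an array of 9 elements occupies 3 chunks (4 + 4 + 1). Trimming
// 6 elements leaves 3, which fit in one chunk, so exactly 2 tail chunks
// should be pushed onto the freelist.
constexpr size_t N = 4;
constexpr size_t OldSize = 9;
constexpr size_t NewSize = 3; // OldSize - 6
static_assert(
    (nearest_boundary(OldSize, N) - nearest_boundary(NewSize, N)) / N == 2,
    "two tail chunks go to the freelist");

int main() {}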
diff --git a/lib/xray/xray_utils.h b/lib/xray/xray_utils.h
index a06a59491..0a7911f12 100644
--- a/lib/xray/xray_utils.h
+++ b/lib/xray/xray_utils.h
@@ -36,6 +36,24 @@ std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End);
// file.
int getLogFD();
+constexpr size_t gcd(size_t a, size_t b) {
+ return (b == 0) ? a : gcd(b, a % b);
+}
+
+constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
+
+template <class T> constexpr T nearest_boundary(T number, T multiple) {
+ return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+}
+
+constexpr size_t next_pow2_helper(size_t num, size_t acc) {
+ return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1);
+}
+
+constexpr size_t next_pow2(size_t number) {
+ return next_pow2_helper(number, 1);
+}
+
} // namespace __xray
#endif // XRAY_UTILS_H
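
The constexpr helpers added above drive the new Array sizing: the per-element
slot (ElementStorageSize) becomes next_pow2(sizeof(T)) and ChunkSize becomes
lcm(next_pow2(sizeof(T)), kCacheLineSize) / sizeof(T). The following sketch
exercises them assuming a 64-byte cache line; the helpers are copied so the
block compiles on its own, and the 24-byte element type is hypothetical.

#include <cstddef>

constexpr size_t gcd(size_t a, size_t b) { return (b == 0) ? a : gcd(b, a % b); }
constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
constexpr size_t next_pow2_helper(size_t num, size_t acc) {
  return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1);
}
constexpr size_t next_pow2(size_t number) { return next_pow2_helper(number, 1); }

constexpr size_t kAssumedCacheLineSize = 64; // assumption; common on x86-64

// A hypothetical 24-byte element type.
struct Sample { char Bytes[24]; };

static_assert(next_pow2(sizeof(Sample)) == 32,
              "each element is stored in a 32-byte slot");
static_assert(lcm(next_pow2(sizeof(Sample)), kAssumedCacheLineSize) /
                      sizeof(Sample) == 2,
              "ChunkSize: two elements per chunk for this type");
static_assert(next_pow2(sizeof(Sample)) *
                      (lcm(next_pow2(sizeof(Sample)), kAssumedCacheLineSize) /
                       sizeof(Sample)) == 64,
              "AllocatorChunkSize: one cache line per chunk");

int main() {}

Rounding each slot up to a power of two is what makes the Alignment argument
that Array passes to Allocate(ElementStorageSize) achievable with the
nearest_boundary adjustment shown in the xray_allocator.h hunk earlier.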