author     Dean Michael Berris <dberris@google.com>    2018-07-10 08:25:44 +0000
committer  Dean Michael Berris <dberris@google.com>    2018-07-10 08:25:44 +0000
commit     652950651ff6b1748fa97d152e670c2f92902e50 (patch)
tree       38a172c80f4bb603a432b33d83af8bbefb2a18df /lib
parent     a164037c709a816851d82be4643cacfafc4672f9 (diff)
[XRay][compiler-rt] xray::Array Freelist and Iterator Updates
Summary:
We found a bug while working on a benchmark for the profiling mode; it
manifests as a segmentation fault in the profiling handler's implementation.
This change adds unit tests which replicate the issues in isolation.

We have tracked this down to a bug in the implementation of the Freelist in
the `xray::Array` type. When we trim the array by a number of elements, we had
been incorrectly assigning pointers for the links in the freelist of chunk
nodes. We have taken the chance to add more debug-only assertions to this code
path, which let us verify these assumptions in debug builds.

In the process, we also took the opportunity to use iterators to implement
both `front()` and `back()`, which exposed a bug in the iterator decrement
operation. In particular, when we decrement past a chunk-size boundary, we end
up moving too far back and reaching the `SentinelChunk` prematurely.

This change unblocks contributing the non-crashing version of the benchmarks
to the test-suite as well.

Reviewers: kpw

Subscribers: mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D48653

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@336644 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--  lib/xray/tests/CMakeLists.txt                    8
-rw-r--r--  lib/xray/tests/unit/function_call_trie_test.cc  42
-rw-r--r--  lib/xray/tests/unit/segmented_array_test.cc     80
-rw-r--r--  lib/xray/xray_allocator.h                       40
-rw-r--r--  lib/xray/xray_function_call_trie.h              97
-rw-r--r--  lib/xray/xray_profile_collector.cc              10
-rw-r--r--  lib/xray/xray_profiling.cc                       7
-rw-r--r--  lib/xray/xray_segmented_array.h                149
-rw-r--r--  lib/xray/xray_utils.h                           18
9 files changed, 301 insertions, 150 deletions
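
The iterator decrement bug described in the summary is easiest to see in a
minimal standalone model of the chunk/offset arithmetic. This is illustrative
C++, not the xray::Array implementation; every name below is hypothetical.
With end() represented as the pair (Tail, Size), the old retreat logic stepped
to the previous chunk whenever the pre-decrement offset was a chunk multiple,
which walks one chunk too far when the array is exactly full.

#include <cassert>
#include <cstddef>

namespace {

constexpr size_t N = 4; // model chunk size

struct Chunk {
  Chunk *Prev = nullptr;
  Chunk *Next = nullptr;
};

struct ModelIterator {
  Chunk *C;
  size_t Offset;
  size_t Size; // total element count, carried so end() can be recognised
};

// Old behaviour: always retreat to the previous chunk on a boundary. For a
// perfectly full array, decrementing end() leaves the iterator one chunk
// behind the element it should reference.
void retreatOld(ModelIterator &I) {
  if ((I.Offset-- % N) == 0)
    I.C = I.C->Prev;
}

// New behaviour: decrementing from end() stays on the tail chunk, since the
// last element already lives there; only interior boundaries retreat.
void retreatNew(ModelIterator &I) {
  auto PreviousOffset = I.Offset--;
  if (PreviousOffset != I.Size && PreviousOffset % N == 0)
    I.C = I.C->Prev;
}

} // namespace

int main() {
  Chunk First, Second;
  First.Next = &Second;
  Second.Prev = &First;
  const size_t Size = 2 * N; // two completely full chunks

  ModelIterator Old{&Second, Size, Size};
  retreatOld(Old);
  assert(Old.C == &First); // too far back: element Size - 1 lives in Second

  ModelIterator New{&Second, Size, Size};
  retreatNew(New);
  assert(New.C == &Second && New.Offset == Size - 1); // stays on the tail
  return 0;
}

The new back() in the patch performs exactly this decrement from end(), which
is why the old logic could reach SentinelChunk prematurely.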
diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt
index eb17b095d..ea5f2da4e 100644
--- a/lib/xray/tests/CMakeLists.txt
+++ b/lib/xray/tests/CMakeLists.txt
@@ -68,7 +68,11 @@ function(get_xray_lib_for_arch arch lib)
endfunction()
set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
-set(XRAY_UNITTEST_LINK_FLAGS ${CMAKE_THREAD_LIBS_INIT})
+set(XRAY_UNITTEST_LINK_FLAGS
+ ${CMAKE_THREAD_LIBS_INIT}
+ -fxray-instrument
+ -lstdc++ # TODO: Detect the correct standard library used!
+ )
if (NOT APPLE)
append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS)
append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS)
@@ -94,7 +98,7 @@ macro(add_xray_unittest testname)
RUNTIME "${XRAY_RUNTIME_LIBS}"
DEPS gtest xray
CFLAGS ${XRAY_UNITTEST_CFLAGS}
- LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS} -lstdc++)
+ LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS})
set_target_properties(XRayUnitTests
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endforeach()
diff --git a/lib/xray/tests/unit/function_call_trie_test.cc b/lib/xray/tests/unit/function_call_trie_test.cc
index d4a79d902..6ad51aa14 100644
--- a/lib/xray/tests/unit/function_call_trie_test.cc
+++ b/lib/xray/tests/unit/function_call_trie_test.cc
@@ -18,12 +18,6 @@ namespace __xray {
namespace {
-TEST(FunctionCallTrieTest, Construction) {
- // We want to make sure that we can create one of these without the set of
- // allocators we need. This will by default use the global allocators.
- FunctionCallTrie Trie;
-}
-
TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
// FIXME: Support passing in configuration for allocators in the allocator
// constructors.
@@ -61,6 +55,17 @@ TEST(FunctionCallTrieTest, MissingFunctionEntry) {
ASSERT_TRUE(R.empty());
}
+TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.enterFunction(2, 1);
+ Trie.enterFunction(3, 3);
+ Trie.exitFunction(1, 5);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_TRUE(R.empty());
+}
+
TEST(FunctionCallTrieTest, MissingFunctionExit) {
auto A = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(A);
@@ -153,6 +158,31 @@ TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
EXPECT_EQ(F1.CumulativeLocalTime, 100);
}
+TEST(FunctionCallTrieTest, DeepCallStack) {
+ // Simulate a relatively deep call stack (32 levels) and ensure that we can
+ // properly pop all the way up the stack.
+ profilingFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ for (int i = 0; i < 32; ++i)
+ Trie.enterFunction(i + 1, i);
+ Trie.exitFunction(1, 33);
+
+ // Here, validate that we have a 32-level deep function call path from the
+ // root (1) down to the leaf (33).
+ const auto &R = Trie.getRoots();
+ ASSERT_EQ(R.size(), 1u);
+ auto F = R[0];
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(F->FId, i + 1);
+ EXPECT_EQ(F->CallCount, 1);
+ if (F->Callees.empty() && i != 31)
+ FAIL() << "Empty callees for FId " << F->FId;
+ if (i != 31)
+ F = F->Callees[0].NodePtr;
+ }
+}
+
// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
// guaranteed to be synchronised.
TEST(FunctionCallTrieTest, DeepCopy) {
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
index dd07e7f13..539162d7d 100644
--- a/lib/xray/tests/unit/segmented_array_test.cc
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -107,32 +107,84 @@ TEST(SegmentedArrayTest, IteratorRetreat) {
}
TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
- Array<TestData> data;
- ASSERT_TRUE(data.empty());
- auto I0Begin = data.begin(), I0End = data.end();
- // Add enough elements in data to have more than one chunk.
- constexpr auto ChunkX2 = Array<TestData>::ChunkSize * 2;
+ using AllocatorType = typename Array<TestData>::AllocatorType;
+ AllocatorType A(1 << 10, 0);
+ Array<TestData> Data(A);
+ ASSERT_TRUE(Data.empty());
+ auto I0Begin = Data.begin(), I0End = Data.end();
+ // Add enough elements in Data to have more than one chunk.
+ constexpr auto Chunk = Array<TestData>::ChunkSize;
+ constexpr auto ChunkX2 = Chunk * 2;
for (auto i = ChunkX2; i > 0u; --i) {
- data.Append({static_cast<s64>(i), static_cast<s64>(i)});
+ Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
}
- ASSERT_EQ(data.size(), ChunkX2);
+ ASSERT_EQ(Data.size(), ChunkX2);
+ {
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.First, 1);
+ ASSERT_EQ(Back.Second, 1);
+ }
+
// Trim one chunk's elements worth.
- data.trim(ChunkX2 / 2);
- ASSERT_EQ(data.size(), ChunkX2 / 2);
+ Data.trim(Chunk);
+ ASSERT_EQ(Data.size(), Chunk);
+
+ // Check that we are still able to access 'back' properly.
+ {
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.First, static_cast<s64>(Chunk + 1));
+ ASSERT_EQ(Back.Second, static_cast<s64>(Chunk + 1));
+ }
+
// Then trim until it's empty.
- data.trim(ChunkX2 / 2);
- ASSERT_TRUE(data.empty());
+ Data.trim(Chunk);
+ ASSERT_TRUE(Data.empty());
// Here our iterators should be the same.
- auto I1Begin = data.begin(), I1End = data.end();
+ auto I1Begin = Data.begin(), I1End = Data.end();
EXPECT_EQ(I0Begin, I1Begin);
EXPECT_EQ(I0End, I1End);
// Then we ensure that adding elements back works just fine.
for (auto i = ChunkX2; i > 0u; --i) {
- data.Append({static_cast<s64>(i), static_cast<s64>(i)});
+ Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
+ }
+ EXPECT_EQ(Data.size(), ChunkX2);
+}
+
+struct ShadowStackEntry {
+ uint64_t EntryTSC = 0;
+ uint64_t *NodePtr = nullptr;
+ ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {}
+};
+
+TEST(SegmentedArrayTest, SimulateStackBehaviour) {
+ using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
+ AllocatorType A(1 << 10, 0);
+ Array<ShadowStackEntry> Data(A);
+ static uint64_t Dummy = 0;
+ constexpr uint64_t Max = 9;
+
+ for (uint64_t i = 0; i < Max; ++i) {
+ auto P = Data.Append({i, &Dummy});
+ ASSERT_NE(P, nullptr);
+ ASSERT_EQ(P->NodePtr, &Dummy);
+ auto &Back = Data.back();
+ ASSERT_EQ(Back.NodePtr, &Dummy);
+ ASSERT_EQ(Back.EntryTSC, i);
+ }
+
+ // Simulate a stack by checking the data from the end as we're trimming.
+ auto Counter = Max;
+ ASSERT_EQ(Data.size(), size_t(Max));
+ while (!Data.empty()) {
+ const auto &Top = Data.back();
+ uint64_t *TopNode = Top.NodePtr;
+ EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter;
+ Data.trim(1);
+ --Counter;
+ ASSERT_EQ(Data.size(), size_t(Counter));
}
- EXPECT_EQ(data.size(), ChunkX2);
}
} // namespace
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
index d62a6910c..c05c0485c 100644
--- a/lib/xray/xray_allocator.h
+++ b/lib/xray/xray_allocator.h
@@ -18,12 +18,12 @@
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_utils.h"
#include <cstddef>
#include <cstdint>
-#include "sanitizer_common/sanitizer_internal_defs.h"
-
namespace __xray {
/// The Allocator type hands out fixed-sized chunks of memory that are
@@ -40,8 +40,7 @@ template <size_t N> struct Allocator {
// The Allocator returns memory as Block instances.
struct Block {
/// Compute the minimum cache-line size multiple that is >= N.
- static constexpr auto Size =
- kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0));
+ static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
void *Data = nullptr;
};
@@ -61,15 +60,16 @@ private:
// We compute the number of pointers to areas in memory where we consider as
// individual blocks we've allocated. To ensure that instances of the
- // BlockLink object are cache-line sized, we deduct one additional
- // pointers worth representing the pointer to the previous link.
+ // BlockLink object are cache-line sized, we deduct two pointers worth
+ // representing the pointer to the previous link and the backing store for
+ // the whole block.
//
// This structure corresponds to the following layout:
//
- // Blocks [ 0, 1, 2, .., BlockPtrCount - 1]
+ // Blocks [ 0, 1, 2, .., BlockPtrCount - 2]
//
static constexpr auto BlockPtrCount =
- (kCacheLineSize / sizeof(Block *)) - 1;
+ (kCacheLineSize / sizeof(Block *)) - 2;
BlockLink() {
// Zero out Blocks.
@@ -81,6 +81,7 @@ private:
// FIXME: Align this to cache-line address boundaries?
Block Blocks[BlockPtrCount];
BlockLink *Prev = nullptr;
+ void *BackingStore = nullptr;
};
static_assert(sizeof(BlockLink) == kCacheLineSize,
@@ -96,17 +97,26 @@ private:
BlockLink *Tail = &NullLink;
size_t Counter = 0;
- BlockLink *NewChainLink() {
+ BlockLink *NewChainLink(uint64_t Alignment) {
auto NewChain = reinterpret_cast<BlockLink *>(
InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
- BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize));
+ (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment));
size_t Offset = 0;
DCHECK_NE(NewChain, nullptr);
DCHECK_NE(BackingStore, nullptr);
+ NewChain->BackingStore = BackingStore;
+
+ // Here we ensure that the alignment of the pointers we're handing out
+ // adhere to the alignment requirements of the call to Allocate().
for (auto &B : NewChain->Blocks) {
- B.Data = BackingStore + Offset;
- Offset += Block::Size;
+ auto AlignmentAdjustment =
+ nearest_boundary(reinterpret_cast<uintptr_t>(BackingStore + Offset),
+ Alignment) -
+ reinterpret_cast<uintptr_t>(BackingStore + Offset);
+ B.Data = BackingStore + AlignmentAdjustment + Offset;
+ DCHECK_EQ(reinterpret_cast<uintptr_t>(B.Data) % Alignment, 0);
+ Offset += AlignmentAdjustment + Block::Size;
}
NewChain->Prev = Tail;
return NewChain;
@@ -117,7 +127,7 @@ public:
// FIXME: Implement PreAllocate support!
}
- Block Allocate() {
+ Block Allocate(uint64_t Alignment = 8) {
SpinMutexLock Lock(&Mutex);
// Check whether we're over quota.
if (Counter * Block::Size >= MaxMemory)
@@ -128,7 +138,7 @@ public:
Block B{};
BlockLink *Link = Tail;
if (UNLIKELY(Counter == 0 || ChainOffset == 0))
- Tail = Link = NewChainLink();
+ Tail = Link = NewChainLink(Alignment);
B = Link->Blocks[ChainOffset];
++Counter;
@@ -140,7 +150,7 @@ public:
for (auto *C = Tail; C != &NullLink;) {
// We know that the data block is a large contiguous page, we deallocate
// that at once.
- InternalFree(C->Blocks[0].Data);
+ InternalFree(C->BackingStore);
auto Prev = C->Prev;
InternalFree(C);
C = Prev;
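
The alignment handling added to NewChainLink() above boils down to bumping
each handed-out pointer forward to the next Alignment boundary before
advancing by the fixed Block::Size stride, which is also why the backing store
now reserves (BlockPtrCount + 1) * Block::Size bytes of slack. Below is a
standalone sketch of that arithmetic with illustrative names, not the
allocator's API.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Round Number up to the next multiple of Multiple (mirrors the
// nearest_boundary helper this patch adds to xray_utils.h).
constexpr uintptr_t nearestBoundary(uintptr_t Number, uintptr_t Multiple) {
  return Multiple * ((Number / Multiple) + (Number % Multiple ? 1 : 0));
}

int main() {
  constexpr uintptr_t Alignment = 16;
  constexpr size_t BlockSize = 24; // deliberately not a multiple of Alignment
  alignas(64) char BackingStore[256] = {};

  uintptr_t Offset = 0;
  for (int i = 0; i < 4; ++i) {
    auto Raw = reinterpret_cast<uintptr_t>(BackingStore + Offset);
    auto AlignmentAdjustment = nearestBoundary(Raw, Alignment) - Raw;
    char *Data = BackingStore + AlignmentAdjustment + Offset;
    // Every block handed out satisfies the requested alignment...
    assert(reinterpret_cast<uintptr_t>(Data) % Alignment == 0);
    // ...at the cost of some padding, which the extra block of slack absorbs.
    Offset += AlignmentAdjustment + BlockSize;
  }
  assert(Offset <= sizeof(BackingStore));
  return 0;
}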
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index 4bff5daff..5ad822795 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -17,8 +17,8 @@
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
+#include <memory> // For placement new.
#include <utility>
-#include <memory> // For placement new.
namespace __xray {
@@ -101,6 +101,8 @@ public:
NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
};
+ static_assert(sizeof(NodeIdPair) == 16, "Wrong size for NodeIDPair.");
+
using NodeIdPairArray = Array<NodeIdPair>;
using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
@@ -128,15 +130,12 @@ public:
private:
struct ShadowStackEntry {
- int32_t FId; // We're copying the function ID into the stack to avoid having
- // to reach into the node just to get the function ID.
uint64_t EntryTSC;
Node *NodePtr;
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- ShadowStackEntry(int32_t F, uint64_t T, Node *N)
- : FId(F), EntryTSC(T), NodePtr(N) {}
+ ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
};
using NodeArray = Array<Node>;
@@ -219,30 +218,30 @@ public:
// TODO: Support configuration of options through the arguments.
static Allocators InitAllocators() {
+ return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
+ }
+
+ static Allocators InitAllocatorsCustom(uptr Max) {
Allocators A;
auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
- new (NodeAllocator) Allocators::NodeAllocatorType(
- profilingFlags()->per_thread_allocator_max, 0);
+ new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0);
A.NodeAllocator = NodeAllocator;
auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
InternalAlloc(sizeof(Allocators::RootAllocatorType)));
- new (RootAllocator) Allocators::RootAllocatorType(
- profilingFlags()->per_thread_allocator_max, 0);
+ new (RootAllocator) Allocators::RootAllocatorType(Max, 0);
A.RootAllocator = RootAllocator;
auto ShadowStackAllocator =
reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
- new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(
- profilingFlags()->per_thread_allocator_max, 0);
+ new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0);
A.ShadowStackAllocator = ShadowStackAllocator;
auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
InternalAlloc(sizeof(NodeIdPairAllocatorType)));
- new (NodeIdPairAllocator)
- NodeIdPairAllocatorType(profilingFlags()->per_thread_allocator_max, 0);
+ new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0);
A.NodeIdPairAllocator = NodeIdPairAllocator;
return A;
}
@@ -253,20 +252,14 @@ private:
ShadowStackArray ShadowStack;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
- const Allocators &GetGlobalAllocators() {
- static const Allocators A = [] { return InitAllocators(); }();
- return A;
- }
-
public:
explicit FunctionCallTrie(const Allocators &A)
: Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
ShadowStack(*A.ShadowStackAllocator),
NodeIdPairAllocator(A.NodeIdPairAllocator) {}
- FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {}
-
- void enterFunction(int32_t FId, uint64_t TSC) {
+ void enterFunction(const int32_t FId, uint64_t TSC) {
+ DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
@@ -275,12 +268,13 @@ public:
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
- ShadowStack.AppendEmplace(FId, TSC, NewRoot);
+ ShadowStack.AppendEmplace(TSC, NewRoot);
return;
}
auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
+ DCHECK_NE(TopNode, nullptr);
// If we've seen this callee before, then we just access that node and place
// that on the top of the stack.
@@ -288,7 +282,7 @@ public:
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
- ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr);
+ ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
return;
}
@@ -297,8 +291,10 @@ public:
Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
if (UNLIKELY(NewNode == nullptr))
return;
+ DCHECK_NE(NewNode, nullptr);
TopNode->Callees.AppendEmplace(NewNode, FId);
- ShadowStack.AppendEmplace(FId, TSC, NewNode);
+ ShadowStack.AppendEmplace(TSC, NewNode);
+ DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
return;
}
@@ -307,14 +303,11 @@ public:
// entered this function before. We do as little processing here as we can,
// since most of the hard work would have already been done at function
// entry.
- if (UNLIKELY(ShadowStack.empty()))
- return;
-
uint64_t CumulativeTreeTime = 0;
while (!ShadowStack.empty()) {
- auto &Top = ShadowStack.back();
+ const auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
- auto TopFId = TopNode->FId;
+ DCHECK_NE(TopNode, nullptr);
auto LocalTime = TSC - Top.EntryTSC;
TopNode->CallCount++;
TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
@@ -322,7 +315,7 @@ public:
ShadowStack.trim(1);
// TODO: Update the histogram for the node.
- if (TopFId == FId)
+ if (TopNode->FId == FId)
break;
}
}
@@ -344,28 +337,34 @@ public:
// This function must *not* be called with a non-empty FunctionCallTrie |O|.
void deepCopyInto(FunctionCallTrie &O) const {
DCHECK(O.getRoots().empty());
+
+ // We then push the root into a stack, to use as the parent marker for new
+ // nodes we push in as we're traversing depth-first down the call tree.
+ struct NodeAndParent {
+ FunctionCallTrie::Node *Node;
+ FunctionCallTrie::Node *NewNode;
+ };
+ using Stack = Array<NodeAndParent>;
+
+ typename Stack::AllocatorType StackAllocator(
+ profilingFlags()->stack_allocator_max, 0);
+ Stack DFSStack(StackAllocator);
+
for (const auto Root : getRoots()) {
// Add a node in O for this root.
auto NewRoot = O.Nodes.AppendEmplace(
nullptr, *O.NodeIdPairAllocator, Root->CallCount,
Root->CumulativeLocalTime, Root->FId);
- O.Roots.Append(NewRoot);
- // We then push the root into a stack, to use as the parent marker for new
- // nodes we push in as we're traversing depth-first down the call tree.
- struct NodeAndParent {
- FunctionCallTrie::Node *Node;
- FunctionCallTrie::Node *NewNode;
- };
- using Stack = Array<NodeAndParent>;
+ // Because we cannot allocate more memory we should bail out right away.
+ if (UNLIKELY(NewRoot == nullptr))
+ return;
- typename Stack::AllocatorType StackAllocator(
- profilingFlags()->stack_allocator_max, 0);
- Stack DFSStack(StackAllocator);
+ O.Roots.Append(NewRoot);
// TODO: Figure out what to do if we fail to allocate any more stack
// space. Maybe warn or report once?
- DFSStack.Append(NodeAndParent{Root, NewRoot});
+ DFSStack.AppendEmplace(Root, NewRoot);
while (!DFSStack.empty()) {
NodeAndParent NP = DFSStack.back();
DCHECK_NE(NP.Node, nullptr);
@@ -375,9 +374,10 @@ public:
auto NewNode = O.Nodes.AppendEmplace(
NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
Callee.NodePtr->CumulativeLocalTime, Callee.FId);
- DCHECK_NE(NewNode, nullptr);
+ if (UNLIKELY(NewNode == nullptr))
+ return;
NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
- DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode});
+ DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
}
}
}
@@ -408,6 +408,9 @@ public:
if (R == nullptr) {
TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
0, Root->FId);
+ if (UNLIKELY(TargetRoot == nullptr))
+ return;
+
O.Roots.Append(TargetRoot);
} else {
TargetRoot = *R;
@@ -430,10 +433,14 @@ public:
if (TargetCallee == nullptr) {
auto NewTargetNode = O.Nodes.AppendEmplace(
NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+
+ if (UNLIKELY(NewTargetNode == nullptr))
+ return;
+
TargetCallee =
NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
}
- DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr});
+ DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
}
}
}
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
index 9d057863e..5da8073f5 100644
--- a/lib/xray/xray_profile_collector.cc
+++ b/lib/xray/xray_profile_collector.cc
@@ -16,8 +16,8 @@
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "xray_profiling_flags.h"
-#include <pthread.h>
#include <memory>
+#include <pthread.h>
#include <utility>
namespace __xray {
@@ -55,7 +55,8 @@ void post(const FunctionCallTrie &T, tid_t TId) {
GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
new (GlobalAllocators) FunctionCallTrie::Allocators();
- *GlobalAllocators = FunctionCallTrie::InitAllocators();
+ *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+ profilingFlags()->global_allocator_max);
});
DCHECK_NE(GlobalAllocators, nullptr);
@@ -83,12 +84,11 @@ void post(const FunctionCallTrie &T, tid_t TId) {
// and is decoupled from the lifetime management required by the managed
// allocator we have in XRay.
//
- Item->Trie = reinterpret_cast<FunctionCallTrie *>(
- InternalAlloc(sizeof(FunctionCallTrie)));
+ Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+ sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
DCHECK_NE(Item->Trie, nullptr);
new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
}
- DCHECK_NE(Item, nullptr);
T.deepCopyInto(*Item->Trie);
}
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index 9a4dd90c6..fa60263c2 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -258,9 +258,9 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
{
SpinMutexLock Lock(&ProfilerOptionsMutex);
FlagParser ConfigParser;
- auto *F = profilingFlags();
- F->setDefaults();
- registerProfilerFlags(&ConfigParser, F);
+ ProfilerFlags Flags;
+ Flags.setDefaults();
+ registerProfilerFlags(&ConfigParser, &Flags);
ConfigParser.ParseString(profilingCompilerDefinedFlags());
const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
if (Env == nullptr)
@@ -271,6 +271,7 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
ConfigParser.ParseString(static_cast<const char *>(Options));
if (Verbosity())
ReportUnrecognizedFlags();
+ *profilingFlags() = Flags;
}
// We need to reset the profile data collection implementation now.
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
index 3205a9697..ddfdfb3c7 100644
--- a/lib/xray/xray_segmented_array.h
+++ b/lib/xray/xray_segmented_array.h
@@ -18,21 +18,13 @@
#include "sanitizer_common/sanitizer_allocator.h"
#include "xray_allocator.h"
+#include "xray_utils.h"
+#include <cassert>
#include <type_traits>
#include <utility>
namespace __xray {
-namespace {
-
-constexpr size_t gcd(size_t a, size_t b) {
- return (b == 0) ? a : gcd(b, a % b);
-}
-
-constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
-
-} // namespace
-
/// The Array type provides an interface similar to std::vector<...> but does
/// not shrink in size. Once constructed, elements can be appended but cannot be
/// removed. The implementation is heavily dependent on the contract provided by
@@ -45,10 +37,12 @@ constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
/// size, to allow us to maximise the number of T objects we can place in
/// cache-line multiple sized blocks. To get back the number of T's, we divide
/// this least common multiple by the size of T.
-template <class T, size_t N = lcm(sizeof(T), kCacheLineSize) / sizeof(T)>
+template <class T,
+ size_t N = lcm(next_pow2(sizeof(T)), kCacheLineSize) / sizeof(T)>
struct Array {
static constexpr size_t ChunkSize = N;
- static constexpr size_t AllocatorChunkSize = sizeof(T) * ChunkSize;
+ static constexpr size_t ElementStorageSize = next_pow2(sizeof(T));
+ static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize;
using AllocatorType = Allocator<AllocatorChunkSize>;
static_assert(std::is_trivially_destructible<T>::value,
"T must be trivially destructible.");
@@ -60,8 +54,8 @@ private:
struct Chunk {
typename AllocatorType::Block Block;
static constexpr size_t Size = N;
- Chunk *Prev = nullptr;
- Chunk *Next = nullptr;
+ Chunk *Prev = this;
+ Chunk *Next = this;
};
static Chunk SentinelChunk;
@@ -70,7 +64,6 @@ private:
Chunk *Head = &SentinelChunk;
Chunk *Tail = &SentinelChunk;
size_t Size = 0;
- size_t FreeElements = 0;
// Here we keep track of chunks in the freelist, to allow us to re-use chunks
// when elements are trimmed off the end.
@@ -85,17 +78,22 @@ private:
auto *FreeChunk = Freelist;
Freelist = FreeChunk->Next;
FreeChunk->Next = &SentinelChunk;
+ Freelist->Prev = &SentinelChunk;
return FreeChunk;
}
- auto Block = Alloc->Allocate();
+ auto Block = Alloc->Allocate(ElementStorageSize);
if (Block.Data == nullptr)
return nullptr;
+
// TODO: Maybe use a separate managed allocator for Chunk instances?
- auto C = reinterpret_cast<Chunk *>(InternalAlloc(sizeof(Chunk)));
+ auto C = reinterpret_cast<Chunk *>(
+ InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize));
if (C == nullptr)
return nullptr;
C->Block = Block;
+ C->Prev = &SentinelChunk;
+ C->Next = &SentinelChunk;
return C;
}
@@ -116,42 +114,52 @@ private:
auto Chunk = NewChunk();
if (Chunk == nullptr)
return nullptr;
- Chunk->Prev = &SentinelChunk;
- Chunk->Next = &SentinelChunk;
- Head = Chunk;
- Tail = Chunk;
- return Chunk;
+ DCHECK_EQ(Chunk->Next, &SentinelChunk);
+ DCHECK_EQ(Chunk->Prev, &SentinelChunk);
+ return Head = Tail = Chunk;
}
Chunk *AppendNewChunk() {
auto Chunk = NewChunk();
if (Chunk == nullptr)
return nullptr;
+ DCHECK_NE(Tail, &SentinelChunk);
+ DCHECK_EQ(Tail->Next, &SentinelChunk);
+ DCHECK_EQ(Chunk->Prev, &SentinelChunk);
+ DCHECK_EQ(Chunk->Next, &SentinelChunk);
Tail->Next = Chunk;
Chunk->Prev = Tail;
- Chunk->Next = &SentinelChunk;
Tail = Chunk;
- return Chunk;
+ return Tail;
}
// This Iterator models a BidirectionalIterator.
template <class U> class Iterator {
- Chunk *C = nullptr;
+ Chunk *C = &SentinelChunk;
size_t Offset = 0;
+ size_t Size = 0;
public:
- Iterator(Chunk *IC, size_t Off) : C(IC), Offset(Off) {}
+ Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {}
+ Iterator(const Iterator &) noexcept = default;
+ Iterator() noexcept = default;
+ Iterator(Iterator &&) noexcept = default;
+ Iterator &operator=(const Iterator &) = default;
+ Iterator &operator=(Iterator &&) = default;
+ ~Iterator() = default;
Iterator &operator++() {
- if (++Offset % N)
+ if (++Offset % N || Offset == Size)
return *this;
- DCHECK_NE(C, &SentinelChunk);
-
// At this point, we know that Offset % N == 0, so we must advance the
// chunk pointer.
DCHECK_EQ(Offset % N, 0);
+ DCHECK_NE(Offset, Size);
+ DCHECK_NE(C, &SentinelChunk);
+ DCHECK_NE(C->Next, &SentinelChunk);
C = C->Next;
+ DCHECK_NE(C, &SentinelChunk);
return *this;
}
@@ -159,10 +167,12 @@ private:
DCHECK_NE(C, &SentinelChunk);
DCHECK_GT(Offset, 0);
- // We check whether the offset was on a boundary before decrement, to see
- // whether we need to retreat to the previous chunk.
- if ((Offset-- % N) == 0)
+ auto PreviousOffset = Offset--;
+ if (PreviousOffset != Size && PreviousOffset % N == 0) {
+ DCHECK_NE(C->Prev, &SentinelChunk);
C = C->Prev;
+ }
+
return *this;
}
@@ -190,12 +200,16 @@ private:
U &operator*() const {
DCHECK_NE(C, &SentinelChunk);
- return reinterpret_cast<U *>(C->Block.Data)[Offset % N];
+ auto RelOff = Offset % N;
+ return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+ (RelOff * ElementStorageSize));
}
U *operator->() const {
DCHECK_NE(C, &SentinelChunk);
- return reinterpret_cast<U *>(C->Block.Data) + (Offset % N);
+ auto RelOff = Offset % N;
+ return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+ (RelOff * ElementStorageSize));
}
};
@@ -232,10 +246,11 @@ public:
if (AppendNewChunk() == nullptr)
return nullptr;
- auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
+ auto Position =
+ reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Block.Data) +
+ (Offset * ElementStorageSize));
*Position = E;
++Size;
- FreeElements -= FreeElements ? 1 : 0;
return Position;
}
@@ -245,16 +260,21 @@ public:
return nullptr;
auto Offset = Size % N;
- if (UNLIKELY(Size != 0 && Offset == 0))
- if (AppendNewChunk() == nullptr)
+ auto *LatestChunk = Tail;
+ if (UNLIKELY(Size != 0 && Offset == 0)) {
+ LatestChunk = AppendNewChunk();
+ if (LatestChunk == nullptr)
return nullptr;
+ }
- auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
+ DCHECK_NE(Tail, &SentinelChunk);
+ auto Position = reinterpret_cast<char *>(LatestChunk->Block.Data) +
+ (Offset * ElementStorageSize);
+ DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
// In-place construct at Position.
- new (Position) T(std::forward<Args>(args)...);
+ new (Position) T{std::forward<Args>(args)...};
++Size;
- FreeElements -= FreeElements ? 1 : 0;
- return Position;
+ return reinterpret_cast<T *>(Position);
}
T &operator[](size_t Offset) const {
@@ -266,20 +286,22 @@ public:
Offset -= N;
DCHECK_NE(C, &SentinelChunk);
}
- auto Position = reinterpret_cast<T *>(C->Block.Data) + Offset;
- return *Position;
+ return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Block.Data) +
+ (Offset * ElementStorageSize));
}
T &front() const {
DCHECK_NE(Head, &SentinelChunk);
DCHECK_NE(Size, 0u);
- return *reinterpret_cast<T *>(Head->Block.Data);
+ return *begin();
}
T &back() const {
DCHECK_NE(Tail, &SentinelChunk);
- auto Offset = (Size - 1) % N;
- return *(reinterpret_cast<T *>(Tail->Block.Data) + Offset);
+ DCHECK_NE(Size, 0u);
+ auto It = end();
+ --It;
+ return *It;
}
template <class Predicate> T *find_element(Predicate P) const {
@@ -298,15 +320,18 @@ public:
/// require allocation of new blocks for new elements added after trimming.
void trim(size_t Elements) {
DCHECK_LE(Elements, Size);
+ DCHECK_GT(Size, 0);
+ auto OldSize = Size;
Size -= Elements;
- FreeElements += Elements;
-
- // Here we need to check whether we've cleared enough elements to warrant
- // putting blocks on to the freelist. We determine whether we need to
- // right-size the internal list, by keeping track of the number of "free"
- // elements still in the array.
- auto ChunksToTrim = FreeElements / N;
- for (size_t i = 0; i < ChunksToTrim; ++i, FreeElements -= N) {
+
+ DCHECK_NE(Head, &SentinelChunk);
+ DCHECK_NE(Tail, &SentinelChunk);
+
+ for (auto ChunksToTrim =
+ (nearest_boundary(OldSize, N) - nearest_boundary(Size, N)) / N;
+ ChunksToTrim > 0; --ChunksToTrim) {
+ DCHECK_NE(Head, &SentinelChunk);
+ DCHECK_NE(Tail, &SentinelChunk);
// Put the tail into the Freelist.
auto *FreeChunk = Tail;
Tail = Tail->Prev;
@@ -314,17 +339,21 @@ public:
Head = Tail;
else
Tail->Next = &SentinelChunk;
+
+ DCHECK_EQ(Tail->Next, &SentinelChunk);
FreeChunk->Next = Freelist;
- FreeChunk->Prev = Freelist->Prev;
+ FreeChunk->Prev = &SentinelChunk;
+ if (Freelist != &SentinelChunk)
+ Freelist->Prev = FreeChunk;
Freelist = FreeChunk;
}
}
// Provide iterators.
- Iterator<T> begin() const { return Iterator<T>(Head, 0); }
- Iterator<T> end() const { return Iterator<T>(Tail, Size); }
- Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0); }
- Iterator<const T> cend() const { return Iterator<const T>(Tail, Size); }
+ Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
+ Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
+ Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
+ Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
};
// We need to have this storage definition out-of-line so that the compiler can
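
The rewritten trim() above derives how many whole chunks move to the freelist
from the difference of the old and new sizes, each rounded up to a chunk
boundary. A quick worked check of that arithmetic, as a sketch with an assumed
chunk size of 4 and a local copy of the nearest_boundary helper the patch adds
to xray_utils.h:

#include <cstddef>

constexpr size_t nearest_boundary(size_t Number, size_t Multiple) {
  return Multiple * ((Number / Multiple) + (Number % Multiple ? 1 : 0));
}

// With N = 4, an array of 9 elements occupies 3 chunks (4 + 4 + 1). Trimming
// 6 elements leaves 3, which fit in one chunk, so exactly 2 tail chunks
// should be pushed onto the freelist.
constexpr size_t N = 4;
constexpr size_t OldSize = 9;
constexpr size_t NewSize = 3; // OldSize - 6
static_assert(
    (nearest_boundary(OldSize, N) - nearest_boundary(NewSize, N)) / N == 2,
    "two tail chunks go to the freelist");

int main() {}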
diff --git a/lib/xray/xray_utils.h b/lib/xray/xray_utils.h
index a06a59491..0a7911f12 100644
--- a/lib/xray/xray_utils.h
+++ b/lib/xray/xray_utils.h
@@ -36,6 +36,24 @@ std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End);
// file.
int getLogFD();
+constexpr size_t gcd(size_t a, size_t b) {
+ return (b == 0) ? a : gcd(b, a % b);
+}
+
+constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
+
+template <class T> constexpr T nearest_boundary(T number, T multiple) {
+ return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+}
+
+constexpr size_t next_pow2_helper(size_t num, size_t acc) {
+ return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1);
+}
+
+constexpr size_t next_pow2(size_t number) {
+ return next_pow2_helper(number, 1);
+}
+
} // namespace __xray
#endif // XRAY_UTILS_H
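
The constexpr helpers added above drive the new Array sizing: the per-element
slot (ElementStorageSize) becomes next_pow2(sizeof(T)) and ChunkSize becomes
lcm(next_pow2(sizeof(T)), kCacheLineSize) / sizeof(T). The following sketch
exercises them assuming a 64-byte cache line; the helpers are copied so the
block compiles on its own, and the 24-byte element type is hypothetical.

#include <cstddef>

constexpr size_t gcd(size_t a, size_t b) { return (b == 0) ? a : gcd(b, a % b); }
constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
constexpr size_t next_pow2_helper(size_t num, size_t acc) {
  return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1);
}
constexpr size_t next_pow2(size_t number) { return next_pow2_helper(number, 1); }

constexpr size_t kAssumedCacheLineSize = 64; // assumption; common on x86-64

// A hypothetical 24-byte element type.
struct Sample { char Bytes[24]; };

static_assert(next_pow2(sizeof(Sample)) == 32,
              "each element is stored in a 32-byte slot");
static_assert(lcm(next_pow2(sizeof(Sample)), kAssumedCacheLineSize) /
                      sizeof(Sample) == 2,
              "ChunkSize: two elements per chunk for this type");
static_assert(next_pow2(sizeof(Sample)) *
                      (lcm(next_pow2(sizeof(Sample)), kAssumedCacheLineSize) /
                       sizeof(Sample)) == 64,
              "AllocatorChunkSize: one cache line per chunk");

int main() {}

Rounding each slot up to a power of two is what makes the Alignment argument
that Array passes to Allocate(ElementStorageSize) achievable with the
nearest_boundary adjustment shown in the xray_allocator.h hunk earlier.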