5 files changed, 215 insertions, 232 deletions
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
index b48d481ff..035674ccf 100644
--- a/lib/xray/tests/unit/segmented_array_test.cc
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -15,16 +15,14 @@ struct TestData {
 TEST(SegmentedArrayTest, ConstructWithAllocators) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> Data(A, CA);
+  Array<TestData> Data(A);
   (void)Data;
 }
 
 TEST(SegmentedArrayTest, ConstructAndPopulate) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   ASSERT_NE(data.Append(TestData{0, 0}), nullptr);
   ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
   ASSERT_EQ(data.size(), 2u);
@@ -33,8 +31,7 @@ TEST(SegmentedArrayTest, ConstructAndPopulate) {
 TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
   ASSERT_EQ(data.size(), 1u);
   ASSERT_EQ(data[0].First, 0);
@@ -44,8 +41,7 @@ TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
 TEST(SegmentedArrayTest, PopulateWithMoreElements) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 24);
-  ChunkAllocator CA(1 << 20);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   static const auto kMaxElements = 100u;
   for (auto I = 0u; I < kMaxElements; ++I) {
     ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
@@ -60,8 +56,7 @@ TEST(SegmentedArrayTest, PopulateWithMoreElements) {
 TEST(SegmentedArrayTest, AppendEmplace) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
   ASSERT_EQ(data[0].First, 1);
   ASSERT_EQ(data[0].Second, 1);
@@ -70,8 +65,7 @@ TEST(SegmentedArrayTest, AppendEmplace) {
 TEST(SegmentedArrayTest, AppendAndTrim) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
   ASSERT_EQ(data.size(), 1u);
   data.trim(1);
@@ -82,8 +76,7 @@ TEST(SegmentedArrayTest, AppendAndTrim) {
 TEST(SegmentedArrayTest, IteratorAdvance) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   ASSERT_TRUE(data.empty());
   ASSERT_EQ(data.begin(), data.end());
   auto I0 = data.begin();
@@ -104,8 +97,7 @@ TEST(SegmentedArrayTest, IteratorAdvance) {
 TEST(SegmentedArrayTest, IteratorRetreat) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 4);
-  ChunkAllocator CA(1 << 4);
-  Array<TestData> data(A, CA);
+  Array<TestData> data(A);
   ASSERT_TRUE(data.empty());
   ASSERT_EQ(data.begin(), data.end());
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
@@ -126,17 +118,16 @@ TEST(SegmentedArrayTest, IteratorRetreat) {
 TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
   AllocatorType A(1 << 20);
-  ChunkAllocator CA(1 << 10);
-  Array<TestData> Data(A, CA);
+  Array<TestData> Data(A);
   ASSERT_TRUE(Data.empty());
   auto I0Begin = Data.begin(), I0End = Data.end();
   // Add enough elements in Data to have more than one chunk.
-  constexpr auto Chunk = Array<TestData>::ChunkSize;
-  constexpr auto ChunkX2 = Chunk * 2;
-  for (auto i = ChunkX2; i > 0u; --i) {
+  constexpr auto Segment = Array<TestData>::SegmentSize;
+  constexpr auto SegmentX2 = Segment * 2;
+  for (auto i = SegmentX2; i > 0u; --i) {
     Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
   }
-  ASSERT_EQ(Data.size(), ChunkX2);
+  ASSERT_EQ(Data.size(), SegmentX2);
   {
     auto &Back = Data.back();
     ASSERT_EQ(Back.First, 1);
@@ -144,18 +135,18 @@ TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
   }
 
   // Trim one chunk's elements worth.
-  Data.trim(Chunk);
-  ASSERT_EQ(Data.size(), Chunk);
+  Data.trim(Segment);
+  ASSERT_EQ(Data.size(), Segment);
 
   // Check that we are still able to access 'back' properly.
   {
     auto &Back = Data.back();
-    ASSERT_EQ(Back.First, static_cast<s64>(Chunk + 1));
-    ASSERT_EQ(Back.Second, static_cast<s64>(Chunk + 1));
+    ASSERT_EQ(Back.First, static_cast<s64>(Segment + 1));
+    ASSERT_EQ(Back.Second, static_cast<s64>(Segment + 1));
   }
 
   // Then trim until it's empty.
-  Data.trim(Chunk);
+  Data.trim(Segment);
   ASSERT_TRUE(Data.empty());
 
   // Here our iterators should be the same.
@@ -164,10 +155,10 @@ TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
   EXPECT_EQ(I0End, I1End);
 
   // Then we ensure that adding elements back works just fine.
-  for (auto i = ChunkX2; i > 0u; --i) {
+  for (auto i = SegmentX2; i > 0u; --i) {
     Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
   }
-  EXPECT_EQ(Data.size(), ChunkX2);
+  EXPECT_EQ(Data.size(), SegmentX2);
 }
 
 struct ShadowStackEntry {
@@ -179,8 +170,7 @@ struct ShadowStackEntry {
 TEST(SegmentedArrayTest, SimulateStackBehaviour) {
   using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
   AllocatorType A(1 << 10);
-  ChunkAllocator CA(1 << 10);
-  Array<ShadowStackEntry> Data(A, CA);
+  Array<ShadowStackEntry> Data(A);
   static uint64_t Dummy = 0;
   constexpr uint64_t Max = 9;
 
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index e6e31cec2..2acf14aa5 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -119,9 +119,9 @@ public:
 
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
-    Node(Node *P, NodeIdPairAllocatorType &A, ChunkAllocator &CA, int64_t CC,
-         int64_t CLT, int32_t F)
-        : Parent(P), Callees(A, CA), CallCount(CC), CumulativeLocalTime(CLT),
+    Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+         int32_t F)
+        : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
           FId(F) {}
 
     // TODO: Include the compact histogram.
@@ -153,7 +153,6 @@ public:
     RootAllocatorType *RootAllocator = nullptr;
     ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
     NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
-    ChunkAllocator *ChunkAlloc = nullptr;
 
     Allocators() {}
     Allocators(const Allocators &) = delete;
@@ -162,12 +161,11 @@ public:
     Allocators(Allocators &&O)
         : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
           ShadowStackAllocator(O.ShadowStackAllocator),
-          NodeIdPairAllocator(O.NodeIdPairAllocator), ChunkAlloc(O.ChunkAlloc) {
+          NodeIdPairAllocator(O.NodeIdPairAllocator) {
       O.NodeAllocator = nullptr;
       O.RootAllocator = nullptr;
       O.ShadowStackAllocator = nullptr;
       O.NodeIdPairAllocator = nullptr;
-      O.ChunkAlloc = nullptr;
     }
 
     Allocators &operator=(Allocators &&O) {
@@ -191,11 +189,6 @@ public:
         O.NodeIdPairAllocator = this->NodeIdPairAllocator;
         this->NodeIdPairAllocator = Tmp;
       }
-      {
-        auto Tmp = O.ChunkAlloc;
-        O.ChunkAlloc = this->ChunkAlloc;
-        this->ChunkAlloc = Tmp;
-      }
       return *this;
     }
 
@@ -223,11 +216,6 @@ public:
         InternalFree(NodeIdPairAllocator);
         NodeIdPairAllocator = nullptr;
       }
-      if (ChunkAlloc != nullptr) {
-        ChunkAlloc->~ChunkAllocator();
-        InternalFree(ChunkAlloc);
-        ChunkAlloc = nullptr;
-      }
     }
   };
 
@@ -258,11 +246,6 @@ public:
         InternalAlloc(sizeof(NodeIdPairAllocatorType)));
     new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
     A.NodeIdPairAllocator = NodeIdPairAllocator;
-
-    auto ChunkAlloc = reinterpret_cast<ChunkAllocator *>(
-        InternalAlloc(sizeof(ChunkAllocator)));
-    new (ChunkAlloc) ChunkAllocator(Max);
-    A.ChunkAlloc = ChunkAlloc;
     return A;
   }
 
@@ -271,22 +254,20 @@ private:
   RootArray Roots;
   ShadowStackArray ShadowStack;
   NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
-  ChunkAllocator *ChunkAlloc = nullptr;
 
 public:
   explicit FunctionCallTrie(const Allocators &A)
-      : Nodes(*A.NodeAllocator, *A.ChunkAlloc),
-        Roots(*A.RootAllocator, *A.ChunkAlloc),
-        ShadowStack(*A.ShadowStackAllocator, *A.ChunkAlloc),
-        NodeIdPairAllocator(A.NodeIdPairAllocator), ChunkAlloc(A.ChunkAlloc) {}
+      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+        ShadowStack(*A.ShadowStackAllocator),
+        NodeIdPairAllocator(A.NodeIdPairAllocator) {}
 
   void enterFunction(const int32_t FId, uint64_t TSC) {
     DCHECK_NE(FId, 0);
     // This function primarily deals with ensuring that the ShadowStack is
     // consistent and ready for when an exit event is encountered.
     if (UNLIKELY(ShadowStack.empty())) {
-      auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator,
-                                         *ChunkAlloc, 0, 0, FId);
+      auto NewRoot =
+          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
       if (UNLIKELY(NewRoot == nullptr))
         return;
       Roots.Append(NewRoot);
@@ -309,8 +290,8 @@ public:
     }
 
     // This means we've never seen this stack before, create a new node here.
-    auto NewNode = Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator,
-                                       *ChunkAlloc, 0, 0, FId);
+    auto NewNode =
+        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
     if (UNLIKELY(NewNode == nullptr))
       return;
     DCHECK_NE(NewNode, nullptr);
@@ -370,13 +351,12 @@ public:
 
     typename Stack::AllocatorType StackAllocator(
         profilingFlags()->stack_allocator_max);
-    ChunkAllocator StackChunkAllocator(profilingFlags()->stack_allocator_max);
-    Stack DFSStack(StackAllocator, StackChunkAllocator);
+    Stack DFSStack(StackAllocator);
 
     for (const auto Root : getRoots()) {
       // Add a node in O for this root.
       auto NewRoot = O.Nodes.AppendEmplace(
-          nullptr, *O.NodeIdPairAllocator, *O.ChunkAlloc, Root->CallCount,
+          nullptr, *O.NodeIdPairAllocator, Root->CallCount,
           Root->CumulativeLocalTime, Root->FId);
 
       // Because we cannot allocate more memory we should bail out right away.
@@ -395,9 +375,8 @@ public:
         DFSStack.trim(1);
         for (const auto Callee : NP.Node->Callees) {
           auto NewNode = O.Nodes.AppendEmplace(
-              NP.NewNode, *O.NodeIdPairAllocator, *O.ChunkAlloc,
-              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
-              Callee.FId);
+              NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
+              Callee.NodePtr->CumulativeLocalTime, Callee.FId);
           if (UNLIKELY(NewNode == nullptr))
             return;
           NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
@@ -423,16 +402,15 @@ public:
     using Stack = Array<NodeAndTarget>;
     typename Stack::AllocatorType StackAllocator(
         profilingFlags()->stack_allocator_max);
-    ChunkAllocator CA(profilingFlags()->stack_allocator_max);
-    Stack DFSStack(StackAllocator, CA);
+    Stack DFSStack(StackAllocator);
 
     for (const auto Root : getRoots()) {
       Node *TargetRoot = nullptr;
       auto R = O.Roots.find_element(
           [&](const Node *Node) { return Node->FId == Root->FId; });
       if (R == nullptr) {
-        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator,
-                                           *O.ChunkAlloc, 0, 0, Root->FId);
+        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
+                                           0, Root->FId);
         if (UNLIKELY(TargetRoot == nullptr))
           return;
 
@@ -456,9 +434,8 @@ public:
                 return C.FId == Callee.FId;
               });
           if (TargetCallee == nullptr) {
-            auto NewTargetNode =
-                O.Nodes.AppendEmplace(NT.TargetNode, *O.NodeIdPairAllocator,
-                                      *O.ChunkAlloc, 0, 0, Callee.FId);
+            auto NewTargetNode = O.Nodes.AppendEmplace(
+                NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
 
             if (UNLIKELY(NewTargetNode == nullptr))
               return;
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
index ba347d13b..a43744d9a 100644
--- a/lib/xray/xray_profile_collector.cc
+++ b/lib/xray/xray_profile_collector.cc
@@ -118,12 +118,11 @@ struct ProfileRecord {
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
-  ProfileRecord(PathAllocator &A, ChunkAllocator &CA,
-                const FunctionCallTrie::Node *N)
+  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
       : Path([&] {
           auto P =
               reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A, CA);
+          new (P) PathArray(A);
           return P;
         }()),
         Node(N) {}
@@ -137,18 +136,17 @@ using ProfileRecordArray = Array<ProfileRecord>;
 // the path(s) and the data associated with the path.
 static void populateRecords(ProfileRecordArray &PRs,
                             ProfileRecord::PathAllocator &PA,
-                            ChunkAllocator &CA, const FunctionCallTrie &Trie) {
+                            const FunctionCallTrie &Trie) {
   using StackArray = Array<const FunctionCallTrie::Node *>;
   using StackAllocator = typename StackArray::AllocatorType;
   StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
-  ChunkAllocator StackChunkAlloc(profilingFlags()->stack_allocator_max);
-  StackArray DFSStack(StackAlloc, StackChunkAlloc);
+  StackArray DFSStack(StackAlloc);
   for (const auto R : Trie.getRoots()) {
     DFSStack.Append(R);
     while (!DFSStack.empty()) {
       auto Node = DFSStack.back();
       DFSStack.trim(1);
-      auto Record = PRs.AppendEmplace(PA, CA, Node);
+      auto Record = PRs.AppendEmplace(PA, Node);
       if (Record == nullptr)
         return;
       DCHECK_NE(Record, nullptr);
@@ -214,10 +212,9 @@ void serialize() {
   for (u32 I = 0; I < ThreadTries->Size(); ++I) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
     ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
-    ChunkAllocator CA(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
         profilingFlags()->global_allocator_max);
-    ProfileRecordArray ProfileRecords(PRAlloc, CA);
+    ProfileRecordArray ProfileRecords(PRAlloc);
 
     // First, we want to compute the amount of space we're going to need. We'll
     // use a local allocator and an __xray::Array<...> to store the intermediary
@@ -226,7 +223,7 @@ void serialize() {
     const auto &Trie = *(*ThreadTries)[I].Trie;
     if (Trie.getRoots().empty())
       continue;
-    populateRecords(ProfileRecords, PathAlloc, CA, Trie);
+    populateRecords(ProfileRecords, PathAlloc, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
 
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index 3a0c0b90e..a11fddcc2 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -57,12 +57,13 @@ struct alignas(64) ProfilingData {
 static pthread_key_t ProfilingKey;
 
 thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
-
 static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
-  if (pthread_getspecific(ProfilingKey) == NULL) {
+  thread_local auto ThreadOnce = [] {
     new (&ThreadStorage) ProfilingData{};
     pthread_setspecific(ProfilingKey, &ThreadStorage);
-  }
+    return false;
+  }();
+  (void)ThreadOnce;
 
   auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
 
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
index f001b230c..018c2aa9e 100644
--- a/lib/xray/xray_segmented_array.h
+++ b/lib/xray/xray_segmented_array.h
@@ -9,7 +9,7 @@
 //
 // This file is a part of XRay, a dynamic runtime instrumentation system.
 //
-// Defines the implementation of a segmented array, with fixed-size chunks
+// Defines the implementation of a segmented array, with fixed-size segments
 // backing the segments.
 //
 //===----------------------------------------------------------------------===//
@@ -24,113 +24,128 @@
 #include <utility>
 
 namespace __xray {
-struct Chunk {
-  void *Data;
-  Chunk *Prev;
-  Chunk *Next;
-};
-
-using ChunkAllocator = Allocator<next_pow2(sizeof(Chunk)) * 8>;
 
 /// The Array type provides an interface similar to std::vector<...> but does
 /// not shrink in size. Once constructed, elements can be appended but cannot be
 /// removed. The implementation is heavily dependent on the contract provided by
 /// the Allocator type, in that all memory will be released when the Allocator
 /// is destroyed. When an Array is destroyed, it will destroy elements in the
-/// backing store but will not free the memory. The parameter N defines how many
-/// elements of T there should be in a single block.
-///
-/// We compute the least common multiple of the size of T and the cache line
-/// size, to allow us to maximise the number of T objects we can place in
-/// cache-line multiple sized blocks. To get back the number of T's, we divide
-/// this least common multiple by the size of T.
-template <class T,
-          size_t N = lcm(next_pow2(sizeof(T)), kCacheLineSize) / sizeof(T)>
-struct Array {
-  static constexpr size_t ChunkSize = N;
-  static constexpr size_t ElementStorageSize = next_pow2(sizeof(T));
-  static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize;
-  using AllocatorType = Allocator<AllocatorChunkSize>;
-  static_assert(std::is_trivially_destructible<T>::value,
-                "T must be trivially destructible.");
+/// backing store but will not free the memory.
+template <class T> class Array {
+  struct SegmentBase {
+    SegmentBase *Prev;
+    SegmentBase *Next;
+  };
 
-private:
-  // TODO: Consider co-locating the chunk information with the data in the
-  // Block, as in an intrusive list -- i.e. putting the next and previous
-  // pointer values inside the Block storage.
-  static Chunk SentinelChunk;
+  // We want each segment of the array to be cache-line aligned, and elements of
+  // the array be offset from the beginning of the segment.
+  struct Segment : SegmentBase {
+    char Data[];
+  };
+
+public:
+  // Each segment of the array will be laid out with the following assumptions:
+  //
+  //   - Each segment will be on a cache-line address boundary (kCacheLineSize
+  //     aligned).
+  //
+  //   - The elements will be accessed through an aligned pointer, dependent on
+  //     the alignment of T.
+  //
+  //   - Each element is at least two-pointers worth from the beginning of the
+  //     Segment, aligned properly, and the rest of the elements are accessed
+  //     through appropriate alignment.
+  //
+  // We then compute the size of the segment to follow this logic:
+  //
+  //   - Compute the number of elements that can fit within
+  //     kCacheLineSize-multiple segments, minus the size of two pointers.
+  //
+  //   - Request cacheline-multiple sized elements from the allocator.
+  static constexpr size_t AlignedElementStorageSize =
+      sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type);
+
+  static constexpr size_t SegmentSize =
+      nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize);
+
+  using AllocatorType = Allocator<SegmentSize>;
+
+  static constexpr size_t ElementsPerSegment =
+      (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T));
+
+  static_assert(ElementsPerSegment > 0,
+                "Must have at least 1 element per segment.");
+
+  static SegmentBase SentinelSegment;
 
+private:
   AllocatorType *Alloc;
-  ChunkAllocator *ChunkAlloc;
-  Chunk *Head = &SentinelChunk;
-  Chunk *Tail = &SentinelChunk;
+  SegmentBase *Head = &SentinelSegment;
+  SegmentBase *Tail = &SentinelSegment;
   size_t Size = 0;
 
-  // Here we keep track of chunks in the freelist, to allow us to re-use chunks
-  // when elements are trimmed off the end.
-  Chunk *Freelist = &SentinelChunk;
+  // Here we keep track of segments in the freelist, to allow us to re-use
+  // segments when elements are trimmed off the end.
+  SegmentBase *Freelist = &SentinelSegment;
 
-  Chunk *NewChunk() {
+  Segment *NewSegment() {
     // We need to handle the case in which enough elements have been trimmed to
-    // allow us to re-use chunks we've allocated before. For this we look into
+    // allow us to re-use segments we've allocated before. For this we look into
     // the Freelist, to see whether we need to actually allocate new blocks or
     // just re-use blocks we've already seen before.
-    if (Freelist != &SentinelChunk) {
-      auto *FreeChunk = Freelist;
-      Freelist = FreeChunk->Next;
-      FreeChunk->Next = &SentinelChunk;
-      Freelist->Prev = &SentinelChunk;
-      return FreeChunk;
+    if (Freelist != &SentinelSegment) {
+      auto *FreeSegment = Freelist;
+      Freelist = FreeSegment->Next;
+      FreeSegment->Next = &SentinelSegment;
+      Freelist->Prev = &SentinelSegment;
+      return static_cast<Segment *>(FreeSegment);
     }
 
-    auto Block = Alloc->Allocate();
-    if (Block.Data == nullptr)
-      return nullptr;
-
-    auto ChunkElement = ChunkAlloc->Allocate();
-    if (ChunkElement.Data == nullptr)
+    auto SegmentBlock = Alloc->Allocate();
+    if (SegmentBlock.Data == nullptr)
       return nullptr;
 
-    // Placement-new the Chunk element at the appropriate location.
-    auto C = reinterpret_cast<Chunk *>(ChunkElement.Data);
-    Chunk LocalC{Block.Data, &SentinelChunk, &SentinelChunk};
-    internal_memcpy(C, &LocalC, sizeof(LocalC));
-    return C;
+    // Placement-new the Segment element at the beginning of the SegmentBlock.
+    auto S = reinterpret_cast<Segment *>(SegmentBlock.Data);
+    new (S) SegmentBase{&SentinelSegment, &SentinelSegment};
+    return S;
   }
 
-  Chunk *InitHeadAndTail() {
-    DCHECK_EQ(Head, &SentinelChunk);
-    DCHECK_EQ(Tail, &SentinelChunk);
-    auto Chunk = NewChunk();
-    if (Chunk == nullptr)
+  Segment *InitHeadAndTail() {
+    DCHECK_EQ(Head, &SentinelSegment);
+    DCHECK_EQ(Tail, &SentinelSegment);
+    auto Segment = NewSegment();
+    if (Segment == nullptr)
       return nullptr;
-    DCHECK_EQ(Chunk->Next, &SentinelChunk);
-    DCHECK_EQ(Chunk->Prev, &SentinelChunk);
-    return Head = Tail = Chunk;
+    DCHECK_EQ(Segment->Next, &SentinelSegment);
+    DCHECK_EQ(Segment->Prev, &SentinelSegment);
+    Head = Tail = static_cast<SegmentBase *>(Segment);
+    return Segment;
   }
 
-  Chunk *AppendNewChunk() {
-    auto Chunk = NewChunk();
-    if (Chunk == nullptr)
+  Segment *AppendNewSegment() {
+    auto S = NewSegment();
+    if (S == nullptr)
       return nullptr;
-    DCHECK_NE(Tail, &SentinelChunk);
-    DCHECK_EQ(Tail->Next, &SentinelChunk);
-    DCHECK_EQ(Chunk->Prev, &SentinelChunk);
-    DCHECK_EQ(Chunk->Next, &SentinelChunk);
-    Tail->Next = Chunk;
-    Chunk->Prev = Tail;
-    Tail = Chunk;
-    return Tail;
+    DCHECK_NE(Tail, &SentinelSegment);
+    DCHECK_EQ(Tail->Next, &SentinelSegment);
+    DCHECK_EQ(S->Prev, &SentinelSegment);
+    DCHECK_EQ(S->Next, &SentinelSegment);
+    Tail->Next = S;
+    S->Prev = Tail;
+    Tail = S;
+    return static_cast<Segment *>(Tail);
   }
 
   // This Iterator models a BidirectionalIterator.
   template <class U> class Iterator {
-    Chunk *C = &SentinelChunk;
+    SegmentBase *S = &SentinelSegment;
     size_t Offset = 0;
     size_t Size = 0;
 
   public:
-    Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {}
+    Iterator(SegmentBase *IS, size_t Off, size_t S)
+        : S(IS), Offset(Off), Size(S) {}
     Iterator(const Iterator &) noexcept = default;
     Iterator() noexcept = default;
     Iterator(Iterator &&) noexcept = default;
@@ -139,28 +154,28 @@ private:
     ~Iterator() = default;
 
     Iterator &operator++() {
-      if (++Offset % N || Offset == Size)
+      if (++Offset % ElementsPerSegment || Offset == Size)
         return *this;
 
       // At this point, we know that Offset % N == 0, so we must advance the
-      // chunk pointer.
-      DCHECK_EQ(Offset % N, 0);
+      // segment pointer.
+      DCHECK_EQ(Offset % ElementsPerSegment, 0);
       DCHECK_NE(Offset, Size);
-      DCHECK_NE(C, &SentinelChunk);
-      DCHECK_NE(C->Next, &SentinelChunk);
-      C = C->Next;
-      DCHECK_NE(C, &SentinelChunk);
+      DCHECK_NE(S, &SentinelSegment);
+      DCHECK_NE(S->Next, &SentinelSegment);
+      S = S->Next;
+      DCHECK_NE(S, &SentinelSegment);
       return *this;
     }
 
     Iterator &operator--() {
-      DCHECK_NE(C, &SentinelChunk);
+      DCHECK_NE(S, &SentinelSegment);
       DCHECK_GT(Offset, 0);
 
       auto PreviousOffset = Offset--;
-      if (PreviousOffset != Size && PreviousOffset % N == 0) {
-        DCHECK_NE(C->Prev, &SentinelChunk);
-        C = C->Prev;
+      if (PreviousOffset != Size && PreviousOffset % ElementsPerSegment == 0) {
+        DCHECK_NE(S->Prev, &SentinelSegment);
+        S = S->Prev;
       }
 
       return *this;
@@ -180,7 +195,7 @@ private:
 
     template <class V, class W>
     friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
-      return L.C == R.C && L.Offset == R.Offset;
+      return L.S == R.S && L.Offset == R.Offset;
     }
 
     template <class V, class W>
@@ -189,31 +204,29 @@ private:
     }
 
     U &operator*() const {
-      DCHECK_NE(C, &SentinelChunk);
-      auto RelOff = Offset % N;
-      return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
-                                    (RelOff * ElementStorageSize));
+      DCHECK_NE(S, &SentinelSegment);
+      auto RelOff = Offset % ElementsPerSegment;
+
+      // We need to compute the character-aligned pointer, offset from the
+      // segment's Data location to get the element in the position of Offset.
+      auto Base = static_cast<Segment *>(S)->Data;
+      auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize);
+      return *reinterpret_cast<U *>(AlignedOffset);
     }
 
-    U *operator->() const {
-      DCHECK_NE(C, &SentinelChunk);
-      auto RelOff = Offset % N;
-      return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
-                                   (RelOff * ElementStorageSize));
-    }
+    U *operator->() const { return &(**this); }
   };
 
 public:
-  explicit Array(AllocatorType &A, ChunkAllocator &CA)
-      : Alloc(&A), ChunkAlloc(&CA) {}
+  explicit Array(AllocatorType &A) : Alloc(&A) {}
 
   Array(const Array &) = delete;
   Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
                               Head(O.Head),
                               Tail(O.Tail),
                               Size(O.Size) {
-    O.Head = &SentinelChunk;
-    O.Tail = &SentinelChunk;
+    O.Head = &SentinelSegment;
+    O.Tail = &SentinelSegment;
     O.Size = 0;
   }
 
@@ -227,39 +240,41 @@ public:
   size_t size() const { return Size; }
 
   T *Append(const T &E) {
-    if (UNLIKELY(Head == &SentinelChunk))
+    if (UNLIKELY(Head == &SentinelSegment))
       if (InitHeadAndTail() == nullptr)
         return nullptr;
 
-    auto Offset = Size % N;
+    auto Offset = Size % ElementsPerSegment;
     if (UNLIKELY(Size != 0 && Offset == 0))
-      if (AppendNewChunk() == nullptr)
+      if (AppendNewSegment() == nullptr)
         return nullptr;
 
-    auto Position = reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Data) +
-                                          (Offset * ElementStorageSize));
+    auto Base = static_cast<Segment *>(Tail)->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    auto Position = reinterpret_cast<T *>(AlignedOffset);
     *Position = E;
     ++Size;
     return Position;
   }
 
   template <class... Args> T *AppendEmplace(Args &&... args) {
-    if (UNLIKELY(Head == &SentinelChunk))
+    if (UNLIKELY(Head == &SentinelSegment))
       if (InitHeadAndTail() == nullptr)
         return nullptr;
 
-    auto Offset = Size % N;
-    auto *LatestChunk = Tail;
+    auto Offset = Size % ElementsPerSegment;
+    auto *LatestSegment = Tail;
     if (UNLIKELY(Size != 0 && Offset == 0)) {
-      LatestChunk = AppendNewChunk();
-      if (LatestChunk == nullptr)
+      LatestSegment = AppendNewSegment();
+      if (LatestSegment == nullptr)
         return nullptr;
     }
 
-    DCHECK_NE(Tail, &SentinelChunk);
-    auto Position = reinterpret_cast<char *>(LatestChunk->Data) +
-                    (Offset * ElementStorageSize);
-    DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
+    DCHECK_NE(Tail, &SentinelSegment);
+    auto Base = static_cast<Segment *>(LatestSegment)->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    auto Position = reinterpret_cast<T *>(AlignedOffset);
+
     // In-place construct at Position.
     new (Position) T{std::forward<Args>(args)...};
     ++Size;
@@ -269,24 +284,26 @@ public:
   T &operator[](size_t Offset) const {
     DCHECK_LE(Offset, Size);
     // We need to traverse the array enough times to find the element at Offset.
-    auto C = Head;
-    while (Offset >= N) {
-      C = C->Next;
-      Offset -= N;
-      DCHECK_NE(C, &SentinelChunk);
+    auto S = Head;
+    while (Offset >= ElementsPerSegment) {
+      S = S->Next;
+      Offset -= ElementsPerSegment;
+      DCHECK_NE(S, &SentinelSegment);
     }
-    return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Data) +
-                                  (Offset * ElementStorageSize));
+    auto Base = static_cast<Segment *>(S)->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    auto Position = reinterpret_cast<T *>(AlignedOffset);
+    return *reinterpret_cast<T *>(Position);
   }
 
   T &front() const {
-    DCHECK_NE(Head, &SentinelChunk);
+    DCHECK_NE(Head, &SentinelSegment);
     DCHECK_NE(Size, 0u);
     return *begin();
   }
 
   T &back() const {
-    DCHECK_NE(Tail, &SentinelChunk);
+    DCHECK_NE(Tail, &SentinelSegment);
     DCHECK_NE(Size, 0u);
     auto It = end();
     --It;
@@ -313,28 +330,29 @@ public:
     auto OldSize = Size;
     Size -= Elements;
 
-    DCHECK_NE(Head, &SentinelChunk);
-    DCHECK_NE(Tail, &SentinelChunk);
+    DCHECK_NE(Head, &SentinelSegment);
+    DCHECK_NE(Tail, &SentinelSegment);
 
-    for (auto ChunksToTrim =
-             (nearest_boundary(OldSize, N) - nearest_boundary(Size, N)) / N;
-         ChunksToTrim > 0; --ChunksToTrim) {
-      DCHECK_NE(Head, &SentinelChunk);
-      DCHECK_NE(Tail, &SentinelChunk);
+    for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) -
+                                nearest_boundary(Size, ElementsPerSegment)) /
+                               ElementsPerSegment;
+         SegmentsToTrim > 0; --SegmentsToTrim) {
+      DCHECK_NE(Head, &SentinelSegment);
+      DCHECK_NE(Tail, &SentinelSegment);
       // Put the tail into the Freelist.
-      auto *FreeChunk = Tail;
+      auto *FreeSegment = Tail;
       Tail = Tail->Prev;
-      if (Tail == &SentinelChunk)
+      if (Tail == &SentinelSegment)
         Head = Tail;
       else
-        Tail->Next = &SentinelChunk;
-
-      DCHECK_EQ(Tail->Next, &SentinelChunk);
-      FreeChunk->Next = Freelist;
-      FreeChunk->Prev = &SentinelChunk;
-      if (Freelist != &SentinelChunk)
-        Freelist->Prev = FreeChunk;
-      Freelist = FreeChunk;
+        Tail->Next = &SentinelSegment;
+
+      DCHECK_EQ(Tail->Next, &SentinelSegment);
+      FreeSegment->Next = Freelist;
+      FreeSegment->Prev = &SentinelSegment;
+      if (Freelist != &SentinelSegment)
+        Freelist->Prev = FreeSegment;
+      Freelist = FreeSegment;
     }
   }
 
@@ -346,11 +364,11 @@ public:
 };
 
 // We need to have this storage definition out-of-line so that the compiler can
-// ensure that storage for the SentinelChunk is defined and has a single
+// ensure that storage for the SentinelSegment is defined and has a single
 // address.
-template <class T, size_t N>
-Chunk Array<T, N>::SentinelChunk{nullptr, &Array<T, N>::SentinelChunk,
-                                 &Array<T, N>::SentinelChunk};
+template <class T>
+typename Array<T>::SegmentBase Array<T>::SentinelSegment{
+    &Array<T>::SentinelSegment, &Array<T>::SentinelSegment};
 
 } // namespace __xray