author     Dean Michael Berris <dberris@google.com>  2018-07-18 02:08:39 +0000
committer  Dean Michael Berris <dberris@google.com>  2018-07-18 02:08:39 +0000
commit     165e3d0ade3950a1c4165dd6e70bf1628d2522d1 (patch)
tree       fc5adf81c84c1f1157b1e4f4900ae2aaaa40b36d
parent     1d40550d23737645acb8d9f4fd90472b52328906 (diff)
[XRay][compiler-rt] Segmented Array: Simplify and Optimise
Summary:
This is a follow-on to D49217 which simplifies and optimises the
implementation of the segmented array. In this patch we co-locate the
book-keeping for segments in the `__xray::Array<T>` with the data it's
managing. We take the chance in this patch to rename `Chunk` to `Segment`,
to better align with the high-level description of the segmented array.

With measurements using the benchmarks landed in D48879, we've identified
that calls to `pthread_getspecific` started dominating the cycles, which led
us to revert the change made in D49217 and use C++ thread_local
initialisation instead (this reduces the cost by a huge margin, since we
save one PLT-based call to a pthread function in the hot path). In
particular, this affects `__xray::getThreadLocalData()`.

We also took the opportunity to remove the least-common-multiple-based
calculation and instead pack as much data as possible into each segment of
the array. This greatly simplifies the API of the container, which now hides
as much of the implementation detail as possible. For instance, we calculate
the number of elements we need for each segment internally in the Array,
instead of making it part of the type.

With the changes here, we're able to get a measurable improvement in the
performance of profiling mode on top of what D48879 already provides.

Depends on D48879.

Reviewers: kpw, eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D49363

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@337343 91177308-0d34-0410-b5e6-96231b3b80d8
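[Editor's note] To illustrate the co-location idea before the diff: the patch moves the
list book-keeping (Prev/Next pointers) into the same allocation as the element
storage, eliminating the separate Chunk records and their ChunkAllocator. The
following is a minimal standalone sketch of that layout, not the actual
__xray::Array<T> implementation shown below (which uses a custom allocator,
a sentinel segment, and a freelist); it assumes a 64-byte cache line and a
trivially destructible T.

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <new>
#include <type_traits>

constexpr std::size_t kCacheLineSize = 64;

template <class T> class SegmentedArray {
  static_assert(std::is_trivially_destructible<T>::value,
                "this sketch never runs element destructors");

  // Book-keeping lives in the same block as the element storage -- the
  // "co-location" this patch introduces.
  struct Segment {
    Segment *Prev = nullptr;
    Segment *Next = nullptr;
    alignas(T) unsigned char Data[kCacheLineSize];
  };

  static constexpr std::size_t ElementsPerSegment =
      kCacheLineSize / sizeof(T);
  static_assert(ElementsPerSegment > 0,
                "must fit at least one element per segment");

  Segment *Head = nullptr;
  Segment *Tail = nullptr;
  std::size_t Size = 0;

public:
  T *Append(const T &E) {
    if (Size % ElementsPerSegment == 0) { // tail segment full, or none yet
      auto *S = static_cast<Segment *>(std::malloc(sizeof(Segment)));
      if (S == nullptr)
        return nullptr; // like the real container: report failure, don't throw
      new (S) Segment();
      S->Prev = Tail;
      if (Tail != nullptr)
        Tail->Next = S;
      else
        Head = S;
      Tail = S;
    }
    auto *Slot =
        reinterpret_cast<T *>(Tail->Data) + (Size % ElementsPerSegment);
    new (Slot) T(E); // in-place construct in the co-located storage
    ++Size;
    return Slot;
  }

  std::size_t size() const { return Size; }

  ~SegmentedArray() {
    for (Segment *S = Head; S != nullptr;) {
      Segment *Next = S->Next;
      std::free(S);
      S = Next;
    }
  }
};

int main() {
  SegmentedArray<int> A;
  for (int i = 0; i < 100; ++i)
    assert(A.Append(i) != nullptr);
  assert(A.size() == 100);
}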
-rw-r--r--  lib/xray/tests/unit/segmented_array_test.cc   52
-rw-r--r--  lib/xray/xray_function_call_trie.h             63
-rw-r--r--  lib/xray/xray_profile_collector.cc             17
-rw-r--r--  lib/xray/xray_profiling.cc                      7
-rw-r--r--  lib/xray/xray_segmented_array.h               308
5 files changed, 215 insertions, 232 deletions
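[Editor's note] The pthread_getspecific fix is visible in the xray_profiling.cc hunk below.
The pattern: the initialiser of a function-local thread_local runs exactly
once per thread, so later calls only check a guard flag instead of paying for
a PLT-based pthread_getspecific call on the hot path; pthread_setspecific is
still called once so the key's destructor fires at thread exit. A minimal
standalone sketch of the same pattern, with the real ProfilingData set-up
replaced by a placeholder counter:

#include <cstdio>
#include <pthread.h>

struct ProfilingData {
  int Counter = 0; // stand-in for the real per-thread profiling state
};

static pthread_key_t ProfilingKey;
thread_local ProfilingData ThreadStorage{};

static ProfilingData &getThreadLocalData() {
  // Runs once per thread; subsequent calls are a guard-flag check, not a
  // call into libpthread.
  thread_local bool Registered = [] {
    // Register with pthread only so the destructor passed to
    // pthread_key_create runs at thread exit.
    pthread_setspecific(ProfilingKey, &ThreadStorage);
    return true;
  }();
  (void)Registered;
  return ThreadStorage;
}

int main() {
  pthread_key_create(&ProfilingKey, +[](void *) { /* per-thread cleanup */ });
  ++getThreadLocalData().Counter;
  std::printf("counter: %d\n", getThreadLocalData().Counter);
  pthread_key_delete(&ProfilingKey);
  return 0;
}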
diff --git a/lib/xray/tests/unit/segmented_array_test.cc b/lib/xray/tests/unit/segmented_array_test.cc
index b48d481ff..035674ccf 100644
--- a/lib/xray/tests/unit/segmented_array_test.cc
+++ b/lib/xray/tests/unit/segmented_array_test.cc
@@ -15,16 +15,14 @@ struct TestData {
TEST(SegmentedArrayTest, ConstructWithAllocators) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> Data(A, CA);
+ Array<TestData> Data(A);
(void)Data;
}
TEST(SegmentedArrayTest, ConstructAndPopulate) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
ASSERT_NE(data.Append(TestData{0, 0}), nullptr);
ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
ASSERT_EQ(data.size(), 2u);
@@ -33,8 +31,7 @@ TEST(SegmentedArrayTest, ConstructAndPopulate) {
TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
ASSERT_EQ(data.size(), 1u);
ASSERT_EQ(data[0].First, 0);
@@ -44,8 +41,7 @@ TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
TEST(SegmentedArrayTest, PopulateWithMoreElements) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 24);
- ChunkAllocator CA(1 << 20);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
static const auto kMaxElements = 100u;
for (auto I = 0u; I < kMaxElements; ++I) {
ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
@@ -60,8 +56,7 @@ TEST(SegmentedArrayTest, PopulateWithMoreElements) {
TEST(SegmentedArrayTest, AppendEmplace) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
ASSERT_EQ(data[0].First, 1);
ASSERT_EQ(data[0].Second, 1);
@@ -70,8 +65,7 @@ TEST(SegmentedArrayTest, AppendEmplace) {
TEST(SegmentedArrayTest, AppendAndTrim) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
ASSERT_EQ(data.size(), 1u);
data.trim(1);
@@ -82,8 +76,7 @@ TEST(SegmentedArrayTest, AppendAndTrim) {
TEST(SegmentedArrayTest, IteratorAdvance) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
ASSERT_TRUE(data.empty());
ASSERT_EQ(data.begin(), data.end());
auto I0 = data.begin();
@@ -104,8 +97,7 @@ TEST(SegmentedArrayTest, IteratorAdvance) {
TEST(SegmentedArrayTest, IteratorRetreat) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 4);
- ChunkAllocator CA(1 << 4);
- Array<TestData> data(A, CA);
+ Array<TestData> data(A);
ASSERT_TRUE(data.empty());
ASSERT_EQ(data.begin(), data.end());
ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
@@ -126,17 +118,16 @@ TEST(SegmentedArrayTest, IteratorRetreat) {
TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
using AllocatorType = typename Array<TestData>::AllocatorType;
AllocatorType A(1 << 20);
- ChunkAllocator CA(1 << 10);
- Array<TestData> Data(A, CA);
+ Array<TestData> Data(A);
ASSERT_TRUE(Data.empty());
auto I0Begin = Data.begin(), I0End = Data.end();
// Add enough elements in Data to have more than one chunk.
- constexpr auto Chunk = Array<TestData>::ChunkSize;
- constexpr auto ChunkX2 = Chunk * 2;
- for (auto i = ChunkX2; i > 0u; --i) {
+ constexpr auto Segment = Array<TestData>::SegmentSize;
+ constexpr auto SegmentX2 = Segment * 2;
+ for (auto i = SegmentX2; i > 0u; --i) {
Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
}
- ASSERT_EQ(Data.size(), ChunkX2);
+ ASSERT_EQ(Data.size(), SegmentX2);
{
auto &Back = Data.back();
ASSERT_EQ(Back.First, 1);
@@ -144,18 +135,18 @@ TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
}
// Trim one chunk's elements worth.
- Data.trim(Chunk);
- ASSERT_EQ(Data.size(), Chunk);
+ Data.trim(Segment);
+ ASSERT_EQ(Data.size(), Segment);
// Check that we are still able to access 'back' properly.
{
auto &Back = Data.back();
- ASSERT_EQ(Back.First, static_cast<s64>(Chunk + 1));
- ASSERT_EQ(Back.Second, static_cast<s64>(Chunk + 1));
+ ASSERT_EQ(Back.First, static_cast<s64>(Segment + 1));
+ ASSERT_EQ(Back.Second, static_cast<s64>(Segment + 1));
}
// Then trim until it's empty.
- Data.trim(Chunk);
+ Data.trim(Segment);
ASSERT_TRUE(Data.empty());
// Here our iterators should be the same.
@@ -164,10 +155,10 @@ TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
EXPECT_EQ(I0End, I1End);
// Then we ensure that adding elements back works just fine.
- for (auto i = ChunkX2; i > 0u; --i) {
+ for (auto i = SegmentX2; i > 0u; --i) {
Data.AppendEmplace(static_cast<s64>(i), static_cast<s64>(i));
}
- EXPECT_EQ(Data.size(), ChunkX2);
+ EXPECT_EQ(Data.size(), SegmentX2);
}
struct ShadowStackEntry {
@@ -179,8 +170,7 @@ struct ShadowStackEntry {
TEST(SegmentedArrayTest, SimulateStackBehaviour) {
using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
AllocatorType A(1 << 10);
- ChunkAllocator CA(1 << 10);
- Array<ShadowStackEntry> Data(A, CA);
+ Array<ShadowStackEntry> Data(A);
static uint64_t Dummy = 0;
constexpr uint64_t Max = 9;
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index e6e31cec2..2acf14aa5 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -119,9 +119,9 @@ public:
// We add a constructor here to allow us to inplace-construct through
// Array<...>'s AppendEmplace.
- Node(Node *P, NodeIdPairAllocatorType &A, ChunkAllocator &CA, int64_t CC,
- int64_t CLT, int32_t F)
- : Parent(P), Callees(A, CA), CallCount(CC), CumulativeLocalTime(CLT),
+ Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+ int32_t F)
+ : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
FId(F) {}
// TODO: Include the compact histogram.
@@ -153,7 +153,6 @@ public:
RootAllocatorType *RootAllocator = nullptr;
ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
- ChunkAllocator *ChunkAlloc = nullptr;
Allocators() {}
Allocators(const Allocators &) = delete;
@@ -162,12 +161,11 @@ public:
Allocators(Allocators &&O)
: NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
ShadowStackAllocator(O.ShadowStackAllocator),
- NodeIdPairAllocator(O.NodeIdPairAllocator), ChunkAlloc(O.ChunkAlloc) {
+ NodeIdPairAllocator(O.NodeIdPairAllocator) {
O.NodeAllocator = nullptr;
O.RootAllocator = nullptr;
O.ShadowStackAllocator = nullptr;
O.NodeIdPairAllocator = nullptr;
- O.ChunkAlloc = nullptr;
}
Allocators &operator=(Allocators &&O) {
@@ -191,11 +189,6 @@ public:
O.NodeIdPairAllocator = this->NodeIdPairAllocator;
this->NodeIdPairAllocator = Tmp;
}
- {
- auto Tmp = O.ChunkAlloc;
- O.ChunkAlloc = this->ChunkAlloc;
- this->ChunkAlloc = Tmp;
- }
return *this;
}
@@ -223,11 +216,6 @@ public:
InternalFree(NodeIdPairAllocator);
NodeIdPairAllocator = nullptr;
}
- if (ChunkAlloc != nullptr) {
- ChunkAlloc->~ChunkAllocator();
- InternalFree(ChunkAlloc);
- ChunkAlloc = nullptr;
- }
}
};
@@ -258,11 +246,6 @@ public:
InternalAlloc(sizeof(NodeIdPairAllocatorType)));
new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
A.NodeIdPairAllocator = NodeIdPairAllocator;
-
- auto ChunkAlloc = reinterpret_cast<ChunkAllocator *>(
- InternalAlloc(sizeof(ChunkAllocator)));
- new (ChunkAlloc) ChunkAllocator(Max);
- A.ChunkAlloc = ChunkAlloc;
return A;
}
@@ -271,22 +254,20 @@ private:
RootArray Roots;
ShadowStackArray ShadowStack;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
- ChunkAllocator *ChunkAlloc = nullptr;
public:
explicit FunctionCallTrie(const Allocators &A)
- : Nodes(*A.NodeAllocator, *A.ChunkAlloc),
- Roots(*A.RootAllocator, *A.ChunkAlloc),
- ShadowStack(*A.ShadowStackAllocator, *A.ChunkAlloc),
- NodeIdPairAllocator(A.NodeIdPairAllocator), ChunkAlloc(A.ChunkAlloc) {}
+ : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+ ShadowStack(*A.ShadowStackAllocator),
+ NodeIdPairAllocator(A.NodeIdPairAllocator) {}
void enterFunction(const int32_t FId, uint64_t TSC) {
DCHECK_NE(FId, 0);
// This function primarily deals with ensuring that the ShadowStack is
// consistent and ready for when an exit event is encountered.
if (UNLIKELY(ShadowStack.empty())) {
- auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator,
- *ChunkAlloc, 0, 0, FId);
+ auto NewRoot =
+ Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
Roots.Append(NewRoot);
@@ -309,8 +290,8 @@ public:
}
// This means we've never seen this stack before, create a new node here.
- auto NewNode = Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator,
- *ChunkAlloc, 0, 0, FId);
+ auto NewNode =
+ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
if (UNLIKELY(NewNode == nullptr))
return;
DCHECK_NE(NewNode, nullptr);
@@ -370,13 +351,12 @@ public:
typename Stack::AllocatorType StackAllocator(
profilingFlags()->stack_allocator_max);
- ChunkAllocator StackChunkAllocator(profilingFlags()->stack_allocator_max);
- Stack DFSStack(StackAllocator, StackChunkAllocator);
+ Stack DFSStack(StackAllocator);
for (const auto Root : getRoots()) {
// Add a node in O for this root.
auto NewRoot = O.Nodes.AppendEmplace(
- nullptr, *O.NodeIdPairAllocator, *O.ChunkAlloc, Root->CallCount,
+ nullptr, *O.NodeIdPairAllocator, Root->CallCount,
Root->CumulativeLocalTime, Root->FId);
// Because we cannot allocate more memory we should bail out right away.
@@ -395,9 +375,8 @@ public:
DFSStack.trim(1);
for (const auto Callee : NP.Node->Callees) {
auto NewNode = O.Nodes.AppendEmplace(
- NP.NewNode, *O.NodeIdPairAllocator, *O.ChunkAlloc,
- Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
- Callee.FId);
+ NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
+ Callee.NodePtr->CumulativeLocalTime, Callee.FId);
if (UNLIKELY(NewNode == nullptr))
return;
NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
@@ -423,16 +402,15 @@ public:
using Stack = Array<NodeAndTarget>;
typename Stack::AllocatorType StackAllocator(
profilingFlags()->stack_allocator_max);
- ChunkAllocator CA(profilingFlags()->stack_allocator_max);
- Stack DFSStack(StackAllocator, CA);
+ Stack DFSStack(StackAllocator);
for (const auto Root : getRoots()) {
Node *TargetRoot = nullptr;
auto R = O.Roots.find_element(
[&](const Node *Node) { return Node->FId == Root->FId; });
if (R == nullptr) {
- TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator,
- *O.ChunkAlloc, 0, 0, Root->FId);
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
+ 0, Root->FId);
if (UNLIKELY(TargetRoot == nullptr))
return;
@@ -456,9 +434,8 @@ public:
return C.FId == Callee.FId;
});
if (TargetCallee == nullptr) {
- auto NewTargetNode =
- O.Nodes.AppendEmplace(NT.TargetNode, *O.NodeIdPairAllocator,
- *O.ChunkAlloc, 0, 0, Callee.FId);
+ auto NewTargetNode = O.Nodes.AppendEmplace(
+ NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
if (UNLIKELY(NewTargetNode == nullptr))
return;
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
index ba347d13b..a43744d9a 100644
--- a/lib/xray/xray_profile_collector.cc
+++ b/lib/xray/xray_profile_collector.cc
@@ -118,12 +118,11 @@ struct ProfileRecord {
const FunctionCallTrie::Node *Node = nullptr;
// Constructor for in-place construction.
- ProfileRecord(PathAllocator &A, ChunkAllocator &CA,
- const FunctionCallTrie::Node *N)
+ ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
: Path([&] {
auto P =
reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
- new (P) PathArray(A, CA);
+ new (P) PathArray(A);
return P;
}()),
Node(N) {}
@@ -137,18 +136,17 @@ using ProfileRecordArray = Array<ProfileRecord>;
// the path(s) and the data associated with the path.
static void populateRecords(ProfileRecordArray &PRs,
ProfileRecord::PathAllocator &PA,
- ChunkAllocator &CA, const FunctionCallTrie &Trie) {
+ const FunctionCallTrie &Trie) {
using StackArray = Array<const FunctionCallTrie::Node *>;
using StackAllocator = typename StackArray::AllocatorType;
StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
- ChunkAllocator StackChunkAlloc(profilingFlags()->stack_allocator_max);
- StackArray DFSStack(StackAlloc, StackChunkAlloc);
+ StackArray DFSStack(StackAlloc);
for (const auto R : Trie.getRoots()) {
DFSStack.Append(R);
while (!DFSStack.empty()) {
auto Node = DFSStack.back();
DFSStack.trim(1);
- auto Record = PRs.AppendEmplace(PA, CA, Node);
+ auto Record = PRs.AppendEmplace(PA, Node);
if (Record == nullptr)
return;
DCHECK_NE(Record, nullptr);
@@ -214,10 +212,9 @@ void serialize() {
for (u32 I = 0; I < ThreadTries->Size(); ++I) {
using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
- ChunkAllocator CA(profilingFlags()->global_allocator_max);
ProfileRecord::PathAllocator PathAlloc(
profilingFlags()->global_allocator_max);
- ProfileRecordArray ProfileRecords(PRAlloc, CA);
+ ProfileRecordArray ProfileRecords(PRAlloc);
// First, we want to compute the amount of space we're going to need. We'll
// use a local allocator and an __xray::Array<...> to store the intermediary
@@ -226,7 +223,7 @@ void serialize() {
const auto &Trie = *(*ThreadTries)[I].Trie;
if (Trie.getRoots().empty())
continue;
- populateRecords(ProfileRecords, PathAlloc, CA, Trie);
+ populateRecords(ProfileRecords, PathAlloc, Trie);
DCHECK(!Trie.getRoots().empty());
DCHECK(!ProfileRecords.empty());
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index 3a0c0b90e..a11fddcc2 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -57,12 +57,13 @@ struct alignas(64) ProfilingData {
static pthread_key_t ProfilingKey;
thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
-
static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
- if (pthread_getspecific(ProfilingKey) == NULL) {
+ thread_local auto ThreadOnce = [] {
new (&ThreadStorage) ProfilingData{};
pthread_setspecific(ProfilingKey, &ThreadStorage);
- }
+ return false;
+ }();
+ (void)ThreadOnce;
auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
index f001b230c..018c2aa9e 100644
--- a/lib/xray/xray_segmented_array.h
+++ b/lib/xray/xray_segmented_array.h
@@ -9,7 +9,7 @@
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
-// Defines the implementation of a segmented array, with fixed-size chunks
+// Defines the implementation of a segmented array, with fixed-size segments
// backing the segments.
//
//===----------------------------------------------------------------------===//
@@ -24,113 +24,128 @@
#include <utility>
namespace __xray {
-struct Chunk {
- void *Data;
- Chunk *Prev;
- Chunk *Next;
-};
-
-using ChunkAllocator = Allocator<next_pow2(sizeof(Chunk)) * 8>;
/// The Array type provides an interface similar to std::vector<...> but does
/// not shrink in size. Once constructed, elements can be appended but cannot be
/// removed. The implementation is heavily dependent on the contract provided by
/// the Allocator type, in that all memory will be released when the Allocator
/// is destroyed. When an Array is destroyed, it will destroy elements in the
-/// backing store but will not free the memory. The parameter N defines how many
-/// elements of T there should be in a single block.
-///
-/// We compute the least common multiple of the size of T and the cache line
-/// size, to allow us to maximise the number of T objects we can place in
-/// cache-line multiple sized blocks. To get back the number of T's, we divide
-/// this least common multiple by the size of T.
-template <class T,
- size_t N = lcm(next_pow2(sizeof(T)), kCacheLineSize) / sizeof(T)>
-struct Array {
- static constexpr size_t ChunkSize = N;
- static constexpr size_t ElementStorageSize = next_pow2(sizeof(T));
- static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize;
- using AllocatorType = Allocator<AllocatorChunkSize>;
- static_assert(std::is_trivially_destructible<T>::value,
- "T must be trivially destructible.");
+/// backing store but will not free the memory.
+template <class T> class Array {
+ struct SegmentBase {
+ SegmentBase *Prev;
+ SegmentBase *Next;
+ };
-private:
- // TODO: Consider co-locating the chunk information with the data in the
- // Block, as in an intrusive list -- i.e. putting the next and previous
- // pointer values inside the Block storage.
- static Chunk SentinelChunk;
+ // We want each segment of the array to be cache-line aligned, and elements of
+ // the array be offset from the beginning of the segment.
+ struct Segment : SegmentBase {
+ char Data[];
+ };
+
+public:
+ // Each segment of the array will be laid out with the following assumptions:
+ //
+ // - Each segment will be on a cache-line address boundary (kCacheLineSize
+ // aligned).
+ //
+ // - The elements will be accessed through an aligned pointer, dependent on
+ // the alignment of T.
+ //
+ // - Each element is at least two-pointers worth from the beginning of the
+ // Segment, aligned properly, and the rest of the elements are accessed
+ // through appropriate alignment.
+ //
+ // We then compute the size of the segment to follow this logic:
+ //
+ // - Compute the number of elements that can fit within
+ // kCacheLineSize-multiple segments, minus the size of two pointers.
+ //
+ // - Request cacheline-multiple sized elements from the allocator.
+ static constexpr size_t AlignedElementStorageSize =
+ sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type);
+
+ static constexpr size_t SegmentSize =
+ nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize);
+
+ using AllocatorType = Allocator<SegmentSize>;
+
+ static constexpr size_t ElementsPerSegment =
+ (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T));
+
+ static_assert(ElementsPerSegment > 0,
+ "Must have at least 1 element per segment.");
+
+ static SegmentBase SentinelSegment;
+private:
AllocatorType *Alloc;
- ChunkAllocator *ChunkAlloc;
- Chunk *Head = &SentinelChunk;
- Chunk *Tail = &SentinelChunk;
+ SegmentBase *Head = &SentinelSegment;
+ SegmentBase *Tail = &SentinelSegment;
size_t Size = 0;
- // Here we keep track of chunks in the freelist, to allow us to re-use chunks
- // when elements are trimmed off the end.
- Chunk *Freelist = &SentinelChunk;
+ // Here we keep track of segments in the freelist, to allow us to re-use
+ // segments when elements are trimmed off the end.
+ SegmentBase *Freelist = &SentinelSegment;
- Chunk *NewChunk() {
+ Segment *NewSegment() {
// We need to handle the case in which enough elements have been trimmed to
- // allow us to re-use chunks we've allocated before. For this we look into
+ // allow us to re-use segments we've allocated before. For this we look into
// the Freelist, to see whether we need to actually allocate new blocks or
// just re-use blocks we've already seen before.
- if (Freelist != &SentinelChunk) {
- auto *FreeChunk = Freelist;
- Freelist = FreeChunk->Next;
- FreeChunk->Next = &SentinelChunk;
- Freelist->Prev = &SentinelChunk;
- return FreeChunk;
+ if (Freelist != &SentinelSegment) {
+ auto *FreeSegment = Freelist;
+ Freelist = FreeSegment->Next;
+ FreeSegment->Next = &SentinelSegment;
+ Freelist->Prev = &SentinelSegment;
+ return static_cast<Segment *>(FreeSegment);
}
- auto Block = Alloc->Allocate();
- if (Block.Data == nullptr)
- return nullptr;
-
- auto ChunkElement = ChunkAlloc->Allocate();
- if (ChunkElement.Data == nullptr)
+ auto SegmentBlock = Alloc->Allocate();
+ if (SegmentBlock.Data == nullptr)
return nullptr;
- // Placement-new the Chunk element at the appropriate location.
- auto C = reinterpret_cast<Chunk *>(ChunkElement.Data);
- Chunk LocalC{Block.Data, &SentinelChunk, &SentinelChunk};
- internal_memcpy(C, &LocalC, sizeof(LocalC));
- return C;
+ // Placement-new the Segment element at the beginning of the SegmentBlock.
+ auto S = reinterpret_cast<Segment *>(SegmentBlock.Data);
+ new (S) SegmentBase{&SentinelSegment, &SentinelSegment};
+ return S;
}
- Chunk *InitHeadAndTail() {
- DCHECK_EQ(Head, &SentinelChunk);
- DCHECK_EQ(Tail, &SentinelChunk);
- auto Chunk = NewChunk();
- if (Chunk == nullptr)
+ Segment *InitHeadAndTail() {
+ DCHECK_EQ(Head, &SentinelSegment);
+ DCHECK_EQ(Tail, &SentinelSegment);
+ auto Segment = NewSegment();
+ if (Segment == nullptr)
return nullptr;
- DCHECK_EQ(Chunk->Next, &SentinelChunk);
- DCHECK_EQ(Chunk->Prev, &SentinelChunk);
- return Head = Tail = Chunk;
+ DCHECK_EQ(Segment->Next, &SentinelSegment);
+ DCHECK_EQ(Segment->Prev, &SentinelSegment);
+ Head = Tail = static_cast<SegmentBase *>(Segment);
+ return Segment;
}
- Chunk *AppendNewChunk() {
- auto Chunk = NewChunk();
- if (Chunk == nullptr)
+ Segment *AppendNewSegment() {
+ auto S = NewSegment();
+ if (S == nullptr)
return nullptr;
- DCHECK_NE(Tail, &SentinelChunk);
- DCHECK_EQ(Tail->Next, &SentinelChunk);
- DCHECK_EQ(Chunk->Prev, &SentinelChunk);
- DCHECK_EQ(Chunk->Next, &SentinelChunk);
- Tail->Next = Chunk;
- Chunk->Prev = Tail;
- Tail = Chunk;
- return Tail;
+ DCHECK_NE(Tail, &SentinelSegment);
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ DCHECK_EQ(S->Prev, &SentinelSegment);
+ DCHECK_EQ(S->Next, &SentinelSegment);
+ Tail->Next = S;
+ S->Prev = Tail;
+ Tail = S;
+ return static_cast<Segment *>(Tail);
}
// This Iterator models a BidirectionalIterator.
template <class U> class Iterator {
- Chunk *C = &SentinelChunk;
+ SegmentBase *S = &SentinelSegment;
size_t Offset = 0;
size_t Size = 0;
public:
- Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {}
+ Iterator(SegmentBase *IS, size_t Off, size_t S)
+ : S(IS), Offset(Off), Size(S) {}
Iterator(const Iterator &) noexcept = default;
Iterator() noexcept = default;
Iterator(Iterator &&) noexcept = default;
@@ -139,28 +154,28 @@ private:
~Iterator() = default;
Iterator &operator++() {
- if (++Offset % N || Offset == Size)
+ if (++Offset % ElementsPerSegment || Offset == Size)
return *this;
// At this point, we know that Offset % N == 0, so we must advance the
- // chunk pointer.
- DCHECK_EQ(Offset % N, 0);
+ // segment pointer.
+ DCHECK_EQ(Offset % ElementsPerSegment, 0);
DCHECK_NE(Offset, Size);
- DCHECK_NE(C, &SentinelChunk);
- DCHECK_NE(C->Next, &SentinelChunk);
- C = C->Next;
- DCHECK_NE(C, &SentinelChunk);
+ DCHECK_NE(S, &SentinelSegment);
+ DCHECK_NE(S->Next, &SentinelSegment);
+ S = S->Next;
+ DCHECK_NE(S, &SentinelSegment);
return *this;
}
Iterator &operator--() {
- DCHECK_NE(C, &SentinelChunk);
+ DCHECK_NE(S, &SentinelSegment);
DCHECK_GT(Offset, 0);
auto PreviousOffset = Offset--;
- if (PreviousOffset != Size && PreviousOffset % N == 0) {
- DCHECK_NE(C->Prev, &SentinelChunk);
- C = C->Prev;
+ if (PreviousOffset != Size && PreviousOffset % ElementsPerSegment == 0) {
+ DCHECK_NE(S->Prev, &SentinelSegment);
+ S = S->Prev;
}
return *this;
@@ -180,7 +195,7 @@ private:
template <class V, class W>
friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
- return L.C == R.C && L.Offset == R.Offset;
+ return L.S == R.S && L.Offset == R.Offset;
}
template <class V, class W>
@@ -189,31 +204,29 @@ private:
}
U &operator*() const {
- DCHECK_NE(C, &SentinelChunk);
- auto RelOff = Offset % N;
- return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
- (RelOff * ElementStorageSize));
+ DCHECK_NE(S, &SentinelSegment);
+ auto RelOff = Offset % ElementsPerSegment;
+
+ // We need to compute the character-aligned pointer, offset from the
+ // segment's Data location to get the element in the position of Offset.
+ auto Base = static_cast<Segment *>(S)->Data;
+ auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize);
+ return *reinterpret_cast<U *>(AlignedOffset);
}
- U *operator->() const {
- DCHECK_NE(C, &SentinelChunk);
- auto RelOff = Offset % N;
- return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
- (RelOff * ElementStorageSize));
- }
+ U *operator->() const { return &(**this); }
};
public:
- explicit Array(AllocatorType &A, ChunkAllocator &CA)
- : Alloc(&A), ChunkAlloc(&CA) {}
+ explicit Array(AllocatorType &A) : Alloc(&A) {}
Array(const Array &) = delete;
Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
Head(O.Head),
Tail(O.Tail),
Size(O.Size) {
- O.Head = &SentinelChunk;
- O.Tail = &SentinelChunk;
+ O.Head = &SentinelSegment;
+ O.Tail = &SentinelSegment;
O.Size = 0;
}
@@ -227,39 +240,41 @@ public:
size_t size() const { return Size; }
T *Append(const T &E) {
- if (UNLIKELY(Head == &SentinelChunk))
+ if (UNLIKELY(Head == &SentinelSegment))
if (InitHeadAndTail() == nullptr)
return nullptr;
- auto Offset = Size % N;
+ auto Offset = Size % ElementsPerSegment;
if (UNLIKELY(Size != 0 && Offset == 0))
- if (AppendNewChunk() == nullptr)
+ if (AppendNewSegment() == nullptr)
return nullptr;
- auto Position = reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Data) +
- (Offset * ElementStorageSize));
+ auto Base = static_cast<Segment *>(Tail)->Data;
+ auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+ auto Position = reinterpret_cast<T *>(AlignedOffset);
*Position = E;
++Size;
return Position;
}
template <class... Args> T *AppendEmplace(Args &&... args) {
- if (UNLIKELY(Head == &SentinelChunk))
+ if (UNLIKELY(Head == &SentinelSegment))
if (InitHeadAndTail() == nullptr)
return nullptr;
- auto Offset = Size % N;
- auto *LatestChunk = Tail;
+ auto Offset = Size % ElementsPerSegment;
+ auto *LatestSegment = Tail;
if (UNLIKELY(Size != 0 && Offset == 0)) {
- LatestChunk = AppendNewChunk();
- if (LatestChunk == nullptr)
+ LatestSegment = AppendNewSegment();
+ if (LatestSegment == nullptr)
return nullptr;
}
- DCHECK_NE(Tail, &SentinelChunk);
- auto Position = reinterpret_cast<char *>(LatestChunk->Data) +
- (Offset * ElementStorageSize);
- DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
+ DCHECK_NE(Tail, &SentinelSegment);
+ auto Base = static_cast<Segment *>(LatestSegment)->Data;
+ auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+ auto Position = reinterpret_cast<T *>(AlignedOffset);
+
// In-place construct at Position.
new (Position) T{std::forward<Args>(args)...};
++Size;
@@ -269,24 +284,26 @@ public:
T &operator[](size_t Offset) const {
DCHECK_LE(Offset, Size);
// We need to traverse the array enough times to find the element at Offset.
- auto C = Head;
- while (Offset >= N) {
- C = C->Next;
- Offset -= N;
- DCHECK_NE(C, &SentinelChunk);
+ auto S = Head;
+ while (Offset >= ElementsPerSegment) {
+ S = S->Next;
+ Offset -= ElementsPerSegment;
+ DCHECK_NE(S, &SentinelSegment);
}
- return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Data) +
- (Offset * ElementStorageSize));
+ auto Base = static_cast<Segment *>(S)->Data;
+ auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+ auto Position = reinterpret_cast<T *>(AlignedOffset);
+ return *reinterpret_cast<T *>(Position);
}
T &front() const {
- DCHECK_NE(Head, &SentinelChunk);
+ DCHECK_NE(Head, &SentinelSegment);
DCHECK_NE(Size, 0u);
return *begin();
}
T &back() const {
- DCHECK_NE(Tail, &SentinelChunk);
+ DCHECK_NE(Tail, &SentinelSegment);
DCHECK_NE(Size, 0u);
auto It = end();
--It;
@@ -313,28 +330,29 @@ public:
auto OldSize = Size;
Size -= Elements;
- DCHECK_NE(Head, &SentinelChunk);
- DCHECK_NE(Tail, &SentinelChunk);
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Tail, &SentinelSegment);
- for (auto ChunksToTrim =
- (nearest_boundary(OldSize, N) - nearest_boundary(Size, N)) / N;
- ChunksToTrim > 0; --ChunksToTrim) {
- DCHECK_NE(Head, &SentinelChunk);
- DCHECK_NE(Tail, &SentinelChunk);
+ for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) -
+ nearest_boundary(Size, ElementsPerSegment)) /
+ ElementsPerSegment;
+ SegmentsToTrim > 0; --SegmentsToTrim) {
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Tail, &SentinelSegment);
// Put the tail into the Freelist.
- auto *FreeChunk = Tail;
+ auto *FreeSegment = Tail;
Tail = Tail->Prev;
- if (Tail == &SentinelChunk)
+ if (Tail == &SentinelSegment)
Head = Tail;
else
- Tail->Next = &SentinelChunk;
-
- DCHECK_EQ(Tail->Next, &SentinelChunk);
- FreeChunk->Next = Freelist;
- FreeChunk->Prev = &SentinelChunk;
- if (Freelist != &SentinelChunk)
- Freelist->Prev = FreeChunk;
- Freelist = FreeChunk;
+ Tail->Next = &SentinelSegment;
+
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ FreeSegment->Next = Freelist;
+ FreeSegment->Prev = &SentinelSegment;
+ if (Freelist != &SentinelSegment)
+ Freelist->Prev = FreeSegment;
+ Freelist = FreeSegment;
}
}
@@ -346,11 +364,11 @@ public:
};
// We need to have this storage definition out-of-line so that the compiler can
-// ensure that storage for the SentinelChunk is defined and has a single
+// ensure that storage for the SentinelSegment is defined and has a single
// address.
-template <class T, size_t N>
-Chunk Array<T, N>::SentinelChunk{nullptr, &Array<T, N>::SentinelChunk,
- &Array<T, N>::SentinelChunk};
+template <class T>
+typename Array<T>::SegmentBase Array<T>::SentinelSegment{
+ &Array<T>::SentinelSegment, &Array<T>::SentinelSegment};
} // namespace __xray
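[Editor's note] For reference, a minimal usage sketch of the container after this change,
mirroring the updated unit tests above. It assumes the in-tree
xray_segmented_array.h; exampleUsage is a hypothetical harness, not part of
the patch.

#include "xray_segmented_array.h"

namespace __xray {
inline void exampleUsage() {
  // One allocator, sized by a byte budget, is all the Array needs now;
  // segment book-keeping is handled internally.
  using AllocatorType = typename Array<int>::AllocatorType;
  AllocatorType A(1 << 10);
  Array<int> Data(A); // no separate ChunkAllocator argument any more
  Data.Append(42);
  Data.AppendEmplace(7);
  Data.trim(1); // trimmed segments are recycled through an internal freelist
}
} // namespace __xray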