path: root/lib/xray/xray_allocator.h
//===-- xray_allocator.h ---------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// Defines the allocator interface for an arena allocator, used primarily for
// the profiling runtime.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_ALLOCATOR_H
#define XRAY_ALLOCATOR_H

#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <cstddef>
#include <cstdint>

#include "sanitizer_common/sanitizer_internal_defs.h"

namespace __xray {

/// The Allocator type hands out fixed-sized chunks of memory that are
/// cache-line aligned and sized. This is useful for placement of
/// performance-sensitive data in memory that's frequently accessed. The
/// allocator also self-limits the peak memory usage to a dynamically defined
/// maximum.
///
/// N is the lower bound on the size of the block of memory returned by the
/// allocation function. N is used to compute the size of a block, which is a
/// multiple of the cache-line size. We compute a block's size by determining
/// how many cache lines' worth of memory are required to subsume N.
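///
/// An illustrative usage sketch (not from this file; `ProfRecord` is a
/// placeholder type and the 1 << 20 byte quota is arbitrary):
///
///   Allocator<sizeof(ProfRecord)> A(1 << 20, 0); // PreAllocate not yet used.
///   auto B = A.Allocate();
///   if (B.Data != nullptr) {
///     // B.Data points to Block::Size bytes of cache-line-aligned memory.
///   }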
template <size_t N> struct Allocator {
  // The Allocator returns memory as Block instances.
  struct Block {
    /// Compute the minimum cache-line size multiple that is >= N.
    static constexpr auto Size =
        kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0));
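    // For example, assuming a 64-byte cache line: N = 100 yields Size = 128,
    // while N = 64 yields Size = 64 exactly.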
    void *Data = nullptr;
  };

private:
  // A BlockLink contains a fixed number of blocks along with a pointer to the
  // previous link in the chain. We hand out blocks from the most recent link,
  // tracking our position with a running counter and an offset into that
  // link's fixed array of blocks.
  //
  // We're calling it a "link" in the context of seeing these as a chain of
  // block containers (i.e. links in a chain).
  struct BlockLink {
    static_assert(kCacheLineSize % sizeof(void *) == 0,
                  "Cache line size is not divisible by size of void*; none of "
                  "the assumptions of the BlockLink will hold.");

    // We compute the number of blocks that fit in a single link. To ensure
    // that BlockLink instances are exactly one cache line in size, we deduct
    // one pointer's worth of space for the pointer to the previous link.
    //
    // This structure corresponds to the following layout:
    //
    //   [ Blocks 0, 1, 2, .., BlockPtrCount - 1 | Prev ]
    //
    static constexpr auto BlockPtrCount =
        (kCacheLineSize / sizeof(Block *)) - 1;
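    // For example, with 8-byte pointers and a 64-byte cache line this is
    // 64 / 8 - 1 = 7 blocks per link, leaving room for the Prev pointer below.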

    BlockLink() {
      // Zero out Blocks.
      // FIXME: Use a braced member initializer when we drop support for GCC
      // 4.8.
      internal_memset(Blocks, 0, sizeof(Blocks));
    }

    // FIXME: Align this to cache-line address boundaries?
    Block Blocks[BlockPtrCount];
    BlockLink *Prev = nullptr;
  };

  static_assert(sizeof(BlockLink) == kCacheLineSize,
                "BlockLink instances must be cache-line-sized.");

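  // Sentinel used as the initial Tail so the chain is never null; the
  // destructor walks links until it reaches this sentinel.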
  static BlockLink NullLink;

  // FIXME: Implement a freelist, in case we actually do intend to return memory
  // to the allocator, as opposed to just de-allocating everything in one go?

  size_t MaxMemory;
  SpinMutex Mutex{};
  BlockLink *Tail = &NullLink;
  size_t Counter = 0;

  BlockLink *NewChainLink() {
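    // Allocate the link itself plus one contiguous backing store for all of
    // its blocks, then carve the store into Block::Size-byte chunks.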
    auto NewChain = reinterpret_cast<BlockLink *>(
        InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
    auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
        BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize));
    size_t Offset = 0;
    DCHECK_NE(NewChain, nullptr);
    DCHECK_NE(BackingStore, nullptr);
    for (auto &B : NewChain->Blocks) {
      B.Data = BackingStore + Offset;
      Offset += Block::Size;
    }
    NewChain->Prev = Tail;
    return NewChain;
  }

public:
  Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) {
    // FIXME: Implement PreAllocate support!
  }

  Block Allocate() {
    SpinMutexLock Lock(&Mutex);
    // Check whether we're over quota; if so, return an empty Block (with a
    // nullptr Data) to signal failure.
    if (Counter * Block::Size >= MaxMemory)
      return {};

    size_t ChainOffset = Counter % BlockLink::BlockPtrCount;

    Block B{};
    BlockLink *Link = Tail;
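    // A zero offset means the current tail link is full (or this is the very
    // first allocation), so chain in a fresh link before handing out a block.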
    if (UNLIKELY(Counter == 0 || ChainOffset == 0))
      Tail = Link = NewChainLink();

    B = Link->Blocks[ChainOffset];
    ++Counter;
    return B;
  }

  ~Allocator() NOEXCEPT {
    // We need to deallocate all the blocks, including the chain links.
    for (auto *C = Tail; C != &NullLink;) {
      // The blocks within a link share one contiguous backing allocation, so
      // freeing the first block's Data releases all of them at once.
      InternalFree(C->Blocks[0].Data);
      auto Prev = C->Prev;
      InternalFree(C);
      C = Prev;
    }
  }
};

// Storage for the NullLink sentinel.
template <size_t N> typename Allocator<N>::BlockLink Allocator<N>::NullLink;

} // namespace __xray

#endif // XRAY_ALLOCATOR_H