summaryrefslogtreecommitdiff
path: root/lib/esan/working_set.cpp
blob: e56902c8f32a9bbe0f2dd16574aadf3b9af123bc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
//===-- working_set.cpp ---------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of EfficiencySanitizer, a family of performance tuners.
//
// This file contains working-set-specific code.
//===----------------------------------------------------------------------===//

#include "working_set.h"
#include "esan.h"
#include "esan_circular_buffer.h"
#include "esan_flags.h"
#include "esan_shadow.h"
#include "esan_sideline.h"
#include "sanitizer_common/sanitizer_procmaps.h"

// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
//   cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
//   cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
//   lower frequencies, each bit to the left from the lowest bit stepping
//   down the frequency by 2 to the power of getFlags()->snapshot_step.
// Thus we have something like this:
//   Bit 0: Since last sample
//   Bit 1: Since last 2^2 samples
//   Bit 2: Since last 2^4 samples
//   Bit 3: ...
//   Bit 7: Ever accessed.
// We live with races in accessing each shadow byte.
typedef unsigned char byte;

namespace __esan {

// Our shadow memory assumes that the line size is 64.
static const u32 CacheLineSize = 64;

// See the shadow byte layout description above.
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
static const u32 CurWorkingSetBitIdx = 0;
static const byte ShadowAccessedVal =
  (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

static SidelineThread Thread;
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;

// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
u32 CircularBufferSizes[NumFreq] = {
  // These are each mmap-ed so our minimum is one page.
  32*1024,
  16*1024,
  8*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
};

void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
                                  bool IsWrite) {
  if (Size == 0)
    return;
  SIZE_T I = 0;
  uptr LineSize = getFlags()->cache_line_size;
  // As Addr+Size could overflow at the top of a 32-bit address space,
  // we avoid the simpler formula that rounds the start and end.
  SIZE_T NumLines = Size / LineSize +
    // Add any extra at the start or end adding on an extra line:
    (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
  byte *Shadow = (byte *)appToShadow(Addr);
  // Write shadow bytes until we're word-aligned.
  while (I < NumLines && (uptr)Shadow % 4 != 0) {
    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
      *Shadow |= ShadowAccessedVal;
    ++Shadow;
    ++I;
  }
  // Write whole shadow words at a time.
  // Using a word-stride loop improves the runtime of a microbenchmark of
  // memset calls by 10%.
  u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
    ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
  while (I + 4 <= NumLines) {
    if ((*(u32*)Shadow & WordValue) != WordValue)
      *(u32*)Shadow |= WordValue;
    Shadow += 4;
    I += 4;
  }
  // Write any trailing shadow bytes.
  while (I < NumLines) {
    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
      *Shadow |= ShadowAccessedVal;
    ++Shadow;
    ++I;
  }
}

// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
// measures the access during the entire execution and should never be cleared.
static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
                                     uptr ShadowEnd) {
  u32 WorkingSetSize = 0;
  u32 ByteValue = 0x1 << BitIdx;
  u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
    ByteValue << 24;
  // Get word aligned start.
  ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
  bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
  // Do not clear the bit that measures access during the entire execution.
  bool Clear = BitIdx < TotalWorkingSetBitIdx;
  for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
    if ((*Ptr & WordValue) != 0) {
      byte *BytePtr = (byte *)Ptr;
      for (u32 j = 0; j < sizeof(u32); ++j) {
        if (BytePtr[j] & ByteValue) {
          ++WorkingSetSize;
          if (Accum) {
            // Accumulate to the lower-frequency bit to the left.
            BytePtr[j] |= (ByteValue << 1);
          }
        }
      }
      if (Clear) {
        // Clear this bit from every shadow byte.
        *Ptr &= ~WordValue;
      }
    }
  }
  return WorkingSetSize;
}

// Scan shadow memory to calculate the number of cache lines being accessed,
// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
// We also clear the lowest bits (most recent working set snapshot).
// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
// measures the access during the entire execution and should never be cleared.
static u32 computeWorkingSizeAndReset(u32 BitIdx) {
  u32 WorkingSetSize = 0;
  MemoryMappingLayout MemIter(true/*cache*/);
  MemoryMappedSegment Segment;
  while (MemIter.Next(&Segment)) {
    VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__,
            Segment.start, Segment.end, Segment.protection,
            isAppMem(Segment.start), isShadowMem(Segment.start));
    if (isShadowMem(Segment.start) && Segment.IsWritable()) {
      VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start,
              Segment.end);
      WorkingSetSize +=
          countAndClearShadowValues(BitIdx, Segment.start, Segment.end);
    }
  }
  return WorkingSetSize;
}

// This is invoked from a signal handler but in a sideline thread doing nothing
// else so it is a little less fragile than a typical signal handler.
static void takeSample(void *Arg) {
  u32 BitIdx = CurWorkingSetBitIdx;
  u32 Freq = 1;
  ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
  while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
    u32 NumLines = computeWorkingSizeAndReset(BitIdx);
    VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
            SnapshotNum, BitIdx, Freq, NumLines);
    SizePerFreq[BitIdx].push_back(NumLines);
    Freq = Freq << getFlags()->snapshot_step;
    BitIdx++;
  }
}

unsigned int getSampleCountWorkingSet()
{
  return SnapshotNum;
}

// Initialization that must be done before any instrumented code is executed.
void initializeShadowWorkingSet() {
  CHECK(getFlags()->cache_line_size == CacheLineSize);
  registerMemoryFaultHandler();
}

void initializeWorkingSet() {
  if (getFlags()->record_snapshots) {
    for (u32 i = 0; i < NumFreq; ++i)
      SizePerFreq[i].initialize(CircularBufferSizes[i]);
    Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
  }
}

static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
  if (MilliSec > 600000) {
    Unit = "min";
    return MilliSec / 60000;
  } else if (MilliSec > 10000) {
    Unit = "sec";
    return MilliSec / 1000;
  } else {
    Unit = "ms";
    return MilliSec;
  }
}

static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
  // We need a constant to avoid software divide support:
  static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
  static const u32 MegabyteCachelines = KilobyteCachelines << 10;

  if (NumOfCachelines > 10 * MegabyteCachelines) {
    Unit = "MB";
    return NumOfCachelines / MegabyteCachelines;
  } else if (NumOfCachelines > 10 * KilobyteCachelines) {
    Unit = "KB";
    return NumOfCachelines / KilobyteCachelines;
  } else {
    Unit = "Bytes";
    return NumOfCachelines * CacheLineSize;
  }
}

void reportWorkingSet() {
  const char *Unit;
  if (getFlags()->record_snapshots) {
    u32 Freq = 1;
    Report(" Total number of samples: %u\n", SnapshotNum);
    for (u32 i = 0; i < NumFreq; ++i) {
      u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
      Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
      // FIXME: report whether we wrapped around and thus whether we
      // have data on the whole run or just the last N samples.
      for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
        u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
        Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
               SizePerFreq[i][j]);
      }
      Freq = Freq << getFlags()->snapshot_step;
    }
  }

  // Get the working set size for the entire execution.
  u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
  u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
  Report(" %s: the total working set size: %u %s (%u cache lines)\n",
         SanitizerToolName, Size, Unit, NumOfCachelines);
}

int finalizeWorkingSet() {
  if (getFlags()->record_snapshots)
    Thread.joinThread();
  reportWorkingSet();
  if (getFlags()->record_snapshots) {
    for (u32 i = 0; i < NumFreq; ++i)
      SizePerFreq[i].free();
  }
  return 0;
}

} // namespace __esan