From f9947480f58b10f2460df24287f95357082b6563 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Fri, 12 May 2017 01:07:41 +0000
Subject: [XRay][compiler-rt] Runtime changes to support custom event logging

Summary:
This change implements support for the custom event logging sleds and
intrinsics at runtime. For now it only supports handling the sleds in
x86_64, with the implementations for other architectures stubbed out to
do nothing.

NOTE: Work in progress, uploaded for exposition/exploration purposes.

Depends on D27503, D30018, and D33032.

Reviewers: echristo, javed.absar, timshen

Subscribers: mehdi_amini, nemanjai, llvm-commits

Differential Revision: https://reviews.llvm.org/D30630

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@302857 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/xray/xray_AArch64.cc           |  13 ++-
 lib/xray/xray_arm.cc               |  12 +-
 lib/xray/xray_fdr_log_records.h    |   1 +
 lib/xray/xray_fdr_logging.cc       | 150 ++++++++++++++++++-------
 lib/xray/xray_fdr_logging_impl.h   | 223 +++++++++++++++++++++++--------------
 lib/xray/xray_interface.cc         |  21 +++-
 lib/xray/xray_interface_internal.h |   2 +
 lib/xray/xray_mips.cc              |   9 +-
 lib/xray/xray_mips64.cc            |   8 +-
 lib/xray/xray_powerpc64.cc         |   6 +
 lib/xray/xray_trampoline_x86_64.S  |  46 +++++++-
 lib/xray/xray_x86_64.cc            |  36 ++++++
 12 files changed, 387 insertions(+), 140 deletions(-)

(limited to 'lib/xray')
diff --git a/lib/xray/xray_AArch64.cc b/lib/xray/xray_AArch64.cc
index 8d1c7c5d8..372c1ad87 100644
--- a/lib/xray/xray_AArch64.cc
+++ b/lib/xray/xray_AArch64.cc
@@ -18,8 +18,7 @@
 #include <atomic>
 #include <cassert>
 
-
-extern "C" void __clear_cache(void* start, void* end);
+extern "C" void __clear_cache(void *start, void *end);
 
 namespace __xray {
 
@@ -86,8 +85,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
         reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
         uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
   }
-  __clear_cache(reinterpret_cast<char*>(FirstAddress),
-      reinterpret_cast<char*>(CurAddress));
+  __clear_cache(reinterpret_cast<char *>(FirstAddress),
+                reinterpret_cast<char *>(CurAddress));
   return true;
 }
 
@@ -107,6 +106,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
   return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySled &Sled)
+    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64?
+  return false;
+}
+
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc
index 26d673ec2..94e1c2638 100644
--- a/lib/xray/xray_arm.cc
+++ b/lib/xray/xray_arm.cc
@@ -18,7 +18,7 @@
 #include <atomic>
 #include <cassert>
 
-extern "C" void __clear_cache(void* start, void* end);
+extern "C" void __clear_cache(void *start, void *end);
 
 namespace __xray {
 
@@ -122,8 +122,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
         reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
         uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
   }
-  __clear_cache(reinterpret_cast<char*>(FirstAddress),
-      reinterpret_cast<char*>(CurAddress));
+  __clear_cache(reinterpret_cast<char *>(FirstAddress),
+                reinterpret_cast<char *>(CurAddress));
   return true;
 }
 
@@ -143,6 +143,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
   return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySled &Sled)
+    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm?
+  return false;
+}
+
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h
index 36d9410d1..3d6d38892 100644
--- a/lib/xray/xray_fdr_log_records.h
+++ b/lib/xray/xray_fdr_log_records.h
@@ -29,6 +29,7 @@ struct alignas(16) MetadataRecord {
     NewCPUId,
     TSCWrap,
     WalltimeMarker,
+    CustomEventMarker,
   };
   // Use 7 bits to identify this record type.
   /* RecordKinds */ uint8_t RecordKind : 7;
diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc
index e538b477a..a7e1382c3 100644
--- a/lib/xray/xray_fdr_logging.cc
+++ b/lib/xray/xray_fdr_logging.cc
@@ -41,45 +41,12 @@ namespace __xray {
 // Global BufferQueue.
 std::shared_ptr<BufferQueue> BQ;
 
-__sanitizer::atomic_sint32_t LoggingStatus = {
-    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-
 __sanitizer::atomic_sint32_t LogFlushStatus = {
     XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
 
-std::unique_ptr<FDRLoggingOptions> FDROptions;
-
-XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
-                                 void *Options,
-                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  if (OptionsSize != sizeof(FDRLoggingOptions))
-    return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
-        &LoggingStatus, __sanitizer::memory_order_acquire));
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  if (!__sanitizer::atomic_compare_exchange_strong(
-          &LoggingStatus, &CurrentStatus,
-          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
-          __sanitizer::memory_order_release))
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-
-  FDROptions.reset(new FDRLoggingOptions());
-  memcpy(FDROptions.get(), Options, OptionsSize);
-  bool Success = false;
-  BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
-  if (!Success) {
-    Report("BufferQueue init failed.\n");
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  }
-
-  // Install the actual handleArg0 handler after initialising the buffers.
-  __xray_set_handler(fdrLoggingHandleArg0);
+FDRLoggingOptions FDROptions;
 
-  __sanitizer::atomic_store(&LoggingStatus,
-                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
-                            __sanitizer::memory_order_release);
-  Report("XRay FDR init successful.\n");
-  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-}
+__sanitizer::SpinMutex FDROptionsMutex;
 
 // Must finalize before flushing.
 XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
@@ -108,7 +75,11 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
   //      (fixed-sized) and let the tools reading the buffers deal with the data
   //      afterwards.
   //
-  int Fd = FDROptions->Fd;
+  int Fd = -1;
+  {
+    __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
+    Fd = FDROptions.Fd;
+  }
   if (Fd == -1)
     Fd = getLogFD();
   if (Fd == -1) {
@@ -120,8 +91,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
 
   // Test for required CPU features and cache the cycle frequency
   static bool TSCSupported = probeRequiredCPUFeatures();
-  static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency()
-                                   : __xray::NanosecondsPerSecond;
+  static uint64_t CycleFrequency =
+      TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
 
   XRayFileHeader Header;
   Header.Version = 1;
@@ -192,8 +163,8 @@ XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT {
   return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
 }
 
-void fdrLoggingHandleArg0(int32_t FuncId,
-                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+static std::tuple<uint64_t, unsigned char>
+getTimestamp() XRAY_NEVER_INSTRUMENT {
   // We want to get the TSC as early as possible, so that we can check whether
   // we've seen this CPU before. We also do it before we load anything else, to
   // allow for forward progress with the scheduling.
@@ -203,7 +174,7 @@ void fdrLoggingHandleArg0(int32_t FuncId,
   // Test once for required CPU features
   static bool TSCSupported = probeRequiredCPUFeatures();
 
-  if(TSCSupported) {
+  if (TSCSupported) {
     TSC = __xray::readTSC(CPU);
   } else {
     // FIXME: This code needs refactoring as it appears in multiple locations
@@ -216,9 +187,102 @@ void fdrLoggingHandleArg0(int32_t FuncId,
     CPU = 0;
     TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
   }
+  return std::make_tuple(TSC, CPU);
+}
+
+void fdrLoggingHandleArg0(int32_t FuncId,
+                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+  auto TSC_CPU = getTimestamp();
+  __xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU),
+                                           std::get<1>(TSC_CPU), clock_gettime,
+                                           LoggingStatus, BQ);
+}
 
-  __xray_fdr_internal::processFunctionHook(FuncId, Entry, TSC, CPU,
-                                           clock_gettime, LoggingStatus, BQ);
+void fdrLoggingHandleCustomEvent(void *Event,
+                                 std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
+  using namespace __xray_fdr_internal;
+  auto TSC_CPU = getTimestamp();
+  auto &TSC = std::get<0>(TSC_CPU);
+  auto &CPU = std::get<1>(TSC_CPU);
+  thread_local bool Running = false;
+  RecursionGuard Guard{Running};
+  if (!Guard) {
+    assert(Running && "RecursionGuard is buggy!");
+    return;
+  }
+  if (EventSize > std::numeric_limits<int32_t>::max()) {
+    using Empty = struct {};
+    static Empty Once = [&] {
+      Report("Event size too large = %zu ; > max = %d\n", EventSize,
+             std::numeric_limits<int32_t>::max());
+      return Empty();
+    }();
+    (void)Once;
+  }
+  int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+  if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime))
+    return;
+
+  // Here we need to prepare the log to handle:
+  //   - The metadata record we're going to write. (16 bytes)
+  //   - The additional data we're going to write. Currently, that's the size of
+  //   the event we're going to dump into the log as free-form bytes.
+  if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) {
+    LocalBQ = nullptr;
+    return;
+  }
+
+  // Write the custom event metadata record, which consists of the following
+  // information:
+  //   - 8 bytes (64-bits) for the full TSC when the event started.
+  //   - 4 bytes (32-bits) for the length of the data.
+  MetadataRecord CustomEvent;
+  CustomEvent.Type = uint8_t(RecordType::Metadata);
+  CustomEvent.RecordKind =
+      uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
+  constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU));
+  std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
+  std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+  std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent));
+  RecordPtr += sizeof(CustomEvent);
+  std::memcpy(RecordPtr, Event, ReducedEventSize);
+  endBufferIfFull();
+}
+
+XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
+                                 void *Options,
+                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  if (OptionsSize != sizeof(FDRLoggingOptions))
+    return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
+        &LoggingStatus, __sanitizer::memory_order_acquire));
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!__sanitizer::atomic_compare_exchange_strong(
+          &LoggingStatus, &CurrentStatus,
+          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+          __sanitizer::memory_order_release))
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+
+  {
+    __sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
+    memcpy(&FDROptions, Options, OptionsSize);
+  }
+
+  bool Success = false;
+  BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
+  if (!Success) {
+    Report("BufferQueue init failed.\n");
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  }
+
+  // Install the actual handleArg0 handler after initialising the buffers.
+  __xray_set_handler(fdrLoggingHandleArg0);
+  __xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
+
+  __sanitizer::atomic_store(&LoggingStatus,
+                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+                            __sanitizer::memory_order_release);
+  Report("XRay FDR init successful.\n");
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
 }
 
 } // namespace __xray
diff --git a/lib/xray/xray_fdr_logging_impl.h b/lib/xray/xray_fdr_logging_impl.h
index ce360cb03..c801babc3 100644
--- a/lib/xray/xray_fdr_logging_impl.h
+++ b/lib/xray/xray_fdr_logging_impl.h
@@ -37,6 +37,9 @@
 
 namespace __xray {
 
+__sanitizer::atomic_sint32_t LoggingStatus = {
+    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
 /// We expose some of the state transitions when FDR logging mode is operating
 /// such that we can simulate a series of log events that may occur without
 /// and test with determinism without worrying about the real CPU time.
@@ -123,12 +126,21 @@ thread_local uint8_t NumTailCalls = 0;
 constexpr auto MetadataRecSize = sizeof(MetadataRecord);
 constexpr auto FunctionRecSize = sizeof(FunctionRecord);
 
+// We use a thread_local variable to keep track of which CPUs we've already
+// run, and the TSC times for these CPUs. This allows us to stop repeating the
+// CPU field in the function records.
+//
+// We assume that we'll support only 65536 CPUs for x86_64.
+thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
+thread_local uint64_t LastTSC = 0;
+thread_local uint64_t LastFunctionEntryTSC = 0;
+
 class ThreadExitBufferCleanup {
-  std::weak_ptr<BufferQueue> Buffers;
+  std::shared_ptr<BufferQueue> &Buffers;
   BufferQueue::Buffer &Buffer;
 
 public:
-  explicit ThreadExitBufferCleanup(std::weak_ptr<BufferQueue> BQ,
+  explicit ThreadExitBufferCleanup(std::shared_ptr<BufferQueue> &BQ,
                                    BufferQueue::Buffer &Buffer)
       XRAY_NEVER_INSTRUMENT : Buffers(BQ),
                               Buffer(Buffer) {}
@@ -142,17 +154,24 @@ public:
     // the queue.
     assert((RecordPtr + MetadataRecSize) - static_cast<char *>(Buffer.Buffer) >=
            static_cast<ptrdiff_t>(MetadataRecSize));
-    if (auto BQ = Buffers.lock()) {
+    if (Buffers) {
       writeEOBMetadata();
-      auto EC = BQ->releaseBuffer(Buffer);
+      auto EC = Buffers->releaseBuffer(Buffer);
       if (EC != BufferQueue::ErrorCode::Ok)
         Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
                BufferQueue::getErrorString(EC));
+      Buffers = nullptr;
       return;
     }
   }
 };
 
+// Make sure a thread that's ever called handleArg0 has a thread-local
+// live reference to the buffer queue for this particular instance of
+// FDRLogging, and that we're going to clean it up when the thread exits.
+thread_local std::shared_ptr<BufferQueue> LocalBQ = nullptr;
+thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer);
+
 class RecursionGuard {
   bool &Running;
   const bool Valid;
@@ -176,7 +195,7 @@ public:
   }
 };
 
-static inline bool loggingInitialized(
+inline bool loggingInitialized(
     const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT {
   return __sanitizer::atomic_load(&LoggingStatus,
                                   __sanitizer::memory_order_acquire) ==
@@ -185,7 +204,7 @@ static inline bool loggingInitialized(
 
 } // namespace
 
-static inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
+inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
                                           char *&MemPtr) XRAY_NEVER_INSTRUMENT {
   static constexpr int InitRecordsCount = 2;
   std::aligned_storage<sizeof(MetadataRecord)>::type Records[InitRecordsCount];
@@ -222,9 +241,8 @@ static inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
   NumTailCalls = 0;
 }
 
-static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
-                                                           struct timespec *))
-    XRAY_NEVER_INSTRUMENT {
+inline void setupNewBuffer(int (*wall_clock_reader)(
+    clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
   RecordPtr = static_cast<char *>(Buffer.Buffer);
   pid_t Tid = syscall(SYS_gettid);
   timespec TS{0, 0};
@@ -235,7 +253,7 @@ static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
   NumTailCalls = 0;
 }
 
-static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
+inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
                                          char *&MemPtr) XRAY_NEVER_INSTRUMENT {
   MetadataRecord NewCPUId;
   NewCPUId.Type = uint8_t(RecordType::Metadata);
@@ -253,12 +271,12 @@ static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
   NumTailCalls = 0;
 }
 
-static inline void writeNewCPUIdMetadata(uint16_t CPU,
+inline void writeNewCPUIdMetadata(uint16_t CPU,
                                          uint64_t TSC) XRAY_NEVER_INSTRUMENT {
   writeNewCPUIdMetadata(CPU, TSC, RecordPtr);
 }
 
-static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
+inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
   MetadataRecord EOBMeta;
   EOBMeta.Type = uint8_t(RecordType::Metadata);
   EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer);
@@ -269,11 +287,11 @@ static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
   NumTailCalls = 0;
 }
 
-static inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
+inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
   writeEOBMetadata(RecordPtr);
 }
 
-static inline void writeTSCWrapMetadata(uint64_t TSC,
+inline void writeTSCWrapMetadata(uint64_t TSC,
                                         char *&MemPtr) XRAY_NEVER_INSTRUMENT {
   MetadataRecord TSCWrap;
   TSCWrap.Type = uint8_t(RecordType::Metadata);
@@ -289,11 +307,11 @@ static inline void writeTSCWrapMetadata(uint64_t TSC,
   NumTailCalls = 0;
 }
 
-static inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
   writeTSCWrapMetadata(TSC, RecordPtr);
 }
 
-static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
+inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
                                        XRayEntryType EntryType,
                                        char *&MemPtr) XRAY_NEVER_INSTRUMENT {
   std::aligned_storage<sizeof(FunctionRecord), alignof(FunctionRecord)>::type
@@ -339,6 +357,17 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
     FuncRecord.RecordKind =
         uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
     break;
+  case XRayEntryType::CUSTOM_EVENT: {
+    // This is a bug in patching, so we'll report it once and move on.
+    static bool Once = [&] {
+      Report("Internal error: patched an XRay custom event call as a function; "
+             "func id = %d\n",
+             FuncId);
+      return true;
+    }();
+    (void)Once;
+    return;
+  }
   }
 
   std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord));
@@ -346,8 +375,9 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
 }
 
 static uint64_t thresholdTicks() {
-  static uint64_t TicksPerSec = probeRequiredCPUFeatures() ? getTSCFrequency() :
-                                __xray::NanosecondsPerSecond;
+  static uint64_t TicksPerSec = probeRequiredCPUFeatures()
+                                    ? getTSCFrequency()
+                                    : __xray::NanosecondsPerSecond;
   static const uint64_t ThresholdTicks =
       TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000;
   return ThresholdTicks;
@@ -397,9 +427,8 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
     RewindingRecordPtr -= FunctionRecSize;
     RewindingTSC -= ExpectedTailExit.TSCDelta;
     AlignedFuncStorage FunctionEntryBuffer;
-    const auto &ExpectedFunctionEntry =
-        *reinterpret_cast<FunctionRecord *>(std::memcpy(
-            &FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize));
+    const auto &ExpectedFunctionEntry = *reinterpret_cast<FunctionRecord *>(
+        std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize));
     assert(ExpectedFunctionEntry.RecordKind ==
                uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
            "Expected to find function entry when rewinding tail call.");
@@ -422,7 +451,7 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
   }
 }
 
-static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
+inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
   auto EC = BQ->releaseBuffer(Buffer);
   if (EC != BufferQueue::ErrorCode::Ok) {
     Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
@@ -432,11 +461,29 @@ static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
   return true;
 }
 
-static inline void processFunctionHook(
-    int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU,
-    int (*wall_clock_reader)(clockid_t, struct timespec *),
-    __sanitizer::atomic_sint32_t &LoggingStatus,
-    const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT {
+inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t,
+                                                          struct timespec *),
+                                 size_t MaxSize) XRAY_NEVER_INSTRUMENT {
+  char *BufferStart = static_cast<char *>(Buffer.Buffer);
+  if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) {
+    writeEOBMetadata();
+    if (!releaseThreadLocalBuffer(LocalBQ.get()))
+      return false;
+    auto EC = LocalBQ->getBuffer(Buffer);
+    if (EC != BufferQueue::ErrorCode::Ok) {
+      Report("Failed to acquire a buffer; error=%s\n",
+             BufferQueue::getErrorString(EC));
+      return false;
+    }
+    setupNewBuffer(wall_clock_reader);
+  }
+  return true;
+}
+
+inline bool isLogInitializedAndReady(
+    std::shared_ptr<BufferQueue> &LocalBQ, uint64_t TSC, unsigned char CPU,
+    int (*wall_clock_reader)(clockid_t,
+                             struct timespec *)) XRAY_NEVER_INSTRUMENT {
   // Bail out right away if logging is not initialized yet.
   // We should take the opportunity to release the buffer though.
   auto Status = __sanitizer::atomic_load(&LoggingStatus,
@@ -446,44 +493,19 @@ static inline void processFunctionHook(
         (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
          Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
       writeEOBMetadata();
-      if (!releaseThreadLocalBuffer(BQ.get()))
-        return;
+      if (!releaseThreadLocalBuffer(LocalBQ.get()))
+        return false;
       RecordPtr = nullptr;
+      LocalBQ = nullptr;
+      return false;
     }
-    return;
-  }
-
-  // We use a thread_local variable to keep track of which CPUs we've already
-  // run, and the TSC times for these CPUs. This allows us to stop repeating the
-  // CPU field in the function records.
-  //
-  // We assume that we'll support only 65536 CPUs for x86_64.
-  thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
-  thread_local uint64_t LastTSC = 0;
-  thread_local uint64_t LastFunctionEntryTSC = 0;
-
-  // Make sure a thread that's ever called handleArg0 has a thread-local
-  // live reference to the buffer queue for this particular instance of
-  // FDRLogging, and that we're going to clean it up when the thread exits.
-  thread_local auto LocalBQ = BQ;
-  thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer);
-
-  // Prevent signal handler recursion, so in case we're already in a log writing
-  // mode and the signal handler comes in (and is also instrumented) then we
-  // don't want to be clobbering potentially partial writes already happening in
-  // the thread. We use a simple thread_local latch to only allow one on-going
-  // handleArg0 to happen at any given time.
-  thread_local bool Running = false;
-  RecursionGuard Guard{Running};
-  if (!Guard) {
-    assert(Running == true && "RecursionGuard is buggy!");
-    return;
+    return false;
   }
 
   if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) {
     writeEOBMetadata();
-    if (!releaseThreadLocalBuffer(BQ.get()))
-      return;
+    if (!releaseThreadLocalBuffer(LocalBQ.get()))
+      return false;
     RecordPtr = nullptr;
   }
 
@@ -496,19 +518,57 @@ static inline void processFunctionHook(
           LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
         Report("Failed to acquire a buffer; error=%s\n",
                BufferQueue::getErrorString(EC));
-      return;
+      return false;
     }
 
     setupNewBuffer(wall_clock_reader);
   }
 
   if (CurrentCPU == std::numeric_limits<uint16_t>::max()) {
-    // This means this is the first CPU this thread has ever run on. We set the
-    // current CPU and record this as the first TSC we've seen.
+    // This means this is the first CPU this thread has ever run on. We set
+    // the current CPU and record this as the first TSC we've seen.
     CurrentCPU = CPU;
     writeNewCPUIdMetadata(CPU, TSC);
   }
 
+  return true;
+} // namespace __xray_fdr_internal
+
+inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
+  auto BufferStart = static_cast<char *>(Buffer.Buffer);
+  if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
+    writeEOBMetadata();
+    if (!releaseThreadLocalBuffer(LocalBQ.get()))
+      return;
+    RecordPtr = nullptr;
+  }
+}
+
+inline void processFunctionHook(
+    int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU,
+    int (*wall_clock_reader)(clockid_t, struct timespec *),
+    __sanitizer::atomic_sint32_t &LoggingStatus,
+    const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT {
+  // Prevent signal handler recursion, so in case we're already in a log writing
+  // mode and the signal handler comes in (and is also instrumented) then we
+  // don't want to be clobbering potentially partial writes already happening in
+  // the thread. We use a simple thread_local latch to only allow one on-going
+  // handleArg0 to happen at any given time.
+  thread_local bool Running = false;
+  RecursionGuard Guard{Running};
+  if (!Guard) {
+    assert(Running == true && "RecursionGuard is buggy!");
+    return;
+  }
+
+  // In case the reference has been cleaned up before, we make sure we
+  // initialize it to the provided BufferQueue.
+  if (LocalBQ == nullptr)
+    LocalBQ = BQ;
+
+  if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader))
+    return;
+
   // Before we go setting up writing new function entries, we need to be really
   // careful about the pointer math we're doing. This means we need to ensure
   // that the record we are about to write is going to fit into the buffer,
@@ -545,24 +605,14 @@ static inline void processFunctionHook(
   // bytes in the end of the buffer, we need to write out the EOB, get a new
   // Buffer, set it up properly before doing any further writing.
   //
-  char *BufferStart = static_cast<char *>(Buffer.Buffer);
-  if ((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart <
-      static_cast<ptrdiff_t>(MetadataRecSize)) {
-    writeEOBMetadata();
-    if (!releaseThreadLocalBuffer(LocalBQ.get()))
-      return;
-    auto EC = LocalBQ->getBuffer(Buffer);
-    if (EC != BufferQueue::ErrorCode::Ok) {
-      Report("Failed to acquire a buffer; error=%s\n",
-             BufferQueue::getErrorString(EC));
-      return;
-    }
-    setupNewBuffer(wall_clock_reader);
+  if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) {
+    LocalBQ = nullptr;
+    return;
   }
 
   // By this point, we are now ready to write at most 24 bytes (one metadata
   // record and one function record).
-  BufferStart = static_cast<char *>(Buffer.Buffer);
+  auto BufferStart = static_cast<char *>(Buffer.Buffer);
   assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart >=
              static_cast<ptrdiff_t>(MetadataRecSize) &&
          "Misconfigured BufferQueue provided; Buffer size not large enough.");
@@ -586,7 +636,6 @@ static inline void processFunctionHook(
   //     FunctionRecord. In this case we write down just a FunctionRecord with
   //     the correct TSC delta.
   //
-
   uint32_t RecordTSCDelta = 0;
   if (CPU != CurrentCPU) {
     // We've moved to a new CPU.
@@ -619,21 +668,27 @@ static inline void processFunctionHook(
       break;
     rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId);
     return; // without writing log.
+  case XRayEntryType::CUSTOM_EVENT: {
+    // This is a bug in patching, so we'll report it once and move on.
+    static bool Once = [&] {
+      Report("Internal error: patched an XRay custom event call as a function; "
+             "func id = %d",
+             FuncId);
+      return true;
+    }();
+    (void)Once;
+    return;
+  }
   }
 
   writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr);
 
   // If we've exhausted the buffer by this time, we then release the buffer to
   // make sure that other threads may start using this buffer.
-  if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
-    writeEOBMetadata();
-    if (!releaseThreadLocalBuffer(LocalBQ.get()))
-      return;
-    RecordPtr = nullptr;
-  }
+  endBufferIfFull();
 }
 
 } // namespace __xray_fdr_internal
-
 } // namespace __xray
+
 #endif // XRAY_XRAY_FDR_LOGGING_IMPL_H
diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc
index 26f0ab122..c437a72e3 100644
--- a/lib/xray/xray_interface.cc
+++ b/lib/xray/xray_interface.cc
@@ -50,6 +50,9 @@ __sanitizer::atomic_uintptr_t XRayPatchedFunction{0};
 // This is the function to call from the arg1-enabled sleds/trampolines.
 __sanitizer::atomic_uintptr_t XRayArgLogger{0};
 
+// This is the function to call when we encounter a custom event log call.
+__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0};
+
 // MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo
 // any successful mprotect(...) changes. This is used to make a page writeable
 // and executable, and upon destruction if it was successful in doing so returns
@@ -97,7 +100,19 @@ int __xray_set_handler(void (*entry)(int32_t,
                                __sanitizer::memory_order_acquire)) {
 
     __sanitizer::atomic_store(&__xray::XRayPatchedFunction,
-                              reinterpret_cast<uint64_t>(entry),
+                              reinterpret_cast<uintptr_t>(entry),
+                              __sanitizer::memory_order_release);
+    return 1;
+  }
+  return 0;
+}
+
+int __xray_set_customevent_handler(void (*entry)(void *, size_t))
+    XRAY_NEVER_INSTRUMENT {
+  if (__sanitizer::atomic_load(&XRayInitialized,
+                               __sanitizer::memory_order_acquire)) {
+    __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent,
+                              reinterpret_cast<uintptr_t>(entry),
                               __sanitizer::memory_order_release);
     return 1;
   }
@@ -161,6 +176,9 @@ inline bool patchSled(const XRaySledEntry &Sled, bool Enable,
   case XRayEntryType::LOG_ARGS_ENTRY:
     Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry);
     break;
+  case XRayEntryType::CUSTOM_EVENT:
+    Success = patchCustomEvent(Enable, FuncId, Sled);
+    break;
   default:
     Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind));
     return false;
@@ -301,6 +319,7 @@ int __xray_set_handler_arg1(void (*Handler)(int32_t, XRayEntryType, uint64_t)) {
                             __sanitizer::memory_order_release);
   return 1;
 }
+
 int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); }
 
 uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h
index ef0c6b158..4a2784612 100644
--- a/lib/xray/xray_interface_internal.h
+++ b/lib/xray/xray_interface_internal.h
@@ -60,6 +60,7 @@ bool patchFunctionEntry(bool Enable, uint32_t FuncId,
 bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
 bool patchFunctionTailExit(bool Enable, uint32_t FuncId,
                            const XRaySledEntry &Sled);
+bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
 
 } // namespace __xray
 
@@ -70,6 +71,7 @@ extern void __xray_FunctionEntry();
 extern void __xray_FunctionExit();
 extern void __xray_FunctionTailExit();
 extern void __xray_ArgLoggerEntry();
+extern void __xray_CustomEvent();
 }
 
 #endif
diff --git a/lib/xray/xray_mips.cc b/lib/xray/xray_mips.cc
index c8ff39936..cd863304d 100644
--- a/lib/xray/xray_mips.cc
+++ b/lib/xray/xray_mips.cc
@@ -95,7 +95,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
   //   B #44
 
   if (Enable) {
-    uint32_t LoTracingHookAddr = reinterpret_cast<int32_t>(TracingHook) & 0xffff;
+    uint32_t LoTracingHookAddr =
+        reinterpret_cast<int32_t>(TracingHook) & 0xffff;
     uint32_t HiTracingHookAddr =
         (reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff;
     uint32_t LoFunctionID = FuncId & 0xffff;
@@ -151,6 +152,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
   return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips?
+  return false;
+}
+
 } // namespace __xray
 
 extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_mips64.cc b/lib/xray/xray_mips64.cc
index 21136848c..fa8fdd5ab 100644
--- a/lib/xray/xray_mips64.cc
+++ b/lib/xray/xray_mips64.cc
@@ -93,7 +93,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
   if (Enable) {
     uint32_t LoTracingHookAddr =
         reinterpret_cast<int64_t>(TracingHook) & 0xffff;
-    uint32_t HiTracingHookAddr = (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
+    uint32_t HiTracingHookAddr =
+        (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
     uint32_t HigherTracingHookAddr =
         (reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff;
     uint32_t HighestTracingHookAddr =
@@ -160,6 +161,11 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
   return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips64?
+  return false;
+}
 } // namespace __xray
 
 extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_powerpc64.cc b/lib/xray/xray_powerpc64.cc
index 6a7554cfc..ab03cb100 100644
--- a/lib/xray/xray_powerpc64.cc
+++ b/lib/xray/xray_powerpc64.cc
@@ -93,6 +93,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
 // FIXME: Maybe implement this better?
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in powerpc64?
+  return false;
+}
+
 } // namespace __xray
 
 extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S
index 847ecef8d..b59eedc4b 100644
--- a/lib/xray/xray_trampoline_x86_64.S
+++ b/lib/xray/xray_trampoline_x86_64.S
@@ -176,9 +176,15 @@ __xray_ArgLoggerEntry:
 	je	.Larg1entryFail
 
 .Larg1entryLog:
-	movq	%rdi, %rdx	// first argument will become the third
-	xorq	%rsi, %rsi	// XRayEntryType::ENTRY into the second
-	movl	%r10d, %edi	// 32-bit function ID becomes the first
+
+  // First argument will become the third
+	movq	%rdi, %rdx
+
+  // XRayEntryType::ENTRY into the second
+	xorq	%rsi, %rsi
+
+	// 32-bit function ID becomes the first
+	movl	%r10d, %edi
 	callq	*%rax
 
 .Larg1entryFail:
@@ -189,4 +195,38 @@ __xray_ArgLoggerEntry:
 	.size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry
 	.cfi_endproc
 
+//===----------------------------------------------------------------------===//
+
+	.global __xray_CustomEvent
+	.align 16, 0x90
+	.type __xray_CustomEvent,@function
+__xray_CustomEvent:
+  .cfi_startproc
+	subq $16, %rsp
+	.cfi_def_cfa_offset 24
+	movq %rbp, 8(%rsp)
+	movq %rax, 0(%rsp)
+
+	// We take two arguments to this trampoline, which should be in rdi	and rsi
+	// already. We also make sure that we stash %rax because we use that register
+	// to call the logging handler.
+	movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax
+	testq %rax,%rax
+	je .LcustomEventCleanup
+
+	// At this point we know that rcx and rdx already has the data, so we just
+	// call the logging handler.
+  callq *%rax
+
+.LcustomEventCleanup:
+	movq 0(%rsp), %rax
+	movq 8(%rsp), %rbp
+	addq $16, %rsp
+	.cfi_def_cfa_offset 8
+	retq
+
+.Ltmp8:
+	.size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent
+	.cfi_endproc
+
 NO_EXEC_STACK_DIRECTIVE
diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc
index 2e9a8d270..e34806fa1 100644
--- a/lib/xray/xray_x86_64.cc
+++ b/lib/xray/xray_x86_64.cc
@@ -75,8 +75,10 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
 static constexpr uint8_t CallOpCode = 0xe8;
 static constexpr uint16_t MovR10Seq = 0xba41;
 static constexpr uint16_t Jmp9Seq = 0x09eb;
+static constexpr uint16_t Jmp20Seq = 0x14eb;
 static constexpr uint8_t JmpOpCode = 0xe9;
 static constexpr uint8_t RetOpCode = 0xc3;
+static constexpr uint16_t NopwSeq = 0x9066;
 
 static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
 static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
@@ -201,6 +203,40 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
   return true;
 }
 
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   jmp +19          // 2 bytes
+  //   ...
+  //
+  // With the following:
+  //
+  //   nopw             // 2 bytes*
+  //   ...
+  //
+  // We need to do this in the following order:
+  //
+  // 1. Overwrite the 5-byte nop with the call (relative), where (relative) is
+  //    the relative offset to the __xray_CustomEvent trampoline.
+  // 2. Do a two-byte atomic write over the 'jmp +24' to turn it into a 'nopw'.
+  //    This allows us to "enable" this code once the changes have committed.
+  //
+  // The "unpatch" should just turn the 'nopw' back to a 'jmp +24'.
+  //
+  if (Enable) {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
+        std::memory_order_release);
+  }
+  return false;
+}
+
 // We determine whether the CPU we're running on has the correct features we
 // need. In x86_64 this will be rdtscp support.
 bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
-- 
cgit v1.2.3