author     Dean Michael Berris <dberris@google.com>    2016-09-08 00:28:26 +0000
committer  Dean Michael Berris <dberris@google.com>    2016-09-08 00:28:26 +0000
commit     4753c5bdd181bb7152d0dbdb203036310542dfdb (patch)
tree       7defda932fa46dbd40c06cd11790f79efa029f75 /lib/xray
parent     101412adea40c78ae9da9efc3783aab20d367838 (diff)
[XRay] ARM 32-bit no-Thumb support in compiler-rt
This is a port of XRay to ARM 32-bit, without Thumb support yet. This is one
of three commits to different repositories of the XRay ARM port. The other
two are:
1. https://reviews.llvm.org/D23931 (LLVM)
2. https://reviews.llvm.org/D23932 (Clang test)

Differential Revision: https://reviews.llvm.org/D23933

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@280890 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/xray')
-rw-r--r--  lib/xray/CMakeLists.txt              |   8
-rw-r--r--  lib/xray/xray_arm.cc                 | 131
-rw-r--r--  lib/xray/xray_inmemory_log.cc        |  52
-rw-r--r--  lib/xray/xray_interface.cc           | 141
-rw-r--r--  lib/xray/xray_interface_internal.h   |  22
-rw-r--r--  lib/xray/xray_trampoline_arm.S       |  65
-rw-r--r--  lib/xray/xray_x86_64.cc              | 116
7 files changed, 420 insertions, 115 deletions
diff --git a/lib/xray/CMakeLists.txt b/lib/xray/CMakeLists.txt
index bcd25297b..12d9b7a53 100644
--- a/lib/xray/CMakeLists.txt
+++ b/lib/xray/CMakeLists.txt
@@ -8,9 +8,17 @@ set(XRAY_SOURCES
)
set(x86_64_SOURCES
+ xray_x86_64.cc
xray_trampoline_x86_64.S
${XRAY_SOURCES})
+set(arm_SOURCES
+ xray_arm.cc
+ xray_trampoline_arm.S
+ ${XRAY_SOURCES})
+
+set(armhf_SOURCES ${arm_SOURCES})
+
include_directories(..)
include_directories(../../include)
diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc
new file mode 100644
index 000000000..60e7437c1
--- /dev/null
+++ b/lib/xray/xray_arm.cc
@@ -0,0 +1,131 @@
+//===-- xray_arm.cc ---------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of ARM-specific routines (32-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "xray_interface_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include <atomic>
+#include <cassert>
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum class PatchOpcodes : uint32_t
+{
+ PO_PushR0Lr = 0xE92D4001, // PUSH {r0, lr}
+ PO_BlxIp = 0xE12FFF3C, // BLX ip
+ PO_PopR0Lr = 0xE8BD4001, // POP {r0, lr}
+ PO_B20 = 0xEA000005 // B #20
+};
+
+// 0xUUUUWXYZ -> 0x000W0XYZ
+inline static uint32_t getMovwMask(const uint32_t Value) {
+ return (Value & 0xfff) | ((Value & 0xf000) << 4);
+}
+
+// 0xWXYZUUUU -> 0x000W0XYZ
+inline static uint32_t getMovtMask(const uint32_t Value) {
+ return getMovwMask(Value >> 16);
+}
+
+// Writes the following instructions:
+// MOVW R<regNo>, #<lower 16 bits of the |Value|>
+// MOVT R<regNo>, #<higher 16 bits of the |Value|>
+inline static uint32_t* write32bitLoadReg(uint8_t regNo, uint32_t* Address,
+ const uint32_t Value) {
+  // This is a fatal error: we cannot just report it and continue execution.
+ assert(regNo <= 15 && "Register number must be 0 to 15.");
+ // MOVW R, #0xWXYZ in machine code is 0xE30WRXYZ
+ *Address = (0xE3000000 | (uint32_t(regNo)<<12) | getMovwMask(Value));
+ Address++;
+ // MOVT R, #0xWXYZ in machine code is 0xE34WRXYZ
+ *Address = (0xE3400000 | (uint32_t(regNo)<<12) | getMovtMask(Value));
+ return Address + 1;
+}
+
+// Writes the following instructions:
+// MOVW r0, #<lower 16 bits of the |Value|>
+// MOVT r0, #<higher 16 bits of the |Value|>
+inline static uint32_t *Write32bitLoadR0(uint32_t *Address,
+ const uint32_t Value) {
+ return write32bitLoadReg(0, Address, Value);
+}
+
+// Writes the following instructions:
+// MOVW ip, #<lower 16 bits of the |Value|>
+// MOVT ip, #<higher 16 bits of the |Value|>
+inline static uint32_t *Write32bitLoadIP(uint32_t *Address,
+ const uint32_t Value) {
+ return write32bitLoadReg(12, Address, Value);
+}
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled, void (*TracingHook)()) {
+ // When |Enable| == true,
+  // we replace the following compile-time stub (sled):
+ //
+ // xray_sled_n:
+ // B #20
+ // 6 NOPs (24 bytes)
+ //
+ // With the following runtime patch:
+ //
+ // xray_sled_n:
+ // PUSH {r0, lr}
+ // MOVW r0, #<lower 16 bits of function ID>
+ // MOVT r0, #<higher 16 bits of function ID>
+ // MOVW ip, #<lower 16 bits of address of TracingHook>
+ // MOVT ip, #<higher 16 bits of address of TracingHook>
+ // BLX ip
+ // POP {r0, lr}
+ //
+ // Replacement of the first 4-byte instruction should be the last and atomic
+ // operation, so that the user code which reaches the sled concurrently
+ // either jumps over the whole sled, or executes the whole sled when the
+ // latter is ready.
+ //
+  // When |Enable| == false, we restore the first instruction in the sled to
+ // B #20
+
+ uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+ if (Enable) {
+ uint32_t *CurAddress = FirstAddress + 1;
+ CurAddress =
+ Write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId));
+ CurAddress =
+ Write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook));
+ *CurAddress = uint32_t(PatchOpcodes::PO_BlxIp);
+ CurAddress++;
+ *CurAddress = uint32_t(PatchOpcodes::PO_PopR0Lr);
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+ uint32_t(PatchOpcodes::PO_PushR0Lr), std::memory_order_release);
+ } else {
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+ uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
+ }
+ return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) {
+ return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry);
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+ const XRaySledEntry &Sled) {
+ return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+} // namespace __xray
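
The patching code above splits a 32-bit constant across a MOVW/MOVT pair and only swaps the sled's first word at the very end, so concurrent callers see either the old B #20 or the fully written sequence. Below is a minimal, standalone sketch of just the immediate-splitting and encoding step; encodeLoad32, the sample value 0x12345678, and the main() harness are illustrative and not part of the patch.

    #include <cstdint>
    #include <cstdio>

    // Mirror of getMovwMask/getMovtMask above: an ARM MOVW/MOVT immediate is a
    // 16-bit value split into a 4-bit field (bits 19:16) and a 12-bit field
    // (bits 11:0) of the instruction word.
    static uint32_t getMovwMask(uint32_t Value) {
      return (Value & 0xfff) | ((Value & 0xf000) << 4);
    }
    static uint32_t getMovtMask(uint32_t Value) { return getMovwMask(Value >> 16); }

    // Encode "MOVW Rd, #lo16(Value)" followed by "MOVT Rd, #hi16(Value)".
    static void encodeLoad32(uint8_t Rd, uint32_t Value, uint32_t Out[2]) {
      Out[0] = 0xE3000000u | (uint32_t(Rd) << 12) | getMovwMask(Value);
      Out[1] = 0xE3400000u | (uint32_t(Rd) << 12) | getMovtMask(Value);
    }

    int main() {
      uint32_t Insns[2];
      encodeLoad32(/*r0=*/0, 0x12345678u, Insns);
      // Expect MOVW r0, #0x5678 -> 0xE3050678 and MOVT r0, #0x1234 -> 0xE3410234,
      // matching the 0xE30WRXYZ / 0xE34WRXYZ patterns described in the comments.
      std::printf("MOVW: 0x%08X  MOVT: 0x%08X\n", Insns[0], Insns[1]);
      return 0;
    }
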
diff --git a/lib/xray/xray_inmemory_log.cc b/lib/xray/xray_inmemory_log.cc
index f4fd20846..286ba7122 100644
--- a/lib/xray/xray_inmemory_log.cc
+++ b/lib/xray/xray_inmemory_log.cc
@@ -24,7 +24,14 @@
#include <sys/types.h>
#include <thread>
#include <unistd.h>
-#include <x86intrin.h>
+
+#if defined(__x86_64__)
+ #include <x86intrin.h>
+#elif defined(__arm__)
+ static const int64_t NanosecondsPerSecond = 1000LL*1000*1000;
+#else
+ #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
#include "sanitizer_common/sanitizer_libc.h"
#include "xray/xray_records.h"
@@ -61,6 +68,7 @@ static void retryingWriteAll(int Fd, char *Begin, char *End) {
}
}
+#if defined(__x86_64__)
static std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin,
char *End) {
auto BytesToRead = std::distance(Begin, End);
@@ -103,6 +111,8 @@ static bool readValueFromFile(const char *Filename, long long *Value) {
return Result;
}
+#endif /* CPU architecture */
+
class ThreadExitFlusher {
int Fd;
XRayRecord *Start;
@@ -164,6 +174,7 @@ void __xray_InMemoryRawLog(int32_t FuncId, XRayEntryType Type) {
// Get the cycle frequency from SysFS on Linux.
long long CPUFrequency = -1;
+#if defined(__x86_64__)
if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz",
&CPUFrequency)) {
CPUFrequency *= 1000;
@@ -174,6 +185,20 @@ void __xray_InMemoryRawLog(int32_t FuncId, XRayEntryType Type) {
} else {
Report("Unable to determine CPU frequency for TSC accounting.");
}
+#elif defined(__arm__)
+ // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does
+ // not have a constant frequency like TSC on x86(_64), it may go faster
+ // or slower depending on CPU turbo or power saving mode. Furthermore,
+ // to read from CP15 on ARM a kernel modification or a driver is needed.
+ // We can not require this from users of compiler-rt.
+ // So on ARM we use clock_gettime() which gives the result in nanoseconds.
+ // To get the measurements per second, we scale this by the number of
+ // nanoseconds per second, pretending that the TSC frequency is 1GHz and
+ // one TSC tick is 1 nanosecond.
+ CPUFrequency = NanosecondsPerSecond;
+#else
+ #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
// Since we're here, we get to write the header. We set it up so that the
// header will only be written once, at the start, and let the threads
@@ -201,10 +226,29 @@ void __xray_InMemoryRawLog(int32_t FuncId, XRayEntryType Type) {
// First we get the useful data, and stuff it into the already aligned buffer
// through a pointer offset.
auto &R = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer)[Offset];
- unsigned CPU;
R.RecordType = RecordTypes::NORMAL;
- R.TSC = __rdtscp(&CPU);
- R.CPU = CPU;
+#if defined(__x86_64__)
+ {
+ unsigned CPU;
+ R.TSC = __rdtscp(&CPU);
+ R.CPU = CPU;
+ }
+#elif defined(__arm__)
+ {
+ timespec TS;
+ int result = clock_gettime(CLOCK_REALTIME, &TS);
+ if(result != 0)
+ {
+ Report("clock_gettime() returned %d, errno=%d.", result, int(errno));
+ TS.tv_sec = 0;
+ TS.tv_nsec = 0;
+ }
+ R.TSC = TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
+ R.CPU = 0;
+ }
+#else
+ #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
R.TId = TId;
R.Type = Type;
R.FuncId = FuncId;
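
On ARM the patch substitutes clock_gettime(CLOCK_REALTIME) for RDTSCP and treats one nanosecond as one tick of a 1 GHz pseudo-TSC. The following is a small self-contained sketch of that conversion under the same assumptions; readPseudoTSC and the main() harness are illustrative names, not part of the commit.

    #include <cerrno>
    #include <cstdint>
    #include <cstdio>
    #include <ctime>

    static const int64_t NanosecondsPerSecond = 1000LL * 1000 * 1000;

    // Pretend the TSC runs at 1 GHz: one tick equals one nanosecond of wall time,
    // with the same zero-fallback on failure as the logging code above.
    static uint64_t readPseudoTSC() {
      timespec TS;
      if (clock_gettime(CLOCK_REALTIME, &TS) != 0) {
        std::fprintf(stderr, "clock_gettime() failed, errno=%d\n", errno);
        TS.tv_sec = 0;
        TS.tv_nsec = 0;
      }
      return uint64_t(TS.tv_sec) * NanosecondsPerSecond + TS.tv_nsec;
    }

    int main() {
      uint64_t Before = readPseudoTSC();
      uint64_t After = readPseudoTSC();
      std::printf("elapsed pseudo-cycles: %llu\n",
                  (unsigned long long)(After - Before));
      return 0;
    }
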
diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc
index 5ef3fc7aa..360a6ad5b 100644
--- a/lib/xray/xray_interface.cc
+++ b/lib/xray/xray_interface.cc
@@ -26,6 +26,15 @@
namespace __xray {
+#if defined(__x86_64__)
+  // FIXME: The actual length is 11 bytes. Why was a length of 12 passed to mprotect()?
+ static const int16_t cSledLength = 12;
+#elif defined(__arm__)
+ static const int16_t cSledLength = 28;
+#else
+ #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
+
// This is the function to call when we encounter the entry or exit sleds.
std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction{nullptr};
@@ -64,13 +73,6 @@ public:
} // namespace __xray
-extern "C" {
-// The following functions have to be defined in assembler, on a per-platform
-// basis. See xray_trampoline_*.s files for implementations.
-extern void __xray_FunctionEntry();
-extern void __xray_FunctionExit();
-}
-
extern std::atomic<bool> XRayInitialized;
extern std::atomic<__xray::XRaySledMap> XRayInstrMap;
@@ -133,12 +135,13 @@ XRayPatchingStatus ControlPatching(bool Enable) {
if (InstrMap.Entries == 0)
return XRayPatchingStatus::NOT_INITIALIZED;
- int32_t FuncId = 1;
- static constexpr uint8_t CallOpCode = 0xe8;
- static constexpr uint16_t MovR10Seq = 0xba41;
- static constexpr uint16_t Jmp9Seq = 0x09eb;
- static constexpr uint8_t JmpOpCode = 0xe9;
- static constexpr uint8_t RetOpCode = 0xc3;
+ const uint64_t PageSize = GetPageSizeCached();
+  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) {
+ Report("System page size is not a power of two: %lld", PageSize);
+ return XRayPatchingStatus::FAILED;
+ }
+
+ uint32_t FuncId = 1;
uint64_t CurFun = 0;
for (std::size_t I = 0; I < InstrMap.Entries; I++) {
auto Sled = InstrMap.Sleds[I];
@@ -153,112 +156,28 @@ XRayPatchingStatus ControlPatching(bool Enable) {
// While we're here, we should patch the nop sled. To do that we mprotect
// the page containing the function to be writeable.
void *PageAlignedAddr =
- reinterpret_cast<void *>(Sled.Address & ~((2 << 16) - 1));
+ reinterpret_cast<void *>(Sled.Address & ~(PageSize-1));
std::size_t MProtectLen =
- (Sled.Address + 12) - reinterpret_cast<uint64_t>(PageAlignedAddr);
+ (Sled.Address + cSledLength) - reinterpret_cast<uint64_t>(PageAlignedAddr);
MProtectHelper Protector(PageAlignedAddr, MProtectLen);
if (Protector.MakeWriteable() == -1) {
printf("Failed mprotect: %d\n", errno);
return XRayPatchingStatus::FAILED;
}
- static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
- static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
- if (Sled.Kind == XRayEntryType::ENTRY) {
- // FIXME: Implement this in a more extensible manner, per-platform.
- // Here we do the dance of replacing the following sled:
- //
- // xray_sled_n:
- // jmp +9
- // <9 byte nop>
- //
- // With the following:
- //
- // mov r10d, <function id>
- // call <relative 32bit offset to entry trampoline>
- //
- // We need to do this in the following order:
- //
- // 1. Put the function id first, 2 bytes from the start of the sled (just
- // after the 2-byte jmp instruction).
- // 2. Put the call opcode 6 bytes from the start of the sled.
- // 3. Put the relative offset 7 bytes from the start of the sled.
- // 4. Do an atomic write over the jmp instruction for the "mov r10d"
- // opcode and first operand.
- //
- // Prerequisite is to compute the relative offset to the
- // __xray_FunctionEntry function's address.
- int64_t TrampolineOffset =
- reinterpret_cast<int64_t>(__xray_FunctionEntry) -
- (static_cast<int64_t>(Sled.Address) + 11);
- if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
- Report("XRay Entry trampoline (%p) too far from sled (%p); distance = "
- "%ld\n",
- __xray_FunctionEntry, reinterpret_cast<void *>(Sled.Address),
- TrampolineOffset);
- continue;
- }
- if (Enable) {
- *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
- *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
- *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
- std::atomic_store_explicit(
- reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
- std::memory_order_release);
- } else {
- std::atomic_store_explicit(
- reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp9Seq,
- std::memory_order_release);
- // FIXME: Write out the nops still?
- }
- }
-
- if (Sled.Kind == XRayEntryType::EXIT) {
- // FIXME: Implement this in a more extensible manner, per-platform.
- // Here we do the dance of replacing the following sled:
- //
- // xray_sled_n:
- // ret
- // <10 byte nop>
- //
- // With the following:
- //
- // mov r10d, <function id>
- // jmp <relative 32bit offset to exit trampoline>
- //
- // 1. Put the function id first, 2 bytes from the start of the sled (just
- // after the 1-byte ret instruction).
- // 2. Put the jmp opcode 6 bytes from the start of the sled.
- // 3. Put the relative offset 7 bytes from the start of the sled.
- // 4. Do an atomic write over the jmp instruction for the "mov r10d"
- // opcode and first operand.
- //
- // Prerequisite is to compute the relative offset fo the
- // __xray_FunctionExit function's address.
- int64_t TrampolineOffset =
- reinterpret_cast<int64_t>(__xray_FunctionExit) -
- (static_cast<int64_t>(Sled.Address) + 11);
- if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
- Report("XRay Exit trampoline (%p) too far from sled (%p); distance = "
- "%ld\n",
- __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address),
- TrampolineOffset);
- continue;
- }
- if (Enable) {
- *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
- *reinterpret_cast<uint8_t *>(Sled.Address + 6) = JmpOpCode;
- *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
- std::atomic_store_explicit(
- reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
- std::memory_order_release);
- } else {
- std::atomic_store_explicit(
- reinterpret_cast<std::atomic<uint8_t> *>(Sled.Address), RetOpCode,
- std::memory_order_release);
- // FIXME: Write out the nops still?
- }
+ bool Success = false;
+ switch(Sled.Kind) {
+ case XRayEntryType::ENTRY:
+ Success = patchFunctionEntry(Enable, FuncId, Sled);
+ break;
+ case XRayEntryType::EXIT:
+ Success = patchFunctionExit(Enable, FuncId, Sled);
+ break;
+ default:
+ Report("Unsupported sled kind: %d", int(Sled.Kind));
+ continue;
}
+ (void)Success;
}
XRayPatching.store(false, std::memory_order_release);
PatchingSuccess = true;
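
The rewritten loop replaces the hard-coded (2 << 16) - 1 mask and 12-byte length with the cached page size and the per-architecture cSledLength, then dispatches to patchFunctionEntry/patchFunctionExit. The sketch below shows only the address-window arithmetic; computeProtectWindow and the example addresses are hypothetical names for illustration, not names from the patch.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Compute the region to pass to mprotect() for a sled of SledLength bytes at
    // SledAddress, given a power-of-two PageSize (as checked by the patched code).
    static void computeProtectWindow(uint64_t SledAddress, uint64_t PageSize,
                                     uint64_t SledLength, uint64_t *AlignedAddr,
                                     uint64_t *Len) {
      assert(PageSize != 0 && (PageSize & (PageSize - 1)) == 0 &&
             "page size must be a power of two");
      *AlignedAddr = SledAddress & ~(PageSize - 1);
      *Len = (SledAddress + SledLength) - *AlignedAddr;
    }

    int main() {
      uint64_t Aligned = 0, Len = 0;
      // A made-up sled address near a 4 KiB page boundary; 28 is the ARM sled size.
      computeProtectWindow(0x10234ff8, 4096, 28, &Aligned, &Len);
      std::printf("mprotect(0x%llx, %llu) covers both touched pages\n",
                  (unsigned long long)Aligned, (unsigned long long)Len);
      return 0;
    }
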
diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h
index 6208c110e..3465b67d5 100644
--- a/lib/xray/xray_interface_internal.h
+++ b/lib/xray/xray_interface_internal.h
@@ -16,18 +16,30 @@
#define XRAY_INTERFACE_INTERNAL_H
#include "xray/xray_interface.h"
+#include "sanitizer_common/sanitizer_platform.h"
#include <cstddef>
#include <cstdint>
extern "C" {
struct XRaySledEntry {
+#if SANITIZER_WORDSIZE == 64
uint64_t Address;
uint64_t Function;
unsigned char Kind;
unsigned char AlwaysInstrument;
unsigned char Padding[14]; // Need 32 bytes
+#elif SANITIZER_WORDSIZE == 32
+ uint32_t Address;
+ uint32_t Function;
+ unsigned char Kind;
+ unsigned char AlwaysInstrument;
+ unsigned char Padding[6]; // Need 16 bytes
+#else
+ #error "Unsupported word size."
+#endif
};
+
}
namespace __xray {
@@ -37,6 +49,16 @@ struct XRaySledMap {
size_t Entries;
};
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled);
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled);
+
} // namespace __xray
+extern "C" {
+// The following functions have to be defined in assembler, on a per-platform
+// basis. See xray_trampoline_*.S files for implementations.
+extern void __xray_FunctionEntry();
+extern void __xray_FunctionExit();
+}
+
#endif
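
The sled map is walked as a flat array, so XRaySledEntry must have a fixed size: 32 bytes on 64-bit targets and 16 bytes on 32-bit targets, hence the explicit padding. A quick standalone check of the 32-bit layout; SledEntry32 is an illustrative mirror of the struct above, not a name from the header.

    #include <cstdint>

    // Mirror of the 32-bit XRaySledEntry layout: 4 + 4 + 1 + 1 + 6 == 16 bytes,
    // so the instrumentation map can be indexed as a plain array of entries.
    struct SledEntry32 {
      uint32_t Address;
      uint32_t Function;
      unsigned char Kind;
      unsigned char AlwaysInstrument;
      unsigned char Padding[6];
    };

    static_assert(sizeof(SledEntry32) == 16,
                  "32-bit sled entries must be exactly 16 bytes");

    int main() { return 0; }
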
diff --git a/lib/xray/xray_trampoline_arm.S b/lib/xray/xray_trampoline_arm.S
new file mode 100644
index 000000000..225bb50ce
--- /dev/null
+++ b/lib/xray/xray_trampoline_arm.S
@@ -0,0 +1,65 @@
+ .syntax unified
+ .arch armv7
+ .fpu vfpv3
+ .code 32
+ .global _ZN6__xray19XRayPatchedFunctionE
+ @ Word-aligned function entry point
+ .p2align 2
+ @ Let C/C++ see the symbol
+ .global __xray_FunctionEntry
+ @ It preserves all registers except r0, r12(ip), r14(lr) and r15(pc)
+ @ Assume that "q" part of the floating-point registers is not used
+ @ for passing parameters to C/C++ functions.
+ .type __xray_FunctionEntry, %function
+ @ In C++ it is void extern "C" __xray_FunctionEntry(uint32_t FuncId) with
+ @ FuncId passed in r0 register.
+__xray_FunctionEntry:
+ PUSH {r1-r3,lr}
+ @ Save floating-point parameters of the instrumented function
+ VPUSH {d0-d7}
+ MOVW r1,#:lower16:_ZN6__xray19XRayPatchedFunctionE
+ MOVT r1,#:upper16:_ZN6__xray19XRayPatchedFunctionE
+ LDR r2, [r1]
+ @ Handler address is nullptr if handler is not set
+ CMP r2, #0
+ BEQ FunctionEntry_restore
+ @ Function ID is already in r0 (the first parameter).
+ @ r1=0 means that we are tracing an entry event
+ MOV r1, #0
+ @ Call the handler with 2 parameters in r0 and r1
+ BLX r2
+FunctionEntry_restore:
+ @ Restore floating-point parameters of the instrumented function
+ VPOP {d0-d7}
+ POP {r1-r3,pc}
+
+ @ Word-aligned function entry point
+ .p2align 2
+ @ Let C/C++ see the symbol
+ .global __xray_FunctionExit
+ @ Assume that d1-d7 are not used for the return value.
+ @ Assume that "q" part of the floating-point registers is not used for the
+ @ return value in C/C++.
+ .type __xray_FunctionExit, %function
+ @ In C++ it is extern "C" void __xray_FunctionExit(uint32_t FuncId) with
+ @ FuncId passed in r0 register.
+__xray_FunctionExit:
+ PUSH {r1-r3,lr}
+ @ Save the floating-point return value of the instrumented function
+ VPUSH {d0}
+ @ Load the handler address
+ MOVW r1,#:lower16:_ZN6__xray19XRayPatchedFunctionE
+ MOVT r1,#:upper16:_ZN6__xray19XRayPatchedFunctionE
+ LDR r2, [r1]
+ @ Handler address is nullptr if handler is not set
+ CMP r2, #0
+ BEQ FunctionExit_restore
+ @ Function ID is already in r0 (the first parameter).
+ @ 1 means that we are tracing an exit event
+ MOV r1, #1
+ @ Call the handler with 2 parameters in r0 and r1
+ BLX r2
+FunctionExit_restore:
+ @ Restore the floating-point return value of the instrumented function
+ VPOP {d0}
+ POP {r1-r3,pc}
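
Both trampolines load XRayPatchedFunction, skip the call when no handler is installed, and otherwise invoke it with the function ID in r0 and 0 (entry) or 1 (exit) in r1, after saving the caller-saved integer and floating-point argument/return registers. In C++ terms the call reduces to the sketch below; EntryKind and exampleHandler are illustrative stand-ins rather than declarations from xray_interface.h.

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the second argument the trampolines pass in r1:
    // 0 for a function entry event, 1 for a function exit event.
    enum class EntryKind : int32_t { Entry = 0, Exit = 1 };

    // Shape of the handler stored in __xray::XRayPatchedFunction
    // (an atomic pointer to void(int32_t, XRayEntryType) in the runtime).
    static void exampleHandler(int32_t FuncId, EntryKind Kind) {
      std::printf("function %d: %s\n", FuncId,
                  Kind == EntryKind::Entry ? "entry" : "exit");
    }

    int main() {
      // What __xray_FunctionEntry / __xray_FunctionExit effectively do once a
      // handler has been installed and the sleds have been patched:
      exampleHandler(42, EntryKind::Entry);
      exampleHandler(42, EntryKind::Exit);
      return 0;
    }
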
diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc
new file mode 100644
index 000000000..398d6fc14
--- /dev/null
+++ b/lib/xray/xray_x86_64.cc
@@ -0,0 +1,116 @@
+#include "xray_interface_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include <atomic>
+#include <cstdint>
+#include <limits>
+
+namespace __xray {
+
+static constexpr uint8_t CallOpCode = 0xe8;
+static constexpr uint16_t MovR10Seq = 0xba41;
+static constexpr uint16_t Jmp9Seq = 0x09eb;
+static constexpr uint8_t JmpOpCode = 0xe9;
+static constexpr uint8_t RetOpCode = 0xc3;
+
+static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
+static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled)
+{
+ // Here we do the dance of replacing the following sled:
+ //
+ // xray_sled_n:
+ // jmp +9
+ // <9 byte nop>
+ //
+ // With the following:
+ //
+ // mov r10d, <function id>
+ // call <relative 32bit offset to entry trampoline>
+ //
+ // We need to do this in the following order:
+ //
+ // 1. Put the function id first, 2 bytes from the start of the sled (just
+ // after the 2-byte jmp instruction).
+ // 2. Put the call opcode 6 bytes from the start of the sled.
+ // 3. Put the relative offset 7 bytes from the start of the sled.
+ // 4. Do an atomic write over the jmp instruction for the "mov r10d"
+ // opcode and first operand.
+ //
+ // Prerequisite is to compute the relative offset to the
+ // __xray_FunctionEntry function's address.
+ int64_t TrampolineOffset =
+ reinterpret_cast<int64_t>(__xray_FunctionEntry) -
+ (static_cast<int64_t>(Sled.Address) + 11);
+ if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+ Report("XRay Entry trampoline (%p) too far from sled (%p); distance = "
+ "%ld\n",
+ __xray_FunctionEntry, reinterpret_cast<void *>(Sled.Address),
+ TrampolineOffset);
+ return false;
+ }
+ if (Enable) {
+ *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+ *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
+ *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
+ std::memory_order_release);
+ } else {
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp9Seq,
+ std::memory_order_release);
+ // FIXME: Write out the nops still?
+ }
+ return true;
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled)
+{
+ // Here we do the dance of replacing the following sled:
+ //
+ // xray_sled_n:
+ // ret
+ // <10 byte nop>
+ //
+ // With the following:
+ //
+ // mov r10d, <function id>
+ // jmp <relative 32bit offset to exit trampoline>
+ //
+ // 1. Put the function id first, 2 bytes from the start of the sled (just
+ // after the 1-byte ret instruction).
+ // 2. Put the jmp opcode 6 bytes from the start of the sled.
+ // 3. Put the relative offset 7 bytes from the start of the sled.
+ // 4. Do an atomic write over the jmp instruction for the "mov r10d"
+ // opcode and first operand.
+ //
+  // Prerequisite is to compute the relative offset to the
+ // __xray_FunctionExit function's address.
+ int64_t TrampolineOffset =
+ reinterpret_cast<int64_t>(__xray_FunctionExit) -
+ (static_cast<int64_t>(Sled.Address) + 11);
+ if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+ Report("XRay Exit trampoline (%p) too far from sled (%p); distance = "
+ "%ld\n",
+ __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address),
+ TrampolineOffset);
+ return false;
+ }
+ if (Enable) {
+ *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+ *reinterpret_cast<uint8_t *>(Sled.Address + 6) = JmpOpCode;
+ *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
+ std::memory_order_release);
+ } else {
+ std::atomic_store_explicit(
+ reinterpret_cast<std::atomic<uint8_t> *>(Sled.Address), RetOpCode,
+ std::memory_order_release);
+ // FIXME: Write out the nops still?
+ }
+ return true;
+}
+
+} // namespace __xray
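
The x86_64 patcher encodes the trampoline address as a 32-bit displacement relative to the end of the 11-byte "mov r10d, imm32; call/jmp rel32" sequence, and gives up on a sled when that displacement does not fit in a signed 32-bit range. A minimal sketch of that range check; fitsRel32 and the sample addresses are made up for illustration.

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Returns true if TrampolineAddress can be reached from the patched sled at
    // SledAddress with a rel32 displacement measured from the end of the 11-byte
    // "mov r10d, imm32; call/jmp rel32" sequence.
    static bool fitsRel32(uint64_t SledAddress, uint64_t TrampolineAddress,
                          int64_t *OutOffset) {
      static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
      static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
      int64_t Offset = static_cast<int64_t>(TrampolineAddress) -
                       (static_cast<int64_t>(SledAddress) + 11);
      *OutOffset = Offset;
      return Offset >= MinOffset && Offset <= MaxOffset;
    }

    int main() {
      int64_t Offset = 0;
      bool OK = fitsRel32(0x400000, 0x401000, &Offset);
      std::printf("offset=%lld fits=%d\n", (long long)Offset, int(OK));
      return 0;
    }
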