From f9947480f58b10f2460df24287f95357082b6563 Mon Sep 17 00:00:00 2001 From: Dean Michael Berris Date: Fri, 12 May 2017 01:07:41 +0000 Subject: [XRay][compiler-rt] Runtime changes to support custom event logging Summary: This change implements support for the custom event logging sleds and intrinsics at runtime. For now it only supports handling the sleds in x86_64, with the implementations for other architectures stubbed out to do nothing. NOTE: Work in progress, uploaded for exposition/exploration purposes. Depends on D27503, D30018, and D33032. Reviewers: echristo, javed.absar, timshen Subscribers: mehdi_amini, nemanjai, llvm-commits Differential Revision: https://reviews.llvm.org/D30630 git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@302857 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/xray/xray_AArch64.cc | 13 ++- lib/xray/xray_arm.cc | 12 +- lib/xray/xray_fdr_log_records.h | 1 + lib/xray/xray_fdr_logging.cc | 150 ++++++++++++++++++------- lib/xray/xray_fdr_logging_impl.h | 223 +++++++++++++++++++++++-------------- lib/xray/xray_interface.cc | 21 +++- lib/xray/xray_interface_internal.h | 2 + lib/xray/xray_mips.cc | 9 +- lib/xray/xray_mips64.cc | 8 +- lib/xray/xray_powerpc64.cc | 6 + lib/xray/xray_trampoline_x86_64.S | 46 +++++++- lib/xray/xray_x86_64.cc | 36 ++++++ 12 files changed, 387 insertions(+), 140 deletions(-) (limited to 'lib/xray') diff --git a/lib/xray/xray_AArch64.cc b/lib/xray/xray_AArch64.cc index 8d1c7c5d8..372c1ad87 100644 --- a/lib/xray/xray_AArch64.cc +++ b/lib/xray/xray_AArch64.cc @@ -18,8 +18,7 @@ #include #include - -extern "C" void __clear_cache(void* start, void* end); +extern "C" void __clear_cache(void *start, void *end); namespace __xray { @@ -86,8 +85,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, reinterpret_cast *>(FirstAddress), uint32_t(PatchOpcodes::PO_B32), std::memory_order_release); } - __clear_cache(reinterpret_cast(FirstAddress), - reinterpret_cast(CurAddress)); + __clear_cache(reinterpret_cast(FirstAddress), + reinterpret_cast(CurAddress)); return true; } @@ -107,6 +106,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySled &Sled) + XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64? + return false; +} + // FIXME: Maybe implement this better? bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc index 26d673ec2..94e1c2638 100644 --- a/lib/xray/xray_arm.cc +++ b/lib/xray/xray_arm.cc @@ -18,7 +18,7 @@ #include #include -extern "C" void __clear_cache(void* start, void* end); +extern "C" void __clear_cache(void *start, void *end); namespace __xray { @@ -122,8 +122,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, reinterpret_cast *>(FirstAddress), uint32_t(PatchOpcodes::PO_B20), std::memory_order_release); } - __clear_cache(reinterpret_cast(FirstAddress), - reinterpret_cast(CurAddress)); + __clear_cache(reinterpret_cast(FirstAddress), + reinterpret_cast(CurAddress)); return true; } @@ -143,6 +143,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySled &Sled) + XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm? + return false; +} + // FIXME: Maybe implement this better? bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h index 36d9410d1..3d6d38892 100644 --- a/lib/xray/xray_fdr_log_records.h +++ b/lib/xray/xray_fdr_log_records.h @@ -29,6 +29,7 @@ struct alignas(16) MetadataRecord { NewCPUId, TSCWrap, WalltimeMarker, + CustomEventMarker, }; // Use 7 bits to identify this record type. /* RecordKinds */ uint8_t RecordKind : 7; diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc index e538b477a..a7e1382c3 100644 --- a/lib/xray/xray_fdr_logging.cc +++ b/lib/xray/xray_fdr_logging.cc @@ -41,45 +41,12 @@ namespace __xray { // Global BufferQueue. std::shared_ptr BQ; -__sanitizer::atomic_sint32_t LoggingStatus = { - XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; - __sanitizer::atomic_sint32_t LogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; -std::unique_ptr FDROptions; - -XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, - void *Options, - size_t OptionsSize) XRAY_NEVER_INSTRUMENT { - if (OptionsSize != sizeof(FDRLoggingOptions)) - return static_cast(__sanitizer::atomic_load( - &LoggingStatus, __sanitizer::memory_order_acquire)); - s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - if (!__sanitizer::atomic_compare_exchange_strong( - &LoggingStatus, &CurrentStatus, - XRayLogInitStatus::XRAY_LOG_INITIALIZING, - __sanitizer::memory_order_release)) - return static_cast(CurrentStatus); - - FDROptions.reset(new FDRLoggingOptions()); - memcpy(FDROptions.get(), Options, OptionsSize); - bool Success = false; - BQ = std::make_shared(BufferSize, BufferMax, Success); - if (!Success) { - Report("BufferQueue init failed.\n"); - return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - } - - // Install the actual handleArg0 handler after initialising the buffers. - __xray_set_handler(fdrLoggingHandleArg0); +FDRLoggingOptions FDROptions; - __sanitizer::atomic_store(&LoggingStatus, - XRayLogInitStatus::XRAY_LOG_INITIALIZED, - __sanitizer::memory_order_release); - Report("XRay FDR init successful.\n"); - return XRayLogInitStatus::XRAY_LOG_INITIALIZED; -} +__sanitizer::SpinMutex FDROptionsMutex; // Must finalize before flushing. XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { @@ -108,7 +75,11 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // (fixed-sized) and let the tools reading the buffers deal with the data // afterwards. // - int Fd = FDROptions->Fd; + int Fd = -1; + { + __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); + Fd = FDROptions.Fd; + } if (Fd == -1) Fd = getLogFD(); if (Fd == -1) { @@ -120,8 +91,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // Test for required CPU features and cache the cycle frequency static bool TSCSupported = probeRequiredCPUFeatures(); - static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency() - : __xray::NanosecondsPerSecond; + static uint64_t CycleFrequency = + TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond; XRayFileHeader Header; Header.Version = 1; @@ -192,8 +163,8 @@ XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT { return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } -void fdrLoggingHandleArg0(int32_t FuncId, - XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { +static std::tuple +getTimestamp() XRAY_NEVER_INSTRUMENT { // We want to get the TSC as early as possible, so that we can check whether // we've seen this CPU before. We also do it before we load anything else, to // allow for forward progress with the scheduling. @@ -203,7 +174,7 @@ void fdrLoggingHandleArg0(int32_t FuncId, // Test once for required CPU features static bool TSCSupported = probeRequiredCPUFeatures(); - if(TSCSupported) { + if (TSCSupported) { TSC = __xray::readTSC(CPU); } else { // FIXME: This code needs refactoring as it appears in multiple locations @@ -216,9 +187,102 @@ void fdrLoggingHandleArg0(int32_t FuncId, CPU = 0; TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; } + return std::make_tuple(TSC, CPU); +} + +void fdrLoggingHandleArg0(int32_t FuncId, + XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { + auto TSC_CPU = getTimestamp(); + __xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU), + std::get<1>(TSC_CPU), clock_gettime, + LoggingStatus, BQ); +} - __xray_fdr_internal::processFunctionHook(FuncId, Entry, TSC, CPU, - clock_gettime, LoggingStatus, BQ); +void fdrLoggingHandleCustomEvent(void *Event, + std::size_t EventSize) XRAY_NEVER_INSTRUMENT { + using namespace __xray_fdr_internal; + auto TSC_CPU = getTimestamp(); + auto &TSC = std::get<0>(TSC_CPU); + auto &CPU = std::get<1>(TSC_CPU); + thread_local bool Running = false; + RecursionGuard Guard{Running}; + if (!Guard) { + assert(Running && "RecursionGuard is buggy!"); + return; + } + if (EventSize > std::numeric_limits::max()) { + using Empty = struct {}; + static Empty Once = [&] { + Report("Event size too large = %zu ; > max = %d\n", EventSize, + std::numeric_limits::max()); + return Empty(); + }(); + (void)Once; + } + int32_t ReducedEventSize = static_cast(EventSize); + if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime)) + return; + + // Here we need to prepare the log to handle: + // - The metadata record we're going to write. (16 bytes) + // - The additional data we're going to write. Currently, that's the size of + // the event we're going to dump into the log as free-form bytes. + if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) { + LocalBQ = nullptr; + return; + } + + // Write the custom event metadata record, which consists of the following + // information: + // - 8 bytes (64-bits) for the full TSC when the event started. + // - 4 bytes (32-bits) for the length of the data. + MetadataRecord CustomEvent; + CustomEvent.Type = uint8_t(RecordType::Metadata); + CustomEvent.RecordKind = + uint8_t(MetadataRecord::RecordKinds::CustomEventMarker); + constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU)); + std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); + std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); + std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent)); + RecordPtr += sizeof(CustomEvent); + std::memcpy(RecordPtr, Event, ReducedEventSize); + endBufferIfFull(); +} + +XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, + void *Options, + size_t OptionsSize) XRAY_NEVER_INSTRUMENT { + if (OptionsSize != sizeof(FDRLoggingOptions)) + return static_cast(__sanitizer::atomic_load( + &LoggingStatus, __sanitizer::memory_order_acquire)); + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + if (!__sanitizer::atomic_compare_exchange_strong( + &LoggingStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZING, + __sanitizer::memory_order_release)) + return static_cast(CurrentStatus); + + { + __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); + memcpy(&FDROptions, Options, OptionsSize); + } + + bool Success = false; + BQ = std::make_shared(BufferSize, BufferMax, Success); + if (!Success) { + Report("BufferQueue init failed.\n"); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + // Install the actual handleArg0 handler after initialising the buffers. + __xray_set_handler(fdrLoggingHandleArg0); + __xray_set_customevent_handler(fdrLoggingHandleCustomEvent); + + __sanitizer::atomic_store(&LoggingStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZED, + __sanitizer::memory_order_release); + Report("XRay FDR init successful.\n"); + return XRayLogInitStatus::XRAY_LOG_INITIALIZED; } } // namespace __xray diff --git a/lib/xray/xray_fdr_logging_impl.h b/lib/xray/xray_fdr_logging_impl.h index ce360cb03..c801babc3 100644 --- a/lib/xray/xray_fdr_logging_impl.h +++ b/lib/xray/xray_fdr_logging_impl.h @@ -37,6 +37,9 @@ namespace __xray { +__sanitizer::atomic_sint32_t LoggingStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + /// We expose some of the state transitions when FDR logging mode is operating /// such that we can simulate a series of log events that may occur without /// and test with determinism without worrying about the real CPU time. @@ -123,12 +126,21 @@ thread_local uint8_t NumTailCalls = 0; constexpr auto MetadataRecSize = sizeof(MetadataRecord); constexpr auto FunctionRecSize = sizeof(FunctionRecord); +// We use a thread_local variable to keep track of which CPUs we've already +// run, and the TSC times for these CPUs. This allows us to stop repeating the +// CPU field in the function records. +// +// We assume that we'll support only 65536 CPUs for x86_64. +thread_local uint16_t CurrentCPU = std::numeric_limits::max(); +thread_local uint64_t LastTSC = 0; +thread_local uint64_t LastFunctionEntryTSC = 0; + class ThreadExitBufferCleanup { - std::weak_ptr Buffers; + std::shared_ptr &Buffers; BufferQueue::Buffer &Buffer; public: - explicit ThreadExitBufferCleanup(std::weak_ptr BQ, + explicit ThreadExitBufferCleanup(std::shared_ptr &BQ, BufferQueue::Buffer &Buffer) XRAY_NEVER_INSTRUMENT : Buffers(BQ), Buffer(Buffer) {} @@ -142,17 +154,24 @@ public: // the queue. assert((RecordPtr + MetadataRecSize) - static_cast(Buffer.Buffer) >= static_cast(MetadataRecSize)); - if (auto BQ = Buffers.lock()) { + if (Buffers) { writeEOBMetadata(); - auto EC = BQ->releaseBuffer(Buffer); + auto EC = Buffers->releaseBuffer(Buffer); if (EC != BufferQueue::ErrorCode::Ok) Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, BufferQueue::getErrorString(EC)); + Buffers = nullptr; return; } } }; +// Make sure a thread that's ever called handleArg0 has a thread-local +// live reference to the buffer queue for this particular instance of +// FDRLogging, and that we're going to clean it up when the thread exits. +thread_local std::shared_ptr LocalBQ = nullptr; +thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer); + class RecursionGuard { bool &Running; const bool Valid; @@ -176,7 +195,7 @@ public: } }; -static inline bool loggingInitialized( +inline bool loggingInitialized( const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT { return __sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire) == @@ -185,7 +204,7 @@ static inline bool loggingInitialized( } // namespace -static inline void writeNewBufferPreamble(pid_t Tid, timespec TS, +inline void writeNewBufferPreamble(pid_t Tid, timespec TS, char *&MemPtr) XRAY_NEVER_INSTRUMENT { static constexpr int InitRecordsCount = 2; std::aligned_storage::type Records[InitRecordsCount]; @@ -222,9 +241,8 @@ static inline void writeNewBufferPreamble(pid_t Tid, timespec TS, NumTailCalls = 0; } -static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t, - struct timespec *)) - XRAY_NEVER_INSTRUMENT { +inline void setupNewBuffer(int (*wall_clock_reader)( + clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { RecordPtr = static_cast(Buffer.Buffer); pid_t Tid = syscall(SYS_gettid); timespec TS{0, 0}; @@ -235,7 +253,7 @@ static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t, NumTailCalls = 0; } -static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, +inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, char *&MemPtr) XRAY_NEVER_INSTRUMENT { MetadataRecord NewCPUId; NewCPUId.Type = uint8_t(RecordType::Metadata); @@ -253,12 +271,12 @@ static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, NumTailCalls = 0; } -static inline void writeNewCPUIdMetadata(uint16_t CPU, +inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC) XRAY_NEVER_INSTRUMENT { writeNewCPUIdMetadata(CPU, TSC, RecordPtr); } -static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { +inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { MetadataRecord EOBMeta; EOBMeta.Type = uint8_t(RecordType::Metadata); EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer); @@ -269,11 +287,11 @@ static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { NumTailCalls = 0; } -static inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { +inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { writeEOBMetadata(RecordPtr); } -static inline void writeTSCWrapMetadata(uint64_t TSC, +inline void writeTSCWrapMetadata(uint64_t TSC, char *&MemPtr) XRAY_NEVER_INSTRUMENT { MetadataRecord TSCWrap; TSCWrap.Type = uint8_t(RecordType::Metadata); @@ -289,11 +307,11 @@ static inline void writeTSCWrapMetadata(uint64_t TSC, NumTailCalls = 0; } -static inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { +inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { writeTSCWrapMetadata(TSC, RecordPtr); } -static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, +inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, XRayEntryType EntryType, char *&MemPtr) XRAY_NEVER_INSTRUMENT { std::aligned_storage::type @@ -339,6 +357,17 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); break; + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static bool Once = [&] { + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d\n", + FuncId); + return true; + }(); + (void)Once; + return; + } } std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord)); @@ -346,8 +375,9 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, } static uint64_t thresholdTicks() { - static uint64_t TicksPerSec = probeRequiredCPUFeatures() ? getTSCFrequency() : - __xray::NanosecondsPerSecond; + static uint64_t TicksPerSec = probeRequiredCPUFeatures() + ? getTSCFrequency() + : __xray::NanosecondsPerSecond; static const uint64_t ThresholdTicks = TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000; return ThresholdTicks; @@ -397,9 +427,8 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, RewindingRecordPtr -= FunctionRecSize; RewindingTSC -= ExpectedTailExit.TSCDelta; AlignedFuncStorage FunctionEntryBuffer; - const auto &ExpectedFunctionEntry = - *reinterpret_cast(std::memcpy( - &FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize)); + const auto &ExpectedFunctionEntry = *reinterpret_cast( + std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize)); assert(ExpectedFunctionEntry.RecordKind == uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && "Expected to find function entry when rewinding tail call."); @@ -422,7 +451,7 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, } } -static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { +inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { auto EC = BQ->releaseBuffer(Buffer); if (EC != BufferQueue::ErrorCode::Ok) { Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, @@ -432,11 +461,29 @@ static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { return true; } -static inline void processFunctionHook( - int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, struct timespec *), - __sanitizer::atomic_sint32_t &LoggingStatus, - const std::shared_ptr &BQ) XRAY_NEVER_INSTRUMENT { +inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t, + struct timespec *), + size_t MaxSize) XRAY_NEVER_INSTRUMENT { + char *BufferStart = static_cast(Buffer.Buffer); + if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; + auto EC = LocalBQ->getBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + Report("Failed to acquire a buffer; error=%s\n", + BufferQueue::getErrorString(EC)); + return false; + } + setupNewBuffer(wall_clock_reader); + } + return true; +} + +inline bool isLogInitializedAndReady( + std::shared_ptr &LocalBQ, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, + struct timespec *)) XRAY_NEVER_INSTRUMENT { // Bail out right away if logging is not initialized yet. // We should take the opportunity to release the buffer though. auto Status = __sanitizer::atomic_load(&LoggingStatus, @@ -446,44 +493,19 @@ static inline void processFunctionHook( (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { writeEOBMetadata(); - if (!releaseThreadLocalBuffer(BQ.get())) - return; + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; RecordPtr = nullptr; + LocalBQ = nullptr; + return false; } - return; - } - - // We use a thread_local variable to keep track of which CPUs we've already - // run, and the TSC times for these CPUs. This allows us to stop repeating the - // CPU field in the function records. - // - // We assume that we'll support only 65536 CPUs for x86_64. - thread_local uint16_t CurrentCPU = std::numeric_limits::max(); - thread_local uint64_t LastTSC = 0; - thread_local uint64_t LastFunctionEntryTSC = 0; - - // Make sure a thread that's ever called handleArg0 has a thread-local - // live reference to the buffer queue for this particular instance of - // FDRLogging, and that we're going to clean it up when the thread exits. - thread_local auto LocalBQ = BQ; - thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer); - - // Prevent signal handler recursion, so in case we're already in a log writing - // mode and the signal handler comes in (and is also instrumented) then we - // don't want to be clobbering potentially partial writes already happening in - // the thread. We use a simple thread_local latch to only allow one on-going - // handleArg0 to happen at any given time. - thread_local bool Running = false; - RecursionGuard Guard{Running}; - if (!Guard) { - assert(Running == true && "RecursionGuard is buggy!"); - return; + return false; } if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) { writeEOBMetadata(); - if (!releaseThreadLocalBuffer(BQ.get())) - return; + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; RecordPtr = nullptr; } @@ -496,19 +518,57 @@ static inline void processFunctionHook( LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) Report("Failed to acquire a buffer; error=%s\n", BufferQueue::getErrorString(EC)); - return; + return false; } setupNewBuffer(wall_clock_reader); } if (CurrentCPU == std::numeric_limits::max()) { - // This means this is the first CPU this thread has ever run on. We set the - // current CPU and record this as the first TSC we've seen. + // This means this is the first CPU this thread has ever run on. We set + // the current CPU and record this as the first TSC we've seen. CurrentCPU = CPU; writeNewCPUIdMetadata(CPU, TSC); } + return true; +} // namespace __xray_fdr_internal + +inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { + auto BufferStart = static_cast(Buffer.Buffer); + if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return; + RecordPtr = nullptr; + } +} + +inline void processFunctionHook( + int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr &BQ) XRAY_NEVER_INSTRUMENT { + // Prevent signal handler recursion, so in case we're already in a log writing + // mode and the signal handler comes in (and is also instrumented) then we + // don't want to be clobbering potentially partial writes already happening in + // the thread. We use a simple thread_local latch to only allow one on-going + // handleArg0 to happen at any given time. + thread_local bool Running = false; + RecursionGuard Guard{Running}; + if (!Guard) { + assert(Running == true && "RecursionGuard is buggy!"); + return; + } + + // In case the reference has been cleaned up before, we make sure we + // initialize it to the provided BufferQueue. + if (LocalBQ == nullptr) + LocalBQ = BQ; + + if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader)) + return; + // Before we go setting up writing new function entries, we need to be really // careful about the pointer math we're doing. This means we need to ensure // that the record we are about to write is going to fit into the buffer, @@ -545,24 +605,14 @@ static inline void processFunctionHook( // bytes in the end of the buffer, we need to write out the EOB, get a new // Buffer, set it up properly before doing any further writing. // - char *BufferStart = static_cast(Buffer.Buffer); - if ((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart < - static_cast(MetadataRecSize)) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) - return; - auto EC = LocalBQ->getBuffer(Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to acquire a buffer; error=%s\n", - BufferQueue::getErrorString(EC)); - return; - } - setupNewBuffer(wall_clock_reader); + if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { + LocalBQ = nullptr; + return; } // By this point, we are now ready to write at most 24 bytes (one metadata // record and one function record). - BufferStart = static_cast(Buffer.Buffer); + auto BufferStart = static_cast(Buffer.Buffer); assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart >= static_cast(MetadataRecSize) && "Misconfigured BufferQueue provided; Buffer size not large enough."); @@ -586,7 +636,6 @@ static inline void processFunctionHook( // FunctionRecord. In this case we write down just a FunctionRecord with // the correct TSC delta. // - uint32_t RecordTSCDelta = 0; if (CPU != CurrentCPU) { // We've moved to a new CPU. @@ -619,21 +668,27 @@ static inline void processFunctionHook( break; rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId); return; // without writing log. + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static bool Once = [&] { + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d", + FuncId); + return true; + }(); + (void)Once; + return; + } } writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr); // If we've exhausted the buffer by this time, we then release the buffer to // make sure that other threads may start using this buffer. - if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) - return; - RecordPtr = nullptr; - } + endBufferIfFull(); } } // namespace __xray_fdr_internal - } // namespace __xray + #endif // XRAY_XRAY_FDR_LOGGING_IMPL_H diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc index 26f0ab122..c437a72e3 100644 --- a/lib/xray/xray_interface.cc +++ b/lib/xray/xray_interface.cc @@ -50,6 +50,9 @@ __sanitizer::atomic_uintptr_t XRayPatchedFunction{0}; // This is the function to call from the arg1-enabled sleds/trampolines. __sanitizer::atomic_uintptr_t XRayArgLogger{0}; +// This is the function to call when we encounter a custom event log call. +__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0}; + // MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo // any successful mprotect(...) changes. This is used to make a page writeable // and executable, and upon destruction if it was successful in doing so returns @@ -97,7 +100,19 @@ int __xray_set_handler(void (*entry)(int32_t, __sanitizer::memory_order_acquire)) { __sanitizer::atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), + reinterpret_cast(entry), + __sanitizer::memory_order_release); + return 1; + } + return 0; +} + +int __xray_set_customevent_handler(void (*entry)(void *, size_t)) + XRAY_NEVER_INSTRUMENT { + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) { + __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent, + reinterpret_cast(entry), __sanitizer::memory_order_release); return 1; } @@ -161,6 +176,9 @@ inline bool patchSled(const XRaySledEntry &Sled, bool Enable, case XRayEntryType::LOG_ARGS_ENTRY: Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); break; + case XRayEntryType::CUSTOM_EVENT: + Success = patchCustomEvent(Enable, FuncId, Sled); + break; default: Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind)); return false; @@ -301,6 +319,7 @@ int __xray_set_handler_arg1(void (*Handler)(int32_t, XRayEntryType, uint64_t)) { __sanitizer::memory_order_release); return 1; } + int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h index ef0c6b158..4a2784612 100644 --- a/lib/xray/xray_interface_internal.h +++ b/lib/xray/xray_interface_internal.h @@ -60,6 +60,7 @@ bool patchFunctionEntry(bool Enable, uint32_t FuncId, bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray @@ -70,6 +71,7 @@ extern void __xray_FunctionEntry(); extern void __xray_FunctionExit(); extern void __xray_FunctionTailExit(); extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); } #endif diff --git a/lib/xray/xray_mips.cc b/lib/xray/xray_mips.cc index c8ff39936..cd863304d 100644 --- a/lib/xray/xray_mips.cc +++ b/lib/xray/xray_mips.cc @@ -95,7 +95,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, // B #44 if (Enable) { - uint32_t LoTracingHookAddr = reinterpret_cast(TracingHook) & 0xffff; + uint32_t LoTracingHookAddr = + reinterpret_cast(TracingHook) & 0xffff; uint32_t HiTracingHookAddr = (reinterpret_cast(TracingHook) >> 16) & 0xffff; uint32_t LoFunctionID = FuncId & 0xffff; @@ -151,6 +152,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips? + return false; +} + } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_mips64.cc b/lib/xray/xray_mips64.cc index 21136848c..fa8fdd5ab 100644 --- a/lib/xray/xray_mips64.cc +++ b/lib/xray/xray_mips64.cc @@ -93,7 +93,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, if (Enable) { uint32_t LoTracingHookAddr = reinterpret_cast(TracingHook) & 0xffff; - uint32_t HiTracingHookAddr = (reinterpret_cast(TracingHook) >> 16) & 0xffff; + uint32_t HiTracingHookAddr = + (reinterpret_cast(TracingHook) >> 16) & 0xffff; uint32_t HigherTracingHookAddr = (reinterpret_cast(TracingHook) >> 32) & 0xffff; uint32_t HighestTracingHookAddr = @@ -160,6 +161,11 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips64? + return false; +} } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_powerpc64.cc b/lib/xray/xray_powerpc64.cc index 6a7554cfc..ab03cb100 100644 --- a/lib/xray/xray_powerpc64.cc +++ b/lib/xray/xray_powerpc64.cc @@ -93,6 +93,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, // FIXME: Maybe implement this better? bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in powerpc64? + return false; +} + } // namespace __xray extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S index 847ecef8d..b59eedc4b 100644 --- a/lib/xray/xray_trampoline_x86_64.S +++ b/lib/xray/xray_trampoline_x86_64.S @@ -176,9 +176,15 @@ __xray_ArgLoggerEntry: je .Larg1entryFail .Larg1entryLog: - movq %rdi, %rdx // first argument will become the third - xorq %rsi, %rsi // XRayEntryType::ENTRY into the second - movl %r10d, %edi // 32-bit function ID becomes the first + + // First argument will become the third + movq %rdi, %rdx + + // XRayEntryType::ENTRY into the second + xorq %rsi, %rsi + + // 32-bit function ID becomes the first + movl %r10d, %edi callq *%rax .Larg1entryFail: @@ -189,4 +195,38 @@ __xray_ArgLoggerEntry: .size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry .cfi_endproc +//===----------------------------------------------------------------------===// + + .global __xray_CustomEvent + .align 16, 0x90 + .type __xray_CustomEvent,@function +__xray_CustomEvent: + .cfi_startproc + subq $16, %rsp + .cfi_def_cfa_offset 24 + movq %rbp, 8(%rsp) + movq %rax, 0(%rsp) + + // We take two arguments to this trampoline, which should be in rdi and rsi + // already. We also make sure that we stash %rax because we use that register + // to call the logging handler. + movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax + testq %rax,%rax + je .LcustomEventCleanup + + // At this point we know that rcx and rdx already has the data, so we just + // call the logging handler. + callq *%rax + +.LcustomEventCleanup: + movq 0(%rsp), %rax + movq 8(%rsp), %rbp + addq $16, %rsp + .cfi_def_cfa_offset 8 + retq + +.Ltmp8: + .size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent + .cfi_endproc + NO_EXEC_STACK_DIRECTIVE diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc index 2e9a8d270..e34806fa1 100644 --- a/lib/xray/xray_x86_64.cc +++ b/lib/xray/xray_x86_64.cc @@ -75,8 +75,10 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { static constexpr uint8_t CallOpCode = 0xe8; static constexpr uint16_t MovR10Seq = 0xba41; static constexpr uint16_t Jmp9Seq = 0x09eb; +static constexpr uint16_t Jmp20Seq = 0x14eb; static constexpr uint8_t JmpOpCode = 0xe9; static constexpr uint8_t RetOpCode = 0xc3; +static constexpr uint16_t NopwSeq = 0x9066; static constexpr int64_t MinOffset{std::numeric_limits::min()}; static constexpr int64_t MaxOffset{std::numeric_limits::max()}; @@ -201,6 +203,40 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, return true; } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // Here we do the dance of replacing the following sled: + // + // xray_sled_n: + // jmp +19 // 2 bytes + // ... + // + // With the following: + // + // nopw // 2 bytes* + // ... + // + // We need to do this in the following order: + // + // 1. Overwrite the 5-byte nop with the call (relative), where (relative) is + // the relative offset to the __xray_CustomEvent trampoline. + // 2. Do a two-byte atomic write over the 'jmp +24' to turn it into a 'nopw'. + // This allows us to "enable" this code once the changes have committed. + // + // The "unpatch" should just turn the 'nopw' back to a 'jmp +24'. + // + if (Enable) { + std::atomic_store_explicit( + reinterpret_cast *>(Sled.Address), NopwSeq, + std::memory_order_release); + } else { + std::atomic_store_explicit( + reinterpret_cast *>(Sled.Address), Jmp20Seq, + std::memory_order_release); + } + return false; +} + // We determine whether the CPU we're running on has the correct features we // need. In x86_64 this will be rdtscp support. bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { -- cgit v1.2.3