summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmake/config-ix.cmake6
-rw-r--r--lib/scudo/CMakeLists.txt2
-rw-r--r--lib/scudo/scudo_allocator.cpp287
-rw-r--r--lib/scudo/scudo_allocator.h53
-rw-r--r--lib/scudo/scudo_allocator_secondary.h29
-rw-r--r--lib/scudo/scudo_flags.cpp2
-rw-r--r--lib/scudo/scudo_flags.h2
-rw-r--r--lib/scudo/scudo_interceptors.cpp2
-rw-r--r--lib/scudo/scudo_new_delete.cpp2
-rw-r--r--lib/scudo/scudo_termination.cpp2
-rw-r--r--lib/scudo/scudo_utils.cpp150
-rw-r--r--lib/scudo/scudo_utils.h24
-rw-r--r--test/scudo/CMakeLists.txt40
-rw-r--r--test/scudo/alignment.cpp7
-rw-r--r--test/scudo/double-free.cpp2
-rw-r--r--test/scudo/interface.cpp28
-rw-r--r--test/scudo/lit.cfg13
-rw-r--r--test/scudo/lit.site.cfg.in4
-rw-r--r--test/scudo/malloc.cpp25
-rw-r--r--test/scudo/memalign.cpp2
-rw-r--r--test/scudo/mismatch.cpp2
-rw-r--r--test/scudo/overflow.cpp5
-rw-r--r--test/scudo/preinit.cpp1
-rw-r--r--test/scudo/random_shuffle.cpp1
-rw-r--r--test/scudo/realloc.cpp2
25 files changed, 447 insertions, 246 deletions
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index f796fe18a..25501d8bf 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -29,6 +29,7 @@ check_cxx_compiler_flag(-std=c++11 COMPILER_RT_HAS_STD_CXX11_FLAG)
check_cxx_compiler_flag(-ftls-model=initial-exec COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC)
check_cxx_compiler_flag(-fno-lto COMPILER_RT_HAS_FNO_LTO_FLAG)
check_cxx_compiler_flag("-Werror -msse3" COMPILER_RT_HAS_MSSE3_FLAG)
+check_cxx_compiler_flag("-Werror -msse4.2" COMPILER_RT_HAS_MSSE4_2_FLAG)
check_cxx_compiler_flag(--sysroot=. COMPILER_RT_HAS_SYSROOT_FLAG)
if(NOT WIN32 AND NOT CYGWIN)
@@ -178,7 +179,7 @@ set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64})
set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64})
set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
-set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64})
+set(ALL_SCUDO_SUPPORTED_ARCH ${X86} ${X86_64})
set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64})
if(APPLE)
@@ -405,8 +406,7 @@ else()
${ALL_SAFESTACK_SUPPORTED_ARCH})
filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH})
filter_available_targets(ESAN_SUPPORTED_ARCH ${ALL_ESAN_SUPPORTED_ARCH})
- filter_available_targets(SCUDO_SUPPORTED_ARCH
- ${ALL_SCUDO_SUPPORTED_ARCH})
+ filter_available_targets(SCUDO_SUPPORTED_ARCH ${ALL_SCUDO_SUPPORTED_ARCH})
filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH})
endif()
diff --git a/lib/scudo/CMakeLists.txt b/lib/scudo/CMakeLists.txt
index 6f8f7d701..332c3a972 100644
--- a/lib/scudo/CMakeLists.txt
+++ b/lib/scudo/CMakeLists.txt
@@ -4,7 +4,7 @@ include_directories(..)
set(SCUDO_CFLAGS ${SANITIZER_COMMON_CFLAGS})
append_rtti_flag(OFF SCUDO_CFLAGS)
-list(APPEND SCUDO_CFLAGS -msse4.2 -mcx16)
+append_list_if(COMPILER_RT_HAS_MSSE4_2_FLAG -msse4.2 SCUDO_CFLAGS)
set(SCUDO_SOURCES
scudo_allocator.cpp
diff --git a/lib/scudo/scudo_allocator.cpp b/lib/scudo/scudo_allocator.cpp
index 243561349..890f8aef3 100644
--- a/lib/scudo/scudo_allocator.cpp
+++ b/lib/scudo/scudo_allocator.cpp
@@ -22,23 +22,41 @@
#include <limits.h>
#include <pthread.h>
-#include <smmintrin.h>
#include <cstring>
namespace __scudo {
+#if SANITIZER_CAN_USE_ALLOCATOR64
+const uptr AllocatorSpace = ~0ULL;
+const uptr AllocatorSize = 0x40000000000ULL;
+typedef DefaultSizeClassMap SizeClassMap;
struct AP {
- static const uptr kSpaceBeg = ~0ULL;
- static const uptr kSpaceSize = 0x10000000000ULL;
+ static const uptr kSpaceBeg = AllocatorSpace;
+ static const uptr kSpaceSize = AllocatorSize;
static const uptr kMetadataSize = 0;
- typedef DefaultSizeClassMap SizeClassMap;
+ typedef __scudo::SizeClassMap SizeClassMap;
typedef NoOpMapUnmapCallback MapUnmapCallback;
static const uptr kFlags =
SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
};
-
typedef SizeClassAllocator64<AP> PrimaryAllocator;
+#else
+// Currently, the 32-bit Sanitizer allocator has not yet benefited from all the
+// security improvements brought to the 64-bit one. This makes the 32-bit
+// version of Scudo slightly less toughened.
+static const uptr RegionSizeLog = 20;
+static const uptr NumRegions = SANITIZER_MMAP_RANGE_SIZE >> RegionSizeLog;
+# if SANITIZER_WORDSIZE == 32
+typedef FlatByteMap<NumRegions> ByteMap;
+# elif SANITIZER_WORDSIZE == 64
+typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
+# endif // SANITIZER_WORDSIZE
+typedef SizeClassMap<3, 4, 8, 16, 64, 14> SizeClassMap;
+typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
+ RegionSizeLog, ByteMap> PrimaryAllocator;
+#endif // SANITIZER_CAN_USE_ALLOCATOR64
+
typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
typedef ScudoLargeMmapAllocator SecondaryAllocator;
typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
@@ -48,7 +66,50 @@ static ScudoAllocator &getAllocator();
static thread_local Xorshift128Plus Prng;
// Global static cookie, initialized at start-up.
-static u64 Cookie;
+static uptr Cookie;
+
+enum : u8 {
+ CRC32Software = 0,
+ CRC32Hardware = 1,
+};
+// We default to software CRC32 if the alternatives are not supported, either
+// at compilation or at runtime.
+static atomic_uint8_t HashAlgorithm = { CRC32Software };
+
+// Hardware CRC32 is supported at compilation via the following:
+// - for i386 & x86_64: -msse4.2
+// - for ARM & AArch64: -march=armv8-a+crc
+// An additional check must be performed at runtime as well to make sure the
+// emitted instructions are valid on the target host.
+#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
+# ifdef __SSE4_2__
+# include <smmintrin.h>
+# define HW_CRC32 FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
+# endif
+# ifdef __ARM_FEATURE_CRC32
+# include <arm_acle.h>
+# define HW_CRC32 FIRST_32_SECOND_64(__crc32cw, __crc32cd)
+# endif
+#endif
+
+// Helper function that will compute the chunk checksum, being passed all the
+// needed information as uptrs. It will opt for the hardware version of
+// the checksumming function if available.
+INLINE u32 hashUptrs(uptr Pointer, uptr *Array, uptr ArraySize, u8 HashType) {
+ u32 Crc;
+#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
+ if (HashType == CRC32Hardware) {
+ Crc = HW_CRC32(Cookie, Pointer);
+ for (uptr i = 0; i < ArraySize; i++)
+ Crc = HW_CRC32(Crc, Array[i]);
+ return Crc;
+ }
+#endif
+ Crc = computeCRC32(Cookie, Pointer);
+ for (uptr i = 0; i < ArraySize; i++)
+ Crc = computeCRC32(Crc, Array[i]);
+ return Crc;
+}
struct ScudoChunk : UnpackedHeader {
// We can't use the offset member of the chunk itself, as we would double
@@ -59,19 +120,37 @@ struct ScudoChunk : UnpackedHeader {
reinterpret_cast<uptr>(this) - (Header->Offset << MinAlignmentLog));
}
- // CRC32 checksum of the Chunk pointer and its ChunkHeader.
- // It currently uses the Intel Nehalem SSE4.2 crc32 64-bit instruction.
+ // Returns the usable size for a chunk, meaning the amount of bytes from the
+ // beginning of the user data to the end of the backend allocated chunk.
+ uptr getUsableSize(UnpackedHeader *Header) {
+ uptr Size = getAllocator().GetActuallyAllocatedSize(getAllocBeg(Header));
+ if (Size == 0)
+ return Size;
+ return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog);
+ }
+
+ // Compute the checksum of the Chunk pointer and its ChunkHeader.
u16 computeChecksum(UnpackedHeader *Header) const {
- u64 HeaderHolder[2];
- memcpy(HeaderHolder, Header, sizeof(HeaderHolder));
- u64 Crc = _mm_crc32_u64(Cookie, reinterpret_cast<uptr>(this));
- // This is somewhat of a shortcut. The checksum is stored in the 16 least
- // significant bits of the first 8 bytes of the header, hence zero-ing
- // those bits out. It would be more valid to zero the checksum field of the
- // UnpackedHeader, but would require holding an additional copy of it.
- Crc = _mm_crc32_u64(Crc, HeaderHolder[0] & 0xffffffffffff0000ULL);
- Crc = _mm_crc32_u64(Crc, HeaderHolder[1]);
- return static_cast<u16>(Crc);
+ UnpackedHeader ZeroChecksumHeader = *Header;
+ ZeroChecksumHeader.Checksum = 0;
+ uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)];
+ memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
+ u32 Hash = hashUptrs(reinterpret_cast<uptr>(this),
+ HeaderHolder,
+ ARRAY_SIZE(HeaderHolder),
+ atomic_load_relaxed(&HashAlgorithm));
+ return static_cast<u16>(Hash);
+ }
+
+ // Checks the validity of a chunk by verifying its checksum.
+ bool isValid() {
+ UnpackedHeader NewUnpackedHeader;
+ const AtomicPackedHeader *AtomicHeader =
+ reinterpret_cast<const AtomicPackedHeader *>(this);
+ PackedHeader NewPackedHeader =
+ AtomicHeader->load(std::memory_order_relaxed);
+ NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
+ return (NewUnpackedHeader.Checksum == computeChecksum(&NewUnpackedHeader));
}
// Loads and unpacks the header, verifying the checksum in the process.
@@ -81,9 +160,7 @@ struct ScudoChunk : UnpackedHeader {
PackedHeader NewPackedHeader =
AtomicHeader->load(std::memory_order_relaxed);
*NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
- if ((NewUnpackedHeader->Unused_0_ != 0) ||
- (NewUnpackedHeader->Unused_1_ != 0) ||
- (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader))) {
+ if (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader)) {
dieWithMessage("ERROR: corrupted chunk header at address %p\n", this);
}
}
@@ -119,7 +196,7 @@ struct ScudoChunk : UnpackedHeader {
static bool ScudoInitIsRunning = false;
static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT;
-static pthread_key_t pkey;
+static pthread_key_t PThreadKey;
static thread_local bool ThreadInited = false;
static thread_local bool ThreadTornDown = false;
@@ -133,7 +210,7 @@ static void teardownThread(void *p) {
// like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the
// quarantine and swallowing the cache.
if (v < PTHREAD_DESTRUCTOR_ITERATIONS) {
- pthread_setspecific(pkey, reinterpret_cast<void *>(v + 1));
+ pthread_setspecific(PThreadKey, reinterpret_cast<void *>(v + 1));
return;
}
drainQuarantine();
@@ -146,6 +223,11 @@ static void initInternal() {
CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
ScudoInitIsRunning = true;
+  // Check if SSE4.2 is supported; if so, opt for the CRC32 hardware version.
+ if (testCPUFeature(CRC32CPUFeature)) {
+ atomic_store_relaxed(&HashAlgorithm, CRC32Hardware);
+ }
+
initFlags();
AllocatorOptions Options;
@@ -158,13 +240,13 @@ static void initInternal() {
}
static void initGlobal() {
- pthread_key_create(&pkey, teardownThread);
+ pthread_key_create(&PThreadKey, teardownThread);
initInternal();
}
static void NOINLINE initThread() {
pthread_once(&GlobalInited, initGlobal);
- pthread_setspecific(pkey, reinterpret_cast<void *>(1));
+ pthread_setspecific(PThreadKey, reinterpret_cast<void *>(1));
getAllocator().InitCache(&Cache);
ThreadInited = true;
}
@@ -253,9 +335,6 @@ struct Allocator {
FallbackQuarantineCache(LINKER_INITIALIZED) {}
void init(const AllocatorOptions &Options) {
- // Currently SSE 4.2 support is required. This might change later.
- CHECK(testCPUFeature(SSE4_2)); // for crc32
-
// Verify that the header offset field can hold the maximum offset. In the
// case of the Secondary allocator, it takes care of alignment and the
// offset will always be 0. In the case of the Primary, the worst case
@@ -266,14 +345,25 @@ struct Allocator {
// last size class minus the header size, in multiples of MinAlignment.
UnpackedHeader Header = {};
uptr MaxPrimaryAlignment = 1 << MostSignificantSetBitIndex(
- PrimaryAllocator::SizeClassMap::kMaxSize - MinAlignment);
- uptr MaximumOffset = (MaxPrimaryAlignment - ChunkHeaderSize) >>
+ SizeClassMap::kMaxSize - MinAlignment);
+ uptr MaxOffset = (MaxPrimaryAlignment - AlignedChunkHeaderSize) >>
MinAlignmentLog;
- Header.Offset = MaximumOffset;
- if (Header.Offset != MaximumOffset) {
+ Header.Offset = MaxOffset;
+ if (Header.Offset != MaxOffset) {
dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
"header\n");
}
+ // Verify that we can fit the maximum amount of unused bytes in the header.
+ // The worst case scenario would be when allocating 1 byte on a MaxAlignment
+ // alignment. Since the combined allocator currently rounds the size up to
+ // the alignment before passing it to the secondary, we end up with
+ // MaxAlignment - 1 extra bytes.
+ uptr MaxUnusedBytes = MaxAlignment - 1;
+ Header.UnusedBytes = MaxUnusedBytes;
+ if (Header.UnusedBytes != MaxUnusedBytes) {
+ dieWithMessage("ERROR: the maximum possible unused bytes doesn't fit in "
+ "the header\n");
+ }
DeallocationTypeMismatch = Options.DeallocationTypeMismatch;
DeleteSizeMismatch = Options.DeleteSizeMismatch;
@@ -286,6 +376,17 @@ struct Allocator {
Cookie = Prng.Next();
}
+ // Helper function that checks for a valid Scudo chunk.
+ bool isValidPointer(const void *UserPtr) {
+ uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr);
+ if (!IsAligned(ChunkBeg, MinAlignment)) {
+ return false;
+ }
+ ScudoChunk *Chunk =
+ reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+ return Chunk->isValid();
+ }
+
// Allocates a chunk.
void *allocate(uptr Size, uptr Alignment, AllocType Type) {
if (UNLIKELY(!ThreadInited))
@@ -302,7 +403,7 @@ struct Allocator {
if (Size >= MaxAllowedMallocSize)
return BackendAllocator.ReturnNullOrDieOnBadRequest();
uptr RoundedSize = RoundUpTo(Size, MinAlignment);
- uptr NeededSize = RoundedSize + ChunkHeaderSize;
+ uptr NeededSize = RoundedSize + AlignedChunkHeaderSize;
if (Alignment > MinAlignment)
NeededSize += Alignment;
if (NeededSize >= MaxAllowedMallocSize)
@@ -321,28 +422,33 @@ struct Allocator {
if (!Ptr)
return BackendAllocator.ReturnNullOrDieOnOOM();
- // If requested, we will zero out the entire contents of the returned chunk.
- if (ZeroContents && BackendAllocator.FromPrimary(Ptr))
- memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
-
uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
// If the allocation was serviced by the secondary, the returned pointer
// accounts for ChunkHeaderSize to pass the alignment check of the combined
// allocator. Adjust it here.
if (!FromPrimary)
- AllocBeg -= ChunkHeaderSize;
- uptr ChunkBeg = AllocBeg + ChunkHeaderSize;
+ AllocBeg -= AlignedChunkHeaderSize;
+
+ uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize(
+ reinterpret_cast<void *>(AllocBeg));
+ // If requested, we will zero out the entire contents of the returned chunk.
+ if (ZeroContents && FromPrimary)
+ memset(Ptr, 0, ActuallyAllocatedSize);
+
+ uptr ChunkBeg = AllocBeg + AlignedChunkHeaderSize;
if (!IsAligned(ChunkBeg, Alignment))
ChunkBeg = RoundUpTo(ChunkBeg, Alignment);
CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize);
ScudoChunk *Chunk =
- reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+ reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
UnpackedHeader Header = {};
Header.State = ChunkAllocated;
- Header.Offset = (ChunkBeg - ChunkHeaderSize - AllocBeg) >> MinAlignmentLog;
+ uptr Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg;
+ Header.Offset = Offset >> MinAlignmentLog;
Header.AllocType = Type;
- Header.RequestedSize = Size;
- Header.Salt = static_cast<u16>(Prng.Next());
+ Header.UnusedBytes = ActuallyAllocatedSize - Offset -
+ AlignedChunkHeaderSize - Size;
+ Header.Salt = static_cast<u8>(Prng.Next());
Chunk->storeHeader(&Header);
void *UserPtr = reinterpret_cast<void *>(ChunkBeg);
// TODO(kostyak): hooks sound like a terrible idea security wise but might
@@ -366,13 +472,14 @@ struct Allocator {
"aligned at address %p\n", UserPtr);
}
ScudoChunk *Chunk =
- reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+ reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
UnpackedHeader OldHeader;
Chunk->loadHeader(&OldHeader);
if (OldHeader.State != ChunkAllocated) {
dieWithMessage("ERROR: invalid chunk state when deallocating address "
- "%p\n", Chunk);
+ "%p\n", UserPtr);
}
+ uptr UsableSize = Chunk->getUsableSize(&OldHeader);
UnpackedHeader NewHeader = OldHeader;
NewHeader.State = ChunkQuarantine;
Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
@@ -386,69 +493,40 @@ struct Allocator {
}
}
}
- uptr Size = NewHeader.RequestedSize;
+ uptr Size = UsableSize - OldHeader.UnusedBytes;
if (DeleteSizeMismatch) {
if (DeleteSize && DeleteSize != Size) {
dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n",
Chunk);
}
}
+
if (LIKELY(!ThreadTornDown)) {
AllocatorQuarantine.Put(&ThreadQuarantineCache,
- QuarantineCallback(&Cache), Chunk, Size);
+ QuarantineCallback(&Cache), Chunk, UsableSize);
} else {
SpinMutexLock l(&FallbackMutex);
AllocatorQuarantine.Put(&FallbackQuarantineCache,
QuarantineCallback(&FallbackAllocatorCache),
- Chunk, Size);
+ Chunk, UsableSize);
}
}
- // Returns the actual usable size of a chunk. Since this requires loading the
- // header, we will return it in the second parameter, as it can be required
- // by the caller to perform additional processing.
- uptr getUsableSize(const void *Ptr, UnpackedHeader *Header) {
- if (UNLIKELY(!ThreadInited))
- initThread();
- if (!Ptr)
- return 0;
- uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
- ScudoChunk *Chunk =
- reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
- Chunk->loadHeader(Header);
- // Getting the usable size of a chunk only makes sense if it's allocated.
- if (Header->State != ChunkAllocated) {
- dieWithMessage("ERROR: attempted to size a non-allocated chunk at "
- "address %p\n", Chunk);
- }
- uptr Size =
- BackendAllocator.GetActuallyAllocatedSize(Chunk->getAllocBeg(Header));
- // UsableSize works as malloc_usable_size, which is also what (AFAIU)
- // tcmalloc's MallocExtension::GetAllocatedSize aims at providing. This
- // means we will return the size of the chunk from the user beginning to
- // the end of the 'user' allocation, hence us subtracting the header size
- // and the offset from the size.
- if (Size == 0)
- return Size;
- return Size - ChunkHeaderSize - (Header->Offset << MinAlignmentLog);
- }
-
- // Helper function that doesn't care about the header.
- uptr getUsableSize(const void *Ptr) {
- UnpackedHeader Header;
- return getUsableSize(Ptr, &Header);
- }
-
// Reallocates a chunk. We can save on a new allocation if the new requested
// size still fits in the chunk.
void *reallocate(void *OldPtr, uptr NewSize) {
if (UNLIKELY(!ThreadInited))
initThread();
- UnpackedHeader OldHeader;
- uptr Size = getUsableSize(OldPtr, &OldHeader);
uptr ChunkBeg = reinterpret_cast<uptr>(OldPtr);
ScudoChunk *Chunk =
- reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+ reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+ UnpackedHeader OldHeader;
+ Chunk->loadHeader(&OldHeader);
+ if (OldHeader.State != ChunkAllocated) {
+ dieWithMessage("ERROR: invalid chunk state when reallocating address "
+ "%p\n", OldPtr);
+ }
+ uptr Size = Chunk->getUsableSize(&OldHeader);
if (OldHeader.AllocType != FromMalloc) {
dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n",
Chunk);
@@ -456,7 +534,7 @@ struct Allocator {
UnpackedHeader NewHeader = OldHeader;
// The new size still fits in the current chunk.
if (NewSize <= Size) {
- NewHeader.RequestedSize = NewSize;
+ NewHeader.UnusedBytes = Size - NewSize;
Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
return OldPtr;
}
@@ -464,23 +542,42 @@ struct Allocator {
// old one.
void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc);
if (NewPtr) {
- uptr OldSize = OldHeader.RequestedSize;
+ uptr OldSize = Size - OldHeader.UnusedBytes;
memcpy(NewPtr, OldPtr, Min(NewSize, OldSize));
NewHeader.State = ChunkQuarantine;
Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
if (LIKELY(!ThreadTornDown)) {
AllocatorQuarantine.Put(&ThreadQuarantineCache,
- QuarantineCallback(&Cache), Chunk, OldSize);
+ QuarantineCallback(&Cache), Chunk, Size);
} else {
SpinMutexLock l(&FallbackMutex);
AllocatorQuarantine.Put(&FallbackQuarantineCache,
QuarantineCallback(&FallbackAllocatorCache),
- Chunk, OldSize);
+ Chunk, Size);
}
}
return NewPtr;
}
+ // Helper function that returns the actual usable size of a chunk.
+ uptr getUsableSize(const void *Ptr) {
+ if (UNLIKELY(!ThreadInited))
+ initThread();
+ if (!Ptr)
+ return 0;
+ uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
+ ScudoChunk *Chunk =
+ reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+ UnpackedHeader Header;
+ Chunk->loadHeader(&Header);
+ // Getting the usable size of a chunk only makes sense if it's allocated.
+ if (Header.State != ChunkAllocated) {
+ dieWithMessage("ERROR: invalid chunk state when sizing address %p\n",
+ Ptr);
+ }
+ return Chunk->getUsableSize(&Header);
+ }
+
void *calloc(uptr NMemB, uptr Size) {
if (UNLIKELY(!ThreadInited))
initThread();
@@ -575,7 +672,7 @@ uptr scudoMallocUsableSize(void *Ptr) {
return Instance.getUsableSize(Ptr);
}
-} // namespace __scudo
+} // namespace __scudo
using namespace __scudo;
@@ -605,10 +702,10 @@ uptr __sanitizer_get_estimated_allocated_size(uptr size) {
return size;
}
-int __sanitizer_get_ownership(const void *p) {
- return Instance.getUsableSize(p) != 0;
+int __sanitizer_get_ownership(const void *Ptr) {
+ return Instance.isValidPointer(Ptr);
}
-uptr __sanitizer_get_allocated_size(const void *p) {
- return Instance.getUsableSize(p);
+uptr __sanitizer_get_allocated_size(const void *Ptr) {
+ return Instance.getUsableSize(Ptr);
}
diff --git a/lib/scudo/scudo_allocator.h b/lib/scudo/scudo_allocator.h
index d2450aa48..484b7ea7a 100644
--- a/lib/scudo/scudo_allocator.h
+++ b/lib/scudo/scudo_allocator.h
@@ -14,10 +14,6 @@
#ifndef SCUDO_ALLOCATOR_H_
#define SCUDO_ALLOCATOR_H_
-#ifndef __x86_64__
-# error "The Scudo hardened allocator currently only supports x86_64."
-#endif
-
#include "scudo_flags.h"
#include "sanitizer_common/sanitizer_allocator.h"
@@ -39,57 +35,38 @@ enum ChunkState : u8 {
ChunkQuarantine = 2
};
-#if SANITIZER_WORDSIZE == 64
-// Our header requires 128 bits of storage on 64-bit platforms, which fits
-// nicely with the alignment requirements. Having the offset saves us from
+// Our header requires 64 bits of storage. Having the offset saves us from
// using functions such as GetBlockBegin, that is fairly costly. Our first
// implementation used the MetaData as well, which offers the advantage of
// being stored away from the chunk itself, but accessing it was costly as
// well. The header will be atomically loaded and stored using the 16-byte
// primitives offered by the platform (likely requires cmpxchg16b support).
-typedef unsigned __int128 PackedHeader;
-struct UnpackedHeader {
- u16 Checksum : 16;
- uptr RequestedSize : 40; // Needed for reallocation purposes.
- u8 State : 2; // available, allocated, or quarantined
- u8 AllocType : 2; // malloc, new, new[], or memalign
- u8 Unused_0_ : 4;
- uptr Offset : 12; // Offset from the beginning of the backend
- // allocation to the beginning of the chunk itself,
- // in multiples of MinAlignment. See comment about
- // its maximum value and test in init().
- u64 Unused_1_ : 36;
- u16 Salt : 16;
-};
-#elif SANITIZER_WORDSIZE == 32
-// On 32-bit platforms, our header requires 64 bits.
typedef u64 PackedHeader;
struct UnpackedHeader {
- u16 Checksum : 12;
- uptr RequestedSize : 32; // Needed for reallocation purposes.
- u8 State : 2; // available, allocated, or quarantined
- u8 AllocType : 2; // malloc, new, new[], or memalign
- uptr Offset : 12; // Offset from the beginning of the backend
- // allocation to the beginning of the chunk itself,
- // in multiples of MinAlignment. See comment about
- // its maximum value and test in Allocator::init().
- u16 Salt : 4;
+ u64 Checksum : 16;
+ u64 UnusedBytes : 24; // Needed for reallocation purposes.
+ u64 State : 2; // available, allocated, or quarantined
+ u64 AllocType : 2; // malloc, new, new[], or memalign
+ u64 Offset : 12; // Offset from the beginning of the backend
+ // allocation to the beginning of the chunk itself,
+ // in multiples of MinAlignment. See comment about
+ // its maximum value and test in init().
+ u64 Salt : 8;
};
-#else
-# error "Unsupported SANITIZER_WORDSIZE."
-#endif // SANITIZER_WORDSIZE
typedef std::atomic<PackedHeader> AtomicPackedHeader;
COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));
-const uptr ChunkHeaderSize = sizeof(PackedHeader);
-
// Minimum alignment of 8 bytes for 32-bit, 16 for 64-bit
const uptr MinAlignmentLog = FIRST_32_SECOND_64(3, 4);
const uptr MaxAlignmentLog = 24; // 16 MB
const uptr MinAlignment = 1 << MinAlignmentLog;
const uptr MaxAlignment = 1 << MaxAlignmentLog;
+const uptr ChunkHeaderSize = sizeof(PackedHeader);
+const uptr AlignedChunkHeaderSize =
+ (ChunkHeaderSize + MinAlignment - 1) & ~(MinAlignment - 1);
+
struct AllocatorOptions {
u32 QuarantineSizeMb;
u32 ThreadLocalQuarantineSizeKb;
@@ -120,6 +97,6 @@ uptr scudoMallocUsableSize(void *Ptr);
#include "scudo_allocator_secondary.h"
-} // namespace __scudo
+} // namespace __scudo
#endif // SCUDO_ALLOCATOR_H_
diff --git a/lib/scudo/scudo_allocator_secondary.h b/lib/scudo/scudo_allocator_secondary.h
index 451803c0c..4b62b8a35 100644
--- a/lib/scudo/scudo_allocator_secondary.h
+++ b/lib/scudo/scudo_allocator_secondary.h
@@ -32,7 +32,7 @@ class ScudoLargeMmapAllocator {
void *Allocate(AllocatorStats *Stats, uptr Size, uptr Alignment) {
// The Scudo frontend prevents us from allocating more than
// MaxAllowedMallocSize, so integer overflow checks would be superfluous.
- uptr HeadersSize = sizeof(SecondaryHeader) + ChunkHeaderSize;
+ uptr HeadersSize = sizeof(SecondaryHeader) + AlignedChunkHeaderSize;
uptr MapSize = RoundUpTo(Size + sizeof(SecondaryHeader), PageSize);
// Account for 2 guard pages, one before and one after the chunk.
MapSize += 2 * PageSize;
@@ -52,27 +52,36 @@ class ScudoLargeMmapAllocator {
UserBeg += Alignment - (UserBeg & (Alignment - 1));
CHECK_GE(UserBeg, MapBeg);
uptr NewMapBeg = UserBeg - HeadersSize;
- NewMapBeg = (NewMapBeg & ~(PageSize - 1)) - PageSize;
+ NewMapBeg = RoundDownTo(NewMapBeg, PageSize) - PageSize;
CHECK_GE(NewMapBeg, MapBeg);
- uptr NewMapSize = MapEnd - NewMapBeg;
- uptr Diff = NewMapBeg - MapBeg;
+ uptr NewMapSize = RoundUpTo(MapSize - Alignment, PageSize);
+ uptr NewMapEnd = NewMapBeg + NewMapSize;
+ CHECK_LE(NewMapEnd, MapEnd);
// Unmap the extra memory if it's large enough.
+ uptr Diff = NewMapBeg - MapBeg;
if (Diff > PageSize)
UnmapOrDie(reinterpret_cast<void *>(MapBeg), Diff);
+ Diff = MapEnd - NewMapEnd;
+ if (Diff > PageSize)
+ UnmapOrDie(reinterpret_cast<void *>(NewMapEnd), Diff);
MapBeg = NewMapBeg;
MapSize = NewMapSize;
+ MapEnd = NewMapEnd;
}
- uptr UserEnd = UserBeg - ChunkHeaderSize + Size;
+ uptr UserEnd = UserBeg - AlignedChunkHeaderSize + Size;
// For larger alignments, Alignment was added by the frontend to Size.
if (Alignment > MinAlignment)
UserEnd -= Alignment;
CHECK_LE(UserEnd, MapEnd - PageSize);
CHECK_EQ(MapBeg + PageSize, reinterpret_cast<uptr>(
MmapFixedOrDie(MapBeg + PageSize, MapSize - 2 * PageSize)));
- uptr Ptr = UserBeg - ChunkHeaderSize;
+ uptr Ptr = UserBeg - AlignedChunkHeaderSize;
SecondaryHeader *Header = getHeader(Ptr);
Header->MapBeg = MapBeg;
Header->MapSize = MapSize;
+ // The primary adds the whole class size to the stats when allocating a
+ // chunk, so we will do something similar here. But we will not account for
+ // the guard pages.
Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize);
Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize);
CHECK(IsAligned(UserBeg, Alignment));
@@ -97,8 +106,8 @@ class ScudoLargeMmapAllocator {
void Deallocate(AllocatorStats *Stats, void *Ptr) {
SecondaryHeader *Header = getHeader(Ptr);
- Stats->Sub(AllocatorStatAllocated, Header->MapSize);
- Stats->Sub(AllocatorStatMapped, Header->MapSize);
+ Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize);
+ Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize);
UnmapOrDie(reinterpret_cast<void *>(Header->MapBeg), Header->MapSize);
}
@@ -154,8 +163,8 @@ class ScudoLargeMmapAllocator {
uptr MapBeg;
uptr MapSize;
};
- // Check that sizeof(SecondaryHeader) is a multiple of 16.
- COMPILER_CHECK((sizeof(SecondaryHeader) & 0xf) == 0);
+ // Check that sizeof(SecondaryHeader) is a multiple of MinAlignment.
+ COMPILER_CHECK((sizeof(SecondaryHeader) & (MinAlignment - 1)) == 0);
SecondaryHeader *getHeader(uptr Ptr) {
return reinterpret_cast<SecondaryHeader*>(Ptr - sizeof(SecondaryHeader));
diff --git a/lib/scudo/scudo_flags.cpp b/lib/scudo/scudo_flags.cpp
index f0d208863..b9c838107 100644
--- a/lib/scudo/scudo_flags.cpp
+++ b/lib/scudo/scudo_flags.cpp
@@ -90,4 +90,4 @@ Flags *getFlags() {
return &ScudoFlags;
}
-}
+} // namespace __scudo
diff --git a/lib/scudo/scudo_flags.h b/lib/scudo/scudo_flags.h
index c16f635d3..d4ae31031 100644
--- a/lib/scudo/scudo_flags.h
+++ b/lib/scudo/scudo_flags.h
@@ -28,6 +28,6 @@ Flags *getFlags();
void initFlags();
-} // namespace __scudo
+} // namespace __scudo
#endif // SCUDO_FLAGS_H_
diff --git a/lib/scudo/scudo_interceptors.cpp b/lib/scudo/scudo_interceptors.cpp
index 9204652d8..735a13196 100644
--- a/lib/scudo/scudo_interceptors.cpp
+++ b/lib/scudo/scudo_interceptors.cpp
@@ -72,4 +72,4 @@ INTERCEPTOR(int, mallopt, int cmd, int value) {
return -1;
}
-#endif // SANITIZER_LINUX
+#endif // SANITIZER_LINUX
diff --git a/lib/scudo/scudo_new_delete.cpp b/lib/scudo/scudo_new_delete.cpp
index 172f5659c..c022bd0ac 100644
--- a/lib/scudo/scudo_new_delete.cpp
+++ b/lib/scudo/scudo_new_delete.cpp
@@ -24,7 +24,7 @@ using namespace __scudo;
// Fake std::nothrow_t to avoid including <new>.
namespace std {
struct nothrow_t {};
-} // namespace std
+} // namespace std
CXX_OPERATOR_ATTRIBUTE
void *operator new(size_t size) {
diff --git a/lib/scudo/scudo_termination.cpp b/lib/scudo/scudo_termination.cpp
index a53338315..c441ff3c1 100644
--- a/lib/scudo/scudo_termination.cpp
+++ b/lib/scudo/scudo_termination.cpp
@@ -39,4 +39,4 @@ void NORETURN CheckFailed(const char *File, int Line, const char *Condition,
File, Line, Condition, Value1, Value2);
}
-} // namespace __sanitizer
+} // namespace __sanitizer
diff --git a/lib/scudo/scudo_utils.cpp b/lib/scudo/scudo_utils.cpp
index 9e6a3512e..2d66865b9 100644
--- a/lib/scudo/scudo_utils.cpp
+++ b/lib/scudo/scudo_utils.cpp
@@ -17,6 +17,7 @@
#include <fcntl.h>
#include <stdarg.h>
#include <unistd.h>
+#include <cpuid.h>
#include <cstring>
@@ -28,7 +29,7 @@ namespace __sanitizer {
extern int VSNPrintf(char *buff, int buff_length, const char *format,
va_list args);
-} // namespace __sanitizer
+} // namespace __sanitizer
namespace __scudo {
@@ -44,60 +45,61 @@ void NORETURN dieWithMessage(const char *Format, ...) {
Die();
}
+#if defined(__x86_64__) || defined(__i386__)
+// i386 and x86_64 specific code to detect CRC32 hardware support via CPUID.
+// CRC32 requires the SSE 4.2 instruction set.
typedef struct {
u32 Eax;
u32 Ebx;
u32 Ecx;
u32 Edx;
-} CPUIDInfo;
+} CPUIDRegs;
-static void getCPUID(CPUIDInfo *info, u32 leaf, u32 subleaf)
+static void getCPUID(CPUIDRegs *Regs, u32 Level)
{
- asm volatile("cpuid"
- : "=a" (info->Eax), "=b" (info->Ebx), "=c" (info->Ecx), "=d" (info->Edx)
- : "a" (leaf), "c" (subleaf)
- );
+ __get_cpuid(Level, &Regs->Eax, &Regs->Ebx, &Regs->Ecx, &Regs->Edx);
}
-// Returns true is the CPU is a "GenuineIntel" or "AuthenticAMD"
-static bool isSupportedCPU()
-{
- CPUIDInfo Info;
-
- getCPUID(&Info, 0, 0);
- if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Genu", 4) == 0 &&
- memcmp(reinterpret_cast<char *>(&Info.Edx), "ineI", 4) == 0 &&
- memcmp(reinterpret_cast<char *>(&Info.Ecx), "ntel", 4) == 0) {
- return true;
- }
- if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Auth", 4) == 0 &&
- memcmp(reinterpret_cast<char *>(&Info.Edx), "enti", 4) == 0 &&
- memcmp(reinterpret_cast<char *>(&Info.Ecx), "cAMD", 4) == 0) {
- return true;
+CPUIDRegs getCPUFeatures() {
+ CPUIDRegs VendorRegs = {};
+ getCPUID(&VendorRegs, 0);
+ bool IsIntel =
+ (VendorRegs.Ebx == signature_INTEL_ebx) &&
+ (VendorRegs.Edx == signature_INTEL_edx) &&
+ (VendorRegs.Ecx == signature_INTEL_ecx);
+ bool IsAMD =
+ (VendorRegs.Ebx == signature_AMD_ebx) &&
+ (VendorRegs.Edx == signature_AMD_edx) &&
+ (VendorRegs.Ecx == signature_AMD_ecx);
+ // Default to an empty feature set if not on a supported CPU.
+ CPUIDRegs FeaturesRegs = {};
+ if (IsIntel || IsAMD) {
+ getCPUID(&FeaturesRegs, 1);
}
- return false;
+ return FeaturesRegs;
}
-bool testCPUFeature(CPUFeature feature)
+#ifndef bit_SSE4_2
+#define bit_SSE4_2 bit_SSE42 // clang and gcc have different defines.
+#endif
+
+bool testCPUFeature(CPUFeature Feature)
{
- static bool InfoInitialized = false;
- static CPUIDInfo CPUInfo = {};
-
- if (InfoInitialized == false) {
- if (isSupportedCPU() == true)
- getCPUID(&CPUInfo, 1, 0);
- else
- UNIMPLEMENTED();
- InfoInitialized = true;
- }
- switch (feature) {
- case SSE4_2:
- return ((CPUInfo.Ecx >> 20) & 0x1) != 0;
+ static CPUIDRegs FeaturesRegs = getCPUFeatures();
+
+ switch (Feature) {
+ case CRC32CPUFeature: // CRC32 is provided by SSE 4.2.
+ return !!(FeaturesRegs.Ecx & bit_SSE4_2);
default:
break;
}
return false;
}
+#else
+bool testCPUFeature(CPUFeature Feature) {
+ return false;
+}
+#endif // defined(__x86_64__) || defined(__i386__)
// readRetry will attempt to read Count bytes from the Fd specified, and if
// interrupted will retry to read additional bytes to reach Count.
@@ -117,17 +119,77 @@ static ssize_t readRetry(int Fd, u8 *Buffer, size_t Count) {
return AmountRead;
}
-// Default constructor for Xorshift128Plus seeds the state with /dev/urandom
-Xorshift128Plus::Xorshift128Plus() {
+static void fillRandom(u8 *Data, ssize_t Size) {
int Fd = open("/dev/urandom", O_RDONLY);
- bool Success = readRetry(Fd, reinterpret_cast<u8 *>(&State_0_),
- sizeof(State_0_)) == sizeof(State_0_);
- Success &= readRetry(Fd, reinterpret_cast<u8 *>(&State_1_),
- sizeof(State_1_)) == sizeof(State_1_);
+ if (Fd < 0) {
+ dieWithMessage("ERROR: failed to open /dev/urandom.\n");
+ }
+ bool Success = readRetry(Fd, Data, Size) == Size;
close(Fd);
if (!Success) {
dieWithMessage("ERROR: failed to read enough data from /dev/urandom.\n");
}
}
-} // namespace __scudo
+// Default constructor for Xorshift128Plus seeds the state with /dev/urandom.
+// TODO(kostyak): investigate using getrandom() if available.
+Xorshift128Plus::Xorshift128Plus() {
+ fillRandom(reinterpret_cast<u8 *>(State), sizeof(State));
+}
+
+const static u32 CRC32Table[] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+ 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+ 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
+ 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+ 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
+ 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+ 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+ 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+ 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+ 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+ 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+ 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+ 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+ 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+ 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+ 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+ 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+ 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+u32 computeCRC32(u32 Crc, uptr Data)
+{
+ for (uptr i = 0; i < sizeof(Data); i++) {
+ Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
+ Data >>= 8;
+ }
+ return Crc;
+}
+
+} // namespace __scudo
diff --git a/lib/scudo/scudo_utils.h b/lib/scudo/scudo_utils.h
index c4f076095..f93f26ef1 100644
--- a/lib/scudo/scudo_utils.h
+++ b/lib/scudo/scudo_utils.h
@@ -30,9 +30,9 @@ inline Dest bit_cast(const Source& source) {
void NORETURN dieWithMessage(const char *Format, ...);
-enum CPUFeature {
- SSE4_2 = 0,
- ENUM_CPUFEATURE_MAX
+enum CPUFeature {
+ CRC32CPUFeature = 0,
+ MaxCPUFeature,
};
bool testCPUFeature(CPUFeature feature);
@@ -42,18 +42,20 @@ struct Xorshift128Plus {
public:
Xorshift128Plus();
u64 Next() {
- u64 x = State_0_;
- const u64 y = State_1_;
- State_0_ = y;
+ u64 x = State[0];
+ const u64 y = State[1];
+ State[0] = y;
x ^= x << 23;
- State_1_ = x ^ y ^ (x >> 17) ^ (y >> 26);
- return State_1_ + y;
+ State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);
+ return State[1] + y;
}
private:
- u64 State_0_;
- u64 State_1_;
+ u64 State[2];
};
-} // namespace __scudo
+// Software CRC32 functions, to be used when SSE 4.2 support is not detected.
+u32 computeCRC32(u32 Crc, uptr Data);
+
+} // namespace __scudo
#endif // SCUDO_UTILS_H_
diff --git a/test/scudo/CMakeLists.txt b/test/scudo/CMakeLists.txt
index b6cb2fd24..42cdaf9e8 100644
--- a/test/scudo/CMakeLists.txt
+++ b/test/scudo/CMakeLists.txt
@@ -1,6 +1,7 @@
set(SCUDO_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(SCUDO_LIT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(SCUDO_TESTSUITES)
set(SCUDO_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
if(NOT COMPILER_RT_STANDALONE_BUILD)
@@ -12,17 +13,30 @@ configure_lit_site_cfg(
${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
)
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)
- STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO})
- STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE)
-endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
+set(SCUDO_TEST_ARCH ${SCUDO_SUPPORTED_ARCH})
+foreach(arch ${SCUDO_TEST_ARCH})
+ set(SCUDO_TEST_TARGET_ARCH ${arch})
+ string(TOLOWER "-${arch}" SCUDO_TEST_CONFIG_SUFFIX)
+
+ if(ANDROID OR ${arch} MATCHES "arm|aarch64")
+ # This is only true if we are cross-compiling.
+ # Build all tests with host compiler and use host tools.
+ set(SCUDO_TEST_TARGET_CFLAGS ${COMPILER_RT_TEST_COMPILER_CFLAGS})
+ else()
+ get_target_flags_for_arch(${arch} SCUDO_TEST_TARGET_CFLAGS)
+ string(REPLACE ";" " " SCUDO_TEST_TARGET_CFLAGS "${SCUDO_TEST_TARGET_CFLAGS}")
+ endif()
-if (SSE42_TRUE AND CMAKE_SIZEOF_VOID_P EQUAL 8)
- add_lit_testsuite(check-scudo
- "Running the Scudo Hardened Allocator tests"
- ${CMAKE_CURRENT_BINARY_DIR}
- DEPENDS ${SCUDO_TEST_DEPS})
- set_target_properties(check-scudo PROPERTIES FOLDER
- "Compiler-RT Misc")
-endif(SSE42_TRUE AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+ string(TOUPPER ${arch} ARCH_UPPER_CASE)
+ set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+
+ configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
+ ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg)
+ list(APPEND SCUDO_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endforeach()
+
+add_lit_testsuite(check-scudo "Running the Scudo Hardened Allocator tests"
+ ${SCUDO_TESTSUITES}
+ DEPENDS ${SCUDO_TEST_DEPS})
+set_target_properties(check-scudo PROPERTIES FOLDER "Compiler-RT Misc")
diff --git a/test/scudo/alignment.cpp b/test/scudo/alignment.cpp
index c5e57d179..a6eca87a8 100644
--- a/test/scudo/alignment.cpp
+++ b/test/scudo/alignment.cpp
@@ -1,11 +1,10 @@
// RUN: %clang_scudo %s -o %t
// RUN: not %run %t pointers 2>&1 | FileCheck %s
-// Tests that a non-16-byte aligned pointer will trigger the associated error
-// on deallocation.
+// Tests that a non MinAlignment aligned pointer will trigger the associated
+// error on deallocation.
#include <assert.h>
-#include <malloc.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -17,7 +16,7 @@ int main(int argc, char **argv)
void *p = malloc(1U << 16);
if (!p)
return 1;
- free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p) | 8));
+ free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p) | 1));
}
return 0;
}
diff --git a/test/scudo/double-free.cpp b/test/scudo/double-free.cpp
index 4f5bf0cb8..75919f0c4 100644
--- a/test/scudo/double-free.cpp
+++ b/test/scudo/double-free.cpp
@@ -46,4 +46,4 @@ int main(int argc, char **argv)
return 0;
}
-// CHECK: ERROR: invalid chunk state when deallocating address
+// CHECK: ERROR: invalid chunk state
diff --git a/test/scudo/interface.cpp b/test/scudo/interface.cpp
new file mode 100644
index 000000000..f9353066e
--- /dev/null
+++ b/test/scudo/interface.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_scudo %s -o %t
+// RUN: %run %t 2>&1
+
+// Tests that the sanitizer interface functions behave appropriately.
+
+#include <stdlib.h>
+
+#include <vector>
+
+#include <sanitizer/allocator_interface.h>
+
+int main(int argc, char **argv)
+{
+ void *p;
+ std::vector<ssize_t> sizes{1, 8, 16, 32, 1024, 32768,
+ 1 << 16, 1 << 17, 1 << 20, 1 << 24};
+ for (size_t size : sizes) {
+ p = malloc(size);
+ if (!p)
+ return 1;
+ if (!__sanitizer_get_ownership(p))
+ return 1;
+ if (__sanitizer_get_allocated_size(p) < size)
+ return 1;
+ free(p);
+ }
+ return 0;
+}
diff --git a/test/scudo/lit.cfg b/test/scudo/lit.cfg
index e2a4997dd..4eff2ce21 100644
--- a/test/scudo/lit.cfg
+++ b/test/scudo/lit.cfg
@@ -3,7 +3,7 @@
import os
# Setup config name.
-config.name = 'Scudo'
+config.name = 'Scudo' + config.name_suffix
# Setup source root.
config.test_source_root = os.path.dirname(__file__)
@@ -14,18 +14,19 @@ base_lib = os.path.join(config.compiler_rt_libdir,
whole_archive = "-Wl,-whole-archive %s -Wl,-no-whole-archive " % base_lib
# Test suffixes.
-config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.ll', '.test']
+config.suffixes = ['.c', '.cc', '.cpp']
# C flags.
-c_flags = ["-std=c++11",
+c_flags = ([config.target_cflags] +
+ ["-std=c++11",
"-lstdc++",
- "-ldl",
"-lrt",
- "-pthread",
"-latomic",
+ "-ldl",
+ "-pthread",
"-fPIE",
"-pie",
- "-O0"]
+ "-O0"])
def build_invocation(compile_flags):
return " " + " ".join([config.clang] + compile_flags) + " "
diff --git a/test/scudo/lit.site.cfg.in b/test/scudo/lit.site.cfg.in
index 64e2fb39e..429951875 100644
--- a/test/scudo/lit.site.cfg.in
+++ b/test/scudo/lit.site.cfg.in
@@ -1,5 +1,9 @@
@LIT_SITE_CFG_IN_HEADER@
+config.name_suffix = "@SCUDO_TEST_CONFIG_SUFFIX@"
+config.target_arch = "@SCUDO_TEST_TARGET_ARCH@"
+config.target_cflags = "@SCUDO_TEST_TARGET_CFLAGS@"
+
# Load common config for all compiler-rt lit tests.
lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
diff --git a/test/scudo/malloc.cpp b/test/scudo/malloc.cpp
index 0f452e360..cafc744a2 100644
--- a/test/scudo/malloc.cpp
+++ b/test/scudo/malloc.cpp
@@ -2,9 +2,9 @@
// RUN: %run %t 2>&1
// Tests that a regular workflow of allocation, memory fill and free works as
-// intended. Also tests that a zero-sized allocation succeeds.
+// intended. Tests various sizes serviced by the primary and secondary
+// allocators.
-#include <malloc.h>
#include <stdlib.h>
#include <string.h>
@@ -13,18 +13,25 @@
int main(int argc, char **argv)
{
void *p;
- std::vector<size_t> sizes{1, 1 << 5, 1 << 10, 1 << 15, 1 << 20};
+ std::vector<ssize_t> sizes{1, 8, 16, 32, 1024, 32768,
+ 1 << 16, 1 << 17, 1 << 20, 1 << 24};
+ std::vector<int> offsets{1, 0, -1, -7, -8, -15, -16, -31, -32};
p = malloc(0);
if (!p)
return 1;
free(p);
- for (size_t size : sizes) {
- p = malloc(size);
- if (!p)
- return 1;
- memset(p, 'A', size);
- free(p);
+ for (ssize_t size : sizes) {
+ for (int offset: offsets) {
+ ssize_t actual_size = size + offset;
+ if (actual_size <= 0)
+ continue;
+ p = malloc(actual_size);
+ if (!p)
+ return 1;
+ memset(p, 0xff, actual_size);
+ free(p);
+ }
}
return 0;
diff --git a/test/scudo/memalign.cpp b/test/scudo/memalign.cpp
index 3ad3fb055..6f4c50e69 100644
--- a/test/scudo/memalign.cpp
+++ b/test/scudo/memalign.cpp
@@ -31,7 +31,7 @@ int main(int argc, char **argv)
return 1;
free(p);
// Tests various combinations of alignment and sizes
- for (int i = 4; i < 20; i++) {
+ for (int i = (sizeof(void *) == 4) ? 3 : 4; i <= 24; i++) {
alignment = 1U << i;
for (int j = 1; j < 33; j++) {
size = 0x800 * j;
diff --git a/test/scudo/mismatch.cpp b/test/scudo/mismatch.cpp
index 2d3d198af..54cdafc86 100644
--- a/test/scudo/mismatch.cpp
+++ b/test/scudo/mismatch.cpp
@@ -30,7 +30,7 @@ int main(int argc, char **argv)
free((void *)p);
}
if (!strcmp(argv[1], "memaligndel")) {
- int *p = (int *)memalign(0x10, 0x10);
+ int *p = (int *)memalign(16, 16);
if (!p)
return 1;
delete p;
diff --git a/test/scudo/overflow.cpp b/test/scudo/overflow.cpp
index 5b2cb7560..c93a544ea 100644
--- a/test/scudo/overflow.cpp
+++ b/test/scudo/overflow.cpp
@@ -11,12 +11,13 @@
int main(int argc, char **argv)
{
assert(argc == 2);
+ ssize_t offset = sizeof(void *) == 8 ? 8 : 0;
if (!strcmp(argv[1], "malloc")) {
// Simulate a header corruption of an allocated chunk (1-bit)
void *p = malloc(1U << 4);
if (!p)
return 1;
- ((char *)p)[-1] ^= 1;
+ ((char *)p)[-(offset + 1)] ^= 1;
free(p);
}
if (!strcmp(argv[1], "quarantine")) {
@@ -25,7 +26,7 @@ int main(int argc, char **argv)
return 1;
free(p);
// Simulate a header corruption of a quarantined chunk
- ((char *)p)[-2] ^= 1;
+ ((char *)p)[-(offset + 2)] ^= 1;
// Trigger the quarantine recycle
for (int i = 0; i < 0x100; i++) {
p = malloc(1U << 16);
diff --git a/test/scudo/preinit.cpp b/test/scudo/preinit.cpp
index a280ae1d4..34f61c9dd 100644
--- a/test/scudo/preinit.cpp
+++ b/test/scudo/preinit.cpp
@@ -4,7 +4,6 @@
// Verifies that calling malloc in a preinit_array function succeeds, and that
// the resulting pointer can be freed at program termination.
-#include <malloc.h>
#include <stdlib.h>
#include <string.h>
diff --git a/test/scudo/random_shuffle.cpp b/test/scudo/random_shuffle.cpp
index 54768a578..026834034 100644
--- a/test/scudo/random_shuffle.cpp
+++ b/test/scudo/random_shuffle.cpp
@@ -7,6 +7,7 @@
// RUN: %run %t 10000 > %T/random_shuffle_tmp_dir/out2
// RUN: not diff %T/random_shuffle_tmp_dir/out?
// RUN: rm -rf %T/random_shuffle_tmp_dir
+// UNSUPPORTED: i386-linux,i686-linux
// Tests that the allocator shuffles the chunks before returning to the user.
diff --git a/test/scudo/realloc.cpp b/test/scudo/realloc.cpp
index dfad40197..cc4459500 100644
--- a/test/scudo/realloc.cpp
+++ b/test/scudo/realloc.cpp
@@ -20,7 +20,7 @@ int main(int argc, char **argv)
{
void *p, *old_p;
// Those sizes will exercise both allocators (Primary & Secondary).
- std::vector<size_t> sizes{1, 1 << 5, 1 << 10, 1 << 15, 1 << 20};
+ std::vector<size_t> sizes{1, 16, 1024, 32768, 1 << 16, 1 << 17, 1 << 20};
assert(argc == 2);
for (size_t size : sizes) {