author	Robin Morisset <morisset@google.com>	2014-10-02 22:27:07 +0000
committer	Robin Morisset <morisset@google.com>	2014-10-02 22:27:07 +0000
commit	2b1874cbd4faf32d73bf623d7ed88be3ad135e8f (patch)
tree	57c2e764b33c1e54e3ec2a970c2c24106ccb2f37 /test
parent	bbb28e7e98250e4fa218587633c700ee009e261f (diff)
[Power] Improve the expansion of atomic loads/stores
Summary:
Atomic loads and stores of up to the native size (32 bits, or 64 for PPC64)
can be lowered to a simple load or store instruction (as the synchronization
is already handled by AtomicExpand, and the atomicity is guaranteed thanks to
the alignment requirements of atomic accesses). This is exactly what this
patch does. Previously, these were implemented by complex
load-linked/store-conditional loops, an obvious performance problem.

For example, this patch turns
```
define void @store_i8_unordered(i8* %mem) {
  store atomic i8 42, i8* %mem unordered, align 1
  ret void
}
```
from
```
_store_i8_unordered:                    ; @store_i8_unordered
; BB#0:
	rlwinm r2, r3, 3, 27, 28
	li r4, 42
	xori r5, r2, 24
	rlwinm r2, r3, 0, 0, 29
	li r3, 255
	slw r4, r4, r5
	slw r3, r3, r5
	and r4, r4, r3
LBB4_1:                                 ; =>This Inner Loop Header: Depth=1
	lwarx r5, 0, r2
	andc r5, r5, r3
	or r5, r4, r5
	stwcx. r5, 0, r2
	bne cr0, LBB4_1
; BB#2:
	blr
```
into
```
_store_i8_unordered:                    ; @store_i8_unordered
; BB#0:
	li r2, 42
	stb r2, 0(r3)
	blr
```
which looks like a pretty clear win to me.

Test Plan: fixed the tests + new test for indexed accesses + make check-all

Reviewers: jfb, wschmidt, hfinkel

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D5587

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218922 91177308-0d34-0410-b5e6-96231b3b80d8
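The load side is symmetric. As a minimal companion sketch (the function below is taken verbatim from the updated atomics.ll in this patch, and the expected assembly is exactly what its CHECK lines verify), an acquire load now becomes a plain load followed by a lightweight barrier:
```
define i32 @load_i32_acquire(i32* %mem) {
  ; expected lowering (per the CHECK lines below): a plain lwz
  ; followed by "sync 1" (lwsync), with no lwarx/stwcx. loop
  %val = load atomic i32* %mem acquire, align 4
  ret i32 %val
}
```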
Diffstat (limited to 'test')
-rw-r--r--	test/CodeGen/PowerPC/atomic-2.ll	| 11
-rw-r--r--	test/CodeGen/PowerPC/atomics-indexed.ll	| 81
-rw-r--r--	test/CodeGen/PowerPC/atomics.ll	| 12
-rw-r--r--	test/CodeGen/PowerPC/pr15630.ll	| 3
4 files changed, 101 insertions(+), 6 deletions(-)
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 843250f10b4..9cb0fa5be5c 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -30,8 +30,9 @@ define void @atomic_store(i64* %mem, i64 %val) nounwind {
entry:
; CHECK: @atomic_store
store atomic i64 %val, i64* %mem release, align 64
-; CHECK: ldarx
-; CHECK: stdcx.
+; CHECK: sync 1
+; CHECK-NOT: stdcx
+; CHECK: std
ret void
}
@@ -39,9 +40,9 @@ define i64 @atomic_load(i64* %mem) nounwind {
entry:
; CHECK: @atomic_load
%tmp = load atomic i64* %mem acquire, align 64
-; CHECK: ldarx
-; CHECK: stdcx.
-; CHECK: stdcx.
+; CHECK-NOT: ldarx
+; CHECK: ld
+; CHECK: sync 1
ret i64 %tmp
}
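(A note on the barrier mnemonics in these checks: `sync 1` is the form in which the assembler prints `lwsync`, the lightweight barrier used for acquire/release orderings, while `sync 0` is the full `hwsync` used for seq_cst. The point of the new checks is that the ldarx/stdcx. loop is gone and only a plain ld/std plus the appropriate barrier remains.)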
diff --git a/test/CodeGen/PowerPC/atomics-indexed.ll b/test/CodeGen/PowerPC/atomics-indexed.ll
new file mode 100644
index 00000000000..bb9ca040196
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-indexed.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fails on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; In this file, we check that atomic load/store can make use of the indexed
+; versions of the instructions.
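+; ("Indexed" here means the reg+reg addressing forms, e.g. lbzx/sthx/stwx,
+; where the effective address is the sum of two registers rather than a
+; register plus an immediate offset.)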
+
+; Indexed versions of loads
+define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
+; CHECK-LABEL: load_x_i8_seq_cst
+; CHECK: sync 0
+; CHECK: lbzx
+; CHECK: sync 1
+ %ptr = getelementptr inbounds [100000 x i8]* %mem, i64 0, i64 90000
+ %val = load atomic i8* %ptr seq_cst, align 1
+ ret i8 %val
+}
+define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
+; CHECK-LABEL: load_x_i16_acquire
+; CHECK: lhzx
+; CHECK: sync 1
+ %ptr = getelementptr inbounds [100000 x i16]* %mem, i64 0, i64 90000
+ %val = load atomic i16* %ptr acquire, align 2
+ ret i16 %val
+}
+define i32 @load_x_i32_monotonic([100000 x i32]* %mem) {
+; CHECK-LABEL: load_x_i32_monotonic
+; CHECK: lwzx
+; CHECK-NOT: sync
+ %ptr = getelementptr inbounds [100000 x i32]* %mem, i64 0, i64 90000
+ %val = load atomic i32* %ptr monotonic, align 4
+ ret i32 %val
+}
+define i64 @load_x_i64_unordered([100000 x i64]* %mem) {
+; CHECK-LABEL: load_x_i64_unordered
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: ldx
+; CHECK-NOT: sync
+ %ptr = getelementptr inbounds [100000 x i64]* %mem, i64 0, i64 90000
+ %val = load atomic i64* %ptr unordered, align 8
+ ret i64 %val
+}
+
+; Indexed versions of stores
+define void @store_x_i8_seq_cst([100000 x i8]* %mem) {
+; CHECK-LABEL: store_x_i8_seq_cst
+; CHECK: sync 0
+; CHECK: stbx
+ %ptr = getelementptr inbounds [100000 x i8]* %mem, i64 0, i64 90000
+ store atomic i8 42, i8* %ptr seq_cst, align 1
+ ret void
+}
+define void @store_x_i16_release([100000 x i16]* %mem) {
+; CHECK-LABEL: store_x_i16_release
+; CHECK: sync 1
+; CHECK: sthx
+ %ptr = getelementptr inbounds [100000 x i16]* %mem, i64 0, i64 90000
+ store atomic i16 42, i16* %ptr release, align 2
+ ret void
+}
+define void @store_x_i32_monotonic([100000 x i32]* %mem) {
+; CHECK-LABEL: store_x_i32_monotonic
+; CHECK-NOT: sync
+; CHECK: stwx
+ %ptr = getelementptr inbounds [100000 x i32]* %mem, i64 0, i64 90000
+ store atomic i32 42, i32* %ptr monotonic, align 4
+ ret void
+}
+define void @store_x_i64_unordered([100000 x i64]* %mem) {
+; CHECK-LABEL: store_x_i64_unordered
+; CHECK-NOT: sync 0
+; CHECK-NOT: sync 1
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: stdx
+ %ptr = getelementptr inbounds [100000 x i64]* %mem, i64 0, i64 90000
+ store atomic i64 42, i64* %ptr unordered, align 8
+ ret void
+}
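A note on the PPC32/PPC64 split in the i64 cases above: 32-bit PowerPC has no native 64-bit load/store, so the PPC32 run is expected to fall back to a `__sync_*` library call, while the PPC64 run should emit the plain indexed `ldx`/`stdx`.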
diff --git a/test/CodeGen/PowerPC/atomics.ll b/test/CodeGen/PowerPC/atomics.ll
index 4c4ccb4df83..5f6a6a4dcdf 100644
--- a/test/CodeGen/PowerPC/atomics.ll
+++ b/test/CodeGen/PowerPC/atomics.ll
@@ -11,18 +11,21 @@
; We also vary orderings to check for barriers.
define i8 @load_i8_unordered(i8* %mem) {
; CHECK-LABEL: load_i8_unordered
+; CHECK: lbz
; CHECK-NOT: sync
%val = load atomic i8* %mem unordered, align 1
ret i8 %val
}
define i16 @load_i16_monotonic(i16* %mem) {
; CHECK-LABEL: load_i16_monotonic
+; CHECK: lhz
; CHECK-NOT: sync
%val = load atomic i16* %mem monotonic, align 2
ret i16 %val
}
define i32 @load_i32_acquire(i32* %mem) {
; CHECK-LABEL: load_i32_acquire
+; CHECK: lwz
%val = load atomic i32* %mem acquire, align 4
; CHECK: sync 1
ret i32 %val
@@ -30,6 +33,9 @@ define i32 @load_i32_acquire(i32* %mem) {
define i64 @load_i64_seq_cst(i64* %mem) {
; CHECK-LABEL: load_i64_seq_cst
; CHECK: sync 0
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: ld
%val = load atomic i64* %mem seq_cst, align 8
; CHECK: sync 1
ret i64 %val
@@ -39,24 +45,30 @@ define i64 @load_i64_seq_cst(i64* %mem) {
define void @store_i8_unordered(i8* %mem) {
; CHECK-LABEL: store_i8_unordered
; CHECK-NOT: sync
+; CHECK: stb
store atomic i8 42, i8* %mem unordered, align 1
ret void
}
define void @store_i16_monotonic(i16* %mem) {
; CHECK-LABEL: store_i16_monotonic
; CHECK-NOT: sync
+; CHECK: sth
store atomic i16 42, i16* %mem monotonic, align 2
ret void
}
define void @store_i32_release(i32* %mem) {
; CHECK-LABEL: store_i32_release
; CHECK: sync 1
+; CHECK: stw
store atomic i32 42, i32* %mem release, align 4
ret void
}
define void @store_i64_seq_cst(i64* %mem) {
; CHECK-LABEL: store_i64_seq_cst
; CHECK: sync 0
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: std
store atomic i64 42, i64* %mem seq_cst, align 8
ret void
}
diff --git a/test/CodeGen/PowerPC/pr15630.ll b/test/CodeGen/PowerPC/pr15630.ll
index c5ba8a4d4f0..3c1b604f009 100644
--- a/test/CodeGen/PowerPC/pr15630.ll
+++ b/test/CodeGen/PowerPC/pr15630.ll
@@ -13,4 +13,5 @@ entry:
ret void
}
-; CHECK: stwcx.
+; CHECK: sync
+; CHECK: stb
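(pr15630.ll stores an atomic i8; under the new lowering its check is updated from the stwcx.-based loop to a barrier plus a plain stb.)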