author     Hans Wennborg <hans@hanshq.net>    2018-08-09 12:37:40 +0000
committer  Hans Wennborg <hans@hanshq.net>    2018-08-09 12:37:40 +0000
commit     f7ec630085d9746dc26713dfb7c2c0fc8a691251 (patch)
tree       43bbad721c0e98c8a7cd8d87e16f43b6d082c5ac
parent     ef7dd703b18efb1109df25124f91d05444358790 (diff)
Merging r339316:
------------------------------------------------------------------------
r339316 | hahnfeld | 2018-08-09 09:45:49 +0200 (Thu, 09 Aug 2018) | 16 lines

[NVPTX] Select atomic loads and stores

According to the PTX ISA, .volatile has the same memory synchronization
semantics as .relaxed.sys, so it can be used to implement monotonic
atomic loads and stores. This is important for OpenMP's atomic construct,
where
- 'read's and 'write's are lowered to atomic loads and stores, and
- updates of float or double types are lowered into a cmpxchg loop.

(Note that PTX could do better because it has atom.add.f{32,64}, but
LLVM's atomicrmw instruction only allows integer types.)

Higher levels of atomicity (like acquire and release) need additional
synchronization properties, which were added with PTX ISA 6.0 / sm_70.
So using these instructions still results in an error.

Differential Revision: https://reviews.llvm.org/D50391
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@339338 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 116
-rw-r--r--  test/CodeGen/NVPTX/load-store.ll       |  88
2 files changed, 170 insertions, 34 deletions
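
For context, the cmpxchg loop mentioned in the commit message looks roughly like the following LLVM IR sketch. It is not taken from the patch or from the OpenMP runtime; the function name @update_float and the block labels are illustrative only. The float value is bitcast to i32 so that the integer-only cmpxchg can carry out the monotonic read-modify-write, and the initial monotonic atomic load is one of the cases this patch makes selectable:

    define void @update_float(float* %ptr, float %inc) {
    entry:
      %iptr = bitcast float* %ptr to i32*
      ; read the current value once, then retry the CAS until it succeeds
      %init = load atomic i32, i32* %iptr monotonic, align 4
      br label %loop
    loop:
      %old = phi i32 [ %init, %entry ], [ %cur, %loop ]
      %old.f = bitcast i32 %old to float
      %new.f = fadd float %old.f, %inc
      %new = bitcast float %new.f to i32
      %pair = cmpxchg i32* %iptr, i32 %old, i32 %new monotonic monotonic
      %cur = extractvalue { i32, i1 } %pair, 0
      %ok = extractvalue { i32, i1 } %pair, 1
      br i1 %ok, label %done, label %loop
    done:
      ret void
    }
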
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4dfa8477a36..21939d836dc 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -81,10 +82,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
case ISD::LOAD:
+ case ISD::ATOMIC_LOAD:
if (tryLoad(N))
return;
break;
case ISD::STORE:
+ case ISD::ATOMIC_STORE:
if (tryStore(N))
return;
break;
@@ -834,17 +837,27 @@ static Optional<unsigned> pickOpcodeForVT(
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc dl(N);
- LoadSDNode *LD = cast<LoadSDNode>(N);
+ MemSDNode *LD = cast<MemSDNode>(N);
+ assert(LD->readMem() && "Expected load");
+ LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();
SDNode *NVPTXLD = nullptr;
// do not support pre/post inc/dec
- if (LD->isIndexed())
+ if (PlainLoad && PlainLoad->isIndexed())
return false;
if (!LoadedVT.isSimple())
return false;
+ AtomicOrdering Ordering = LD->getOrdering();
+ // In order to lower atomic loads with stronger guarantees we would need to
+ // use load.acquire or insert fences. However these features were only added
+ // with PTX ISA 6.0 / sm_70.
+ // TODO: Check if we can actually use the new instructions and implement them.
+ if (isStrongerThanMonotonic(Ordering))
+ return false;
+
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
@@ -855,8 +868,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
// Volatile Setting
- // - .volatile is only availalble for .global and .shared
- bool isVolatile = LD->isVolatile();
+ // - .volatile is only available for .global and .shared
+ // - .volatile has the same memory synchronization semantics as .relaxed.sys
+ bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
@@ -882,7 +896,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
fromTypeWidth = 32;
}
- if ((LD->getExtensionType() == ISD::SEXTLOAD))
+ if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
fromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
// f16 uses .b16 as its storage type.
@@ -1691,25 +1705,38 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDLoc dl(N);
- StoreSDNode *ST = cast<StoreSDNode>(N);
+ MemSDNode *ST = cast<MemSDNode>(N);
+ assert(ST->writeMem() && "Expected store");
+ StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
+ AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+ assert((PlainStore || AtomicStore) && "Expected store");
EVT StoreVT = ST->getMemoryVT();
SDNode *NVPTXST = nullptr;
// do not support pre/post inc/dec
- if (ST->isIndexed())
+ if (PlainStore && PlainStore->isIndexed())
return false;
if (!StoreVT.isSimple())
return false;
+ AtomicOrdering Ordering = ST->getOrdering();
+ // In order to lower atomic stores with stronger guarantees we would need to
+ // use store.release or insert fences. However these features were only added
+ // with PTX ISA 6.0 / sm_70.
+ // TODO: Check if we can actually use the new instructions and implement them.
+ if (isStrongerThanMonotonic(Ordering))
+ return false;
+
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Volatile Setting
- // - .volatile is only availalble for .global and .shared
- bool isVolatile = ST->isVolatile();
+ // - .volatile is only available for .global and .shared
+ // - .volatile has the same memory synchronization semantics as .relaxed.sys
+ bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
@@ -1739,41 +1766,53 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
toType = NVPTX::PTXLdStInstCode::Unsigned;
// Create the machine instruction DAG
- SDValue Chain = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDValue N2 = N->getOperand(2);
+ SDValue Chain = ST->getChain();
+ SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
+ SDValue BasePtr = ST->getBasePtr();
SDValue Addr;
SDValue Offset, Base;
Optional<unsigned> Opcode;
- MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+ MVT::SimpleValueType SourceVT =
+ Value.getNode()->getSimpleValueType(0).SimpleTy;
- if (SelectDirectAddr(N2, Addr)) {
+ if (SelectDirectAddr(BasePtr, Addr)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
- getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
- Chain };
+ SDValue Ops[] = {Value,
+ getI32Imm(isVolatile, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(toType, dl),
+ getI32Imm(toTypeWidth, dl),
+ Addr,
+ Chain};
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
- } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
- : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ } else if (PointerSize == 64
+ ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
+ : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
- getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
- Offset, Chain };
+ SDValue Ops[] = {Value,
+ getI32Imm(isVolatile, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(toType, dl),
+ getI32Imm(toTypeWidth, dl),
+ Base,
+ Offset,
+ Chain};
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
- } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
- : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ } else if (PointerSize == 64
+ ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
+ : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
if (PointerSize == 64)
Opcode = pickOpcodeForVT(
SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
@@ -1787,10 +1826,15 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
- SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
- getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
- Offset, Chain };
+ SDValue Ops[] = {Value,
+ getI32Imm(isVolatile, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(toType, dl),
+ getI32Imm(toTypeWidth, dl),
+ Base,
+ Offset,
+ Chain};
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else {
if (PointerSize == 64)
@@ -1806,10 +1850,14 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
- getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
- getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
- Chain };
+ SDValue Ops[] = {Value,
+ getI32Imm(isVolatile, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(toType, dl),
+ getI32Imm(toTypeWidth, dl),
+ BasePtr,
+ Chain};
NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
}
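
The isStrongerThanMonotonic() bail-outs in both tryLoad and tryStore mean that orderings beyond monotonic still fail to select and, as the commit message notes, still result in an error. A minimal IR sketch of an ordering that remains rejected (the function name @acquire_load is illustrative, not from the test suite):

    define i32 @acquire_load(i32* %p) {
      ; acquire needs ld.acquire or a fence, only available with PTX ISA 6.0 / sm_70
      %v = load atomic i32, i32* %p acquire, align 4
      ret i32 %v
    }
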
diff --git a/test/CodeGen/NVPTX/load-store.ll b/test/CodeGen/NVPTX/load-store.ll
new file mode 100644
index 00000000000..03b0109dea2
--- /dev/null
+++ b/test/CodeGen/NVPTX/load-store.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; CHECK-LABEL: plain
+define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+ ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, i8* %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, i8* %a
+
+ ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, i16* %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, i16* %b
+
+ ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, i32* %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, i32* %c
+
+ ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, i64* %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, i64* %d
+
+ ret void
+}
+
+; CHECK-LABEL: volatile
+define void @volatile(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, i8* %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, i8* %a
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, i16* %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, i16* %b
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, i32* %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, i32* %c
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, i64* %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, i64* %d
+
+ ret void
+}
+
+; CHECK-LABEL: monotonic
+define void @monotonic(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, i8* %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, i8* %a monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, i16* %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, i16* %b monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, i32* %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, i32* %c monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, i64* %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, i64* %d monotonic, align 8
+
+ ret void
+}
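
The test above only exercises the integer widths. Per the commit message, OpenMP's atomic 'read' and 'write' of float or double are also lowered to atomic loads and stores, so IR of the following shape is the kind of input this change is meant to support. This is a hedged sketch, not part of the committed test; @float_rw is an illustrative name:

    define void @float_rw(float* %a, float* %b) {
      ; monotonic atomic load and store of a float, as produced for OpenMP read/write
      %v = load atomic float, float* %a monotonic, align 4
      store atomic float %v, float* %b monotonic, align 4
      ret void
    }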