summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/llvm/Target/TargetSchedule.td63
-rw-r--r--lib/Target/X86/X86ScheduleBtVer2.td9
-rw-r--r--test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s15
-rw-r--r--test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s35
-rw-r--r--test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s30
-rw-r--r--test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s14
-rw-r--r--test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s32
-rw-r--r--tools/llvm-mca/Instruction.cpp25
-rw-r--r--tools/llvm-mca/Instruction.h12
-rw-r--r--tools/llvm-mca/RegisterFile.cpp135
-rw-r--r--tools/llvm-mca/RegisterFile.h35
11 files changed, 275 insertions, 130 deletions
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 29daa029bb2..6fd2d5b78e5 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -453,15 +453,60 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
SchedMachineModel SchedModel = ?;
}
-// Allow the definition of processor register files.
-// Each processor register file declares the number of physical registers, as
-// well as a optional register cost information. The cost of a register R is the
-// number of physical registers used to rename R (at register renaming stage).
-// That value defaults to 1, to all the registers contained in the register
-// file. The set of target register files is inferred from the list of register
-// classes. Register costs are defined at register class granularity. An empty
-// list of register classes means that this register file contains all the
-// registers defined by the target.
+// Allow the definition of processor register files for register renaming
+// purposes.
+//
+// Each processor register file declares:
+// - The set of registers that can be renamed.
+// - The number of physical registers which can be used for register renaming
+// purpose.
+// - The cost of a register rename.
+//
+// The cost of a rename is the number of physical registers allocated by the
+// register alias table to map the new definition. By default, register can be
+// renamed at the cost of a single physical register. Note that register costs
+// are defined at register class granularity (see field `Costs`).
+//
+// The set of registers that are subject to register renaming is declared using
+// a list of register classes (see field `RegClasses`). An empty list of
+// register classes means: all the logical registers defined by the target can
+// be fully renamed.
+//
+// A register R can be renamed if its register class appears in the `RegClasses`
+// set. When R is written, a new alias is allocated at the cost of one or more
+// physical registers; as a result, false dependencies on R are removed.
+//
+// A sub-register V of register R is implicitly part of the same register file.
+// However, V is only renamed if its register class is part of `RegClasses`.
+// Otherwise, the processor keeps it (as well as any other different part
+// of R) together with R, and a write of V always causes a compulsory read of R.
+//
+// This is what happens for example on AMD processors (at least from Bulldozer
+// onwards), where AL and AH are not treated as independent from AX, and AX is
+// not treated as independent from EAX. A write to AL has an implicity false
+// dependency on the last write to EAX (or a portion of EAX). As a consequence,
+// a write to AL cannot go in parallel with a write to AH.
+//
+// There is no false dependency if the partial register write belongs to a
+// register class that is in `RegClasses`.
+// There is also no penalty for writes that "clear the content a super-register"
+// (see MC/MCInstrAnalysis.h - method MCInstrAnalysis::clearsSuperRegisters()).
+// On x86-64, 32-bit GPR writes implicitly zero the upper half of the underlying
+// physical register, effectively removing any false dependencies with the
+// previous register definition.
+//
+// TODO: This implementation assumes that there is no limit in the number of
+// renames per cycle, which might not be true for all hardware or register
+// classes. Also, there is no limit to how many times the same logical register
+// can be renamed during the same cycle.
+//
+// TODO: we don't currently model merge penalties for the case where a write to
+// a part of a register is followed by a read from a larger part of the same
+// register. On some Intel chips, different parts of a GPR can be stored in
+// different physical registers. However, there is a cost to pay for when the
+// partial write is combined with the previous super-register definition. We
+// should add support for these cases, and correctly model merge problems with
+// partial register accesses.
class RegisterFile<int numPhysRegs, list<RegisterClass> Classes = [],
list<int> Costs = []> {
list<RegisterClass> RegClasses = Classes;
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index ce1d49a82db..177aa01f1d4 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -41,7 +41,14 @@ def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
-def JIntegerPRF : RegisterFile<64, [GR8, GR16, GR32, GR64, CCR]>;
+//
+// The processor always keeps the different parts of an integer register
+// together. An instruction that writes to a part of a register will therefore
+// have a false dependence on any previous write to the same register or any
+// part of it.
+// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
+// access" - Agner Fog's "microarchitecture.pdf".
+def JIntegerPRF : RegisterFile<64, [GR64, CCR]>;
// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
diff --git a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s
index c8cc463b015..fb6eb234801 100644
--- a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s
+++ b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s
@@ -7,9 +7,9 @@ add %ecx, %ebx
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 3
-# CHECK-NEXT: Total Cycles: 10
+# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 0.30
+# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Instruction Info:
@@ -26,11 +26,12 @@ add %ecx, %ebx
# CHECK-NEXT: 1 1 0.50 addl %ecx, %ebx
# CHECK: Timeline view:
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER. imulq %rax, %rbx
-# CHECK-NEXT: [0,1] .DeE----R. lzcntw %ax, %bx
-# CHECK-NEXT: [0,2] .D=====eER addl %ecx, %ebx
+# CHECK: [0,0] DeeeeeeER . imulq %rax, %rbx
+# CHECK-NEXT: [0,1] .D=====eER. lzcntw %ax, %bx
+# CHECK-NEXT: [0,2] .D======eER addl %ecx, %ebx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -40,5 +41,5 @@ add %ecx, %ebx
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
-# CHECK-NEXT: 1. 1 1.0 1.0 4.0 lzcntw %ax, %bx
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1. 1 6.0 0.0 0.0 lzcntw %ax, %bx
+# CHECK-NEXT: 2. 1 7.0 0.0 0.0 addl %ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s
index 2384528223a..0a9ffca64a5 100644
--- a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s
+++ b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s
@@ -12,9 +12,9 @@ xor %bx, %dx
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
-# CHECK-NEXT: Total Cycles: 2254
+# CHECK-NEXT: Total Cycles: 4503
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 2.00
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 1.5
# CHECK: Instruction Info:
@@ -52,22 +52,23 @@ xor %bx, %dx
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
-# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addw %cx, %dx
-# CHECK-NEXT: - 1.00 - - - - - - - - - - - - movw %ax, %dx
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addw %cx, %dx
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - movw %ax, %dx
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - xorw %bx, %dx
# CHECK: Timeline view:
-# CHECK-NEXT: Index 012345678
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . . addw %cx, %dx
-# CHECK-NEXT: [0,1] DeER . . movw %ax, %dx
-# CHECK-NEXT: [0,2] .DeER. . xorw %bx, %dx
-# CHECK-NEXT: [1,0] .D=eER . addw %cx, %dx
-# CHECK-NEXT: [1,1] . DeER . movw %ax, %dx
-# CHECK-NEXT: [1,2] . D=eER . xorw %bx, %dx
-# CHECK-NEXT: [2,0] . D=eER. addw %cx, %dx
-# CHECK-NEXT: [2,1] . DeE-R. movw %ax, %dx
-# CHECK-NEXT: [2,2] . DeE-R xorw %bx, %dx
+# CHECK: [0,0] DeER . .. addw %cx, %dx
+# CHECK-NEXT: [0,1] D=eER. .. movw %ax, %dx
+# CHECK-NEXT: [0,2] .D=eER .. xorw %bx, %dx
+# CHECK-NEXT: [1,0] .D==eER .. addw %cx, %dx
+# CHECK-NEXT: [1,1] . D==eER .. movw %ax, %dx
+# CHECK-NEXT: [1,2] . D===eER .. xorw %bx, %dx
+# CHECK-NEXT: [2,0] . D===eER.. addw %cx, %dx
+# CHECK-NEXT: [2,1] . D====eER. movw %ax, %dx
+# CHECK-NEXT: [2,2] . D====eER xorw %bx, %dx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -76,6 +77,6 @@ xor %bx, %dx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.7 0.3 0.0 addw %cx, %dx
-# CHECK-NEXT: 1. 3 1.0 1.0 0.3 movw %ax, %dx
-# CHECK-NEXT: 2. 3 1.3 0.0 0.3 xorw %bx, %dx
+# CHECK-NEXT: 0. 3 2.7 0.3 0.0 addw %cx, %dx
+# CHECK-NEXT: 1. 3 3.3 0.0 0.0 movw %ax, %dx
+# CHECK-NEXT: 2. 3 3.7 0.0 0.0 xorw %bx, %dx
diff --git a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s
index 6277ceb5c99..cda173e5266 100644
--- a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s
+++ b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s
@@ -12,9 +12,9 @@ add %cx, %bx
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
-# CHECK-NEXT: Total Cycles: 3006
+# CHECK-NEXT: Total Cycles: 7503
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 1.50
+# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -57,18 +57,18 @@ add %cx, %bx
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addw %cx, %bx
# CHECK: Timeline view:
-# CHECK-NEXT: 01
+# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeER .. imulw %ax, %bx
-# CHECK-NEXT: [0,1] .DeE-R .. lzcntw %ax, %bx
-# CHECK-NEXT: [0,2] .D=eE-R .. addw %cx, %bx
-# CHECK-NEXT: [1,0] . D=eeeER .. imulw %ax, %bx
-# CHECK-NEXT: [1,1] . DeE--R .. lzcntw %ax, %bx
-# CHECK-NEXT: [1,2] . D=eE--R.. addw %cx, %bx
-# CHECK-NEXT: [2,0] . D=eeeER. imulw %ax, %bx
-# CHECK-NEXT: [2,1] . DeE--R. lzcntw %ax, %bx
-# CHECK-NEXT: [2,2] . D=eE--R addw %cx, %bx
+# CHECK: [0,0] DeeeER . . . imulw %ax, %bx
+# CHECK-NEXT: [0,1] .D==eER . . . lzcntw %ax, %bx
+# CHECK-NEXT: [0,2] .D===eER . . . addw %cx, %bx
+# CHECK-NEXT: [1,0] . D===eeeER . . imulw %ax, %bx
+# CHECK-NEXT: [1,1] . D=====eER . . lzcntw %ax, %bx
+# CHECK-NEXT: [1,2] . D======eER . . addw %cx, %bx
+# CHECK-NEXT: [2,0] . D======eeeER . imulw %ax, %bx
+# CHECK-NEXT: [2,1] . D========eER. lzcntw %ax, %bx
+# CHECK-NEXT: [2,2] . D=========eER addw %cx, %bx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -77,6 +77,6 @@ add %cx, %bx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.7 0.3 0.0 imulw %ax, %bx
-# CHECK-NEXT: 1. 3 1.0 1.0 1.7 lzcntw %ax, %bx
-# CHECK-NEXT: 2. 3 2.0 0.0 1.7 addw %cx, %bx
+# CHECK-NEXT: 0. 3 4.0 0.3 0.0 imulw %ax, %bx
+# CHECK-NEXT: 1. 3 6.0 0.0 0.0 lzcntw %ax, %bx
+# CHECK-NEXT: 2. 3 7.0 0.0 0.0 addw %cx, %bx
diff --git a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s
index ca1419d153e..7669621d7af 100644
--- a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s
+++ b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s
@@ -7,9 +7,9 @@ lzcnt %ax, %bx ## partial register stall.
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 1500
-# CHECK-NEXT: Total Cycles: 753
+# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 0.5
# CHECK: Instruction Info:
@@ -48,11 +48,11 @@ lzcnt %ax, %bx ## partial register stall.
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - lzcntw %ax, %bx
# CHECK: Timeline view:
-# CHECK-NEXT: Index 01234
+# CHECK-NEXT: Index 012345
-# CHECK: [0,0] DeER. lzcntw %ax, %bx
-# CHECK-NEXT: [1,0] DeER. lzcntw %ax, %bx
-# CHECK-NEXT: [2,0] .DeER lzcntw %ax, %bx
+# CHECK: [0,0] DeER . lzcntw %ax, %bx
+# CHECK-NEXT: [1,0] D=eER. lzcntw %ax, %bx
+# CHECK-NEXT: [2,0] .D=eER lzcntw %ax, %bx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -61,4 +61,4 @@ lzcnt %ax, %bx ## partial register stall.
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.0 1.0 0.0 lzcntw %ax, %bx
+# CHECK-NEXT: 0. 3 1.7 0.3 0.0 lzcntw %ax, %bx
diff --git a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s
index 3530f2191a7..939af6d06c8 100644
--- a/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s
+++ b/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s
@@ -13,9 +13,9 @@ lzcnt 2(%rsp), %cx
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
-# CHECK-NEXT: Total Cycles: 4507
+# CHECK-NEXT: Total Cycles: 7504
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -54,22 +54,22 @@ lzcnt 2(%rsp), %cx
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - 1.00 - - - - - imull %edx, %ecx
-# CHECK-NEXT: 0.99 0.01 - - - - - 1.00 - - - - - - lzcntw (%rsp), %cx
+# CHECK-NEXT: 1.00 - - - - - - 1.00 - - - - - - lzcntw (%rsp), %cx
# CHECK-NEXT: 0.50 0.50 - - - - - 1.00 - - - - - - lzcntw 2(%rsp), %cx
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeER . . imull %edx, %ecx
-# CHECK-NEXT: [0,1] .DeeeeER . . lzcntw (%rsp), %cx
-# CHECK-NEXT: [0,2] .D=eeeeER . . lzcntw 2(%rsp), %cx
-# CHECK-NEXT: [1,0] . D====eeeER . imull %edx, %ecx
-# CHECK-NEXT: [1,1] . DeeeeE--R . lzcntw (%rsp), %cx
-# CHECK-NEXT: [1,2] . D=eeeeE--R . lzcntw 2(%rsp), %cx
-# CHECK-NEXT: [2,0] . D=====eeeER. imull %edx, %ecx
-# CHECK-NEXT: [2,1] . DeeeeE---R. lzcntw (%rsp), %cx
-# CHECK-NEXT: [2,2] . D=eeeeE---R lzcntw 2(%rsp), %cx
+# CHECK: [0,0] DeeeER . . . imull %edx, %ecx
+# CHECK-NEXT: [0,1] .DeeeeER . . . lzcntw (%rsp), %cx
+# CHECK-NEXT: [0,2] .D=eeeeER . . . lzcntw 2(%rsp), %cx
+# CHECK-NEXT: [1,0] . D====eeeER . . imull %edx, %ecx
+# CHECK-NEXT: [1,1] . D===eeeeER . . lzcntw (%rsp), %cx
+# CHECK-NEXT: [1,2] . D====eeeeER . . lzcntw 2(%rsp), %cx
+# CHECK-NEXT: [2,0] . D=======eeeER . imull %edx, %ecx
+# CHECK-NEXT: [2,1] . D======eeeeER. lzcntw (%rsp), %cx
+# CHECK-NEXT: [2,2] . D=======eeeeER lzcntw 2(%rsp), %cx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -78,6 +78,6 @@ lzcnt 2(%rsp), %cx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 4.0 0.3 0.0 imull %edx, %ecx
-# CHECK-NEXT: 1. 3 1.0 1.0 1.7 lzcntw (%rsp), %cx
-# CHECK-NEXT: 2. 3 2.0 2.0 1.7 lzcntw 2(%rsp), %cx
+# CHECK-NEXT: 0. 3 4.7 0.3 0.0 imull %edx, %ecx
+# CHECK-NEXT: 1. 3 4.0 0.3 0.0 lzcntw (%rsp), %cx
+# CHECK-NEXT: 2. 3 5.0 0.0 0.0 lzcntw 2(%rsp), %cx
diff --git a/tools/llvm-mca/Instruction.cpp b/tools/llvm-mca/Instruction.cpp
index 53f0d67bf77..0c847670557 100644
--- a/tools/llvm-mca/Instruction.cpp
+++ b/tools/llvm-mca/Instruction.cpp
@@ -132,7 +132,22 @@ void Instruction::execute() {
void Instruction::update() {
assert(isDispatched() && "Unexpected instruction stage found!");
- if (llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+
+ if (!llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+ return;
+
+ // A partial register write cannot complete before a dependent write.
+ auto IsDefReady = [&](const UniqueDef &Def) {
+ if (const WriteState *Write = Def->getDependentWrite()) {
+ int WriteLatency = Write->getCyclesLeft();
+ if (WriteLatency == UNKNOWN_CYCLES)
+ return false;
+ return static_cast<unsigned>(WriteLatency) < Desc.MaxLatency;
+ }
+ return true;
+ };
+
+ if (llvm::all_of(Defs, IsDefReady))
Stage = IS_READY;
}
@@ -141,14 +156,10 @@ void Instruction::cycleEvent() {
return;
if (isDispatched()) {
- bool IsReady = true;
- for (UniqueUse &Use : Uses) {
+ for (UniqueUse &Use : Uses)
Use->cycleEvent();
- IsReady &= Use->isReady();
- }
- if (IsReady)
- Stage = IS_READY;
+ update();
return;
}
diff --git a/tools/llvm-mca/Instruction.h b/tools/llvm-mca/Instruction.h
index 3588fb0ba60..ddf5c3a5e33 100644
--- a/tools/llvm-mca/Instruction.h
+++ b/tools/llvm-mca/Instruction.h
@@ -101,6 +101,12 @@ class WriteState {
// super-registers.
bool ClearsSuperRegs;
+ // This field is set if this is a partial register write, and it has a false
+ // dependency on any previous write of the same register (or a portion of it).
+ // DependentWrite must be able to complete before this write completes, so
+ // that we don't break the WAW, and the two writes can be merged together.
+ const WriteState *DependentWrite;
+
// A list of dependent reads. Users is a set of dependent
// reads. A dependent read is added to the set only if CyclesLeft
// is "unknown". As soon as CyclesLeft is 'known', each user in the set
@@ -113,7 +119,7 @@ public:
WriteState(const WriteDescriptor &Desc, unsigned RegID,
bool clearsSuperRegs = false)
: WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
- ClearsSuperRegs(clearsSuperRegs) {}
+ ClearsSuperRegs(clearsSuperRegs), DependentWrite(nullptr) {}
WriteState(const WriteState &Other) = delete;
WriteState &operator=(const WriteState &Other) = delete;
@@ -126,6 +132,9 @@ public:
unsigned getNumUsers() const { return Users.size(); }
bool clearsSuperRegisters() const { return ClearsSuperRegs; }
+ const WriteState *getDependentWrite() const { return DependentWrite; }
+ void setDependentWrite(const WriteState *Write) { DependentWrite = Write; }
+
// On every cycle, update CyclesLeft and notify dependent users.
void cycleEvent();
void onInstructionIssued();
@@ -315,6 +324,7 @@ public:
const VecUses &getUses() const { return Uses; }
const InstrDesc &getDesc() const { return Desc; }
unsigned getRCUTokenID() const { return RCUTokenID; }
+ int getCyclesLeft() const { return CyclesLeft; }
unsigned getNumUsers() const {
unsigned NumUsers = 0;
diff --git a/tools/llvm-mca/RegisterFile.cpp b/tools/llvm-mca/RegisterFile.cpp
index 63fe0d2d72e..44de105b899 100644
--- a/tools/llvm-mca/RegisterFile.cpp
+++ b/tools/llvm-mca/RegisterFile.cpp
@@ -26,7 +26,8 @@ namespace mca {
RegisterFile::RegisterFile(const llvm::MCSchedModel &SM,
const llvm::MCRegisterInfo &mri, unsigned NumRegs)
- : MRI(mri), RegisterMappings(mri.getNumRegs(), {WriteRef(), {0, 0}}) {
+ : MRI(mri), RegisterMappings(mri.getNumRegs(),
+ {WriteRef(), {IndexPlusCostPairTy(0, 1), 0}}) {
initialize(SM, NumRegs);
}
@@ -71,34 +72,46 @@ void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
// Special case where there is no register class identifier in the set.
// An empty set of register classes means: this register file contains all
// the physical registers specified by the target.
- if (Entries.empty()) {
- for (std::pair<WriteRef, IndexPlusCostPairTy> &Mapping : RegisterMappings)
- Mapping.second = std::make_pair(RegisterFileIndex, 1U);
+ // We optimistically assume that a register can be renamed at the cost of a
+ // single physical register. The constructor of RegisterFile ensures that
+ // a RegisterMapping exists for each logical register defined by the Target.
+ if (Entries.empty())
return;
- }
// Now update the cost of individual registers.
for (const MCRegisterCostEntry &RCE : Entries) {
const MCRegisterClass &RC = MRI.getRegClass(RCE.RegisterClassID);
for (const MCPhysReg Reg : RC) {
- IndexPlusCostPairTy &Entry = RegisterMappings[Reg].second;
- if (Entry.first) {
+ RegisterRenamingInfo &Entry = RegisterMappings[Reg].second;
+ IndexPlusCostPairTy &IPC = Entry.IndexPlusCost;
+ if (IPC.first && IPC.first != RegisterFileIndex) {
// The only register file that is allowed to overlap is the default
// register file at index #0. The analysis is inaccurate if register
// files overlap.
errs() << "warning: register " << MRI.getName(Reg)
<< " defined in multiple register files.";
}
- Entry.first = RegisterFileIndex;
- Entry.second = RCE.Cost;
+ IPC = std::make_pair(RegisterFileIndex, RCE.Cost);
+ Entry.RenameAs = Reg;
+
+ // Assume the same cost for each sub-register.
+ for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) {
+ RegisterRenamingInfo &OtherEntry = RegisterMappings[*I].second;
+ if (!OtherEntry.IndexPlusCost.first &&
+ (!OtherEntry.RenameAs ||
+ MRI.isSuperRegister(*I, OtherEntry.RenameAs))) {
+ OtherEntry.IndexPlusCost = IPC;
+ OtherEntry.RenameAs = Reg;
+ }
+ }
}
}
}
-void RegisterFile::allocatePhysRegs(IndexPlusCostPairTy Entry,
+void RegisterFile::allocatePhysRegs(const RegisterRenamingInfo &Entry,
MutableArrayRef<unsigned> UsedPhysRegs) {
- unsigned RegisterFileIndex = Entry.first;
- unsigned Cost = Entry.second;
+ unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+ unsigned Cost = Entry.IndexPlusCost.second;
if (RegisterFileIndex) {
RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
RMT.NumUsedPhysRegs += Cost;
@@ -110,10 +123,10 @@ void RegisterFile::allocatePhysRegs(IndexPlusCostPairTy Entry,
UsedPhysRegs[0] += Cost;
}
-void RegisterFile::freePhysRegs(IndexPlusCostPairTy Entry,
+void RegisterFile::freePhysRegs(const RegisterRenamingInfo &Entry,
MutableArrayRef<unsigned> FreedPhysRegs) {
- unsigned RegisterFileIndex = Entry.first;
- unsigned Cost = Entry.second;
+ unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+ unsigned Cost = Entry.IndexPlusCost.second;
if (RegisterFileIndex) {
RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
RMT.NumUsedPhysRegs -= Cost;
@@ -128,12 +141,48 @@ void RegisterFile::freePhysRegs(IndexPlusCostPairTy Entry,
void RegisterFile::addRegisterWrite(WriteRef Write,
MutableArrayRef<unsigned> UsedPhysRegs,
bool ShouldAllocatePhysRegs) {
- const WriteState &WS = *Write.getWriteState();
+ WriteState &WS = *Write.getWriteState();
unsigned RegID = WS.getRegisterID();
assert(RegID && "Adding an invalid register definition?");
- RegisterMapping &Mapping = RegisterMappings[RegID];
- Mapping.first = Write;
+ LLVM_DEBUG({
+ dbgs() << "RegisterFile: addRegisterWrite [ " << Write.getSourceIndex()
+ << ", " << MRI.getName(RegID) << "]\n";
+ });
+
+ // If RenameAs is equal to RegID, then RegID is subject to register renaming
+ // and false dependencies on RegID are all eliminated.
+
+ // If RenameAs references the invalid register, then we optimistically assume
+ // that it can be renamed. In the absence of tablegen descriptors for register
+ // files, RenameAs is always set to the invalid register ID. In all other
+ // cases, RenameAs must be either equal to RegID, or it must reference a
+ // super-register of RegID.
+
+ // If RenameAs is a super-register of RegID, then a write to RegID has always
+ // a false dependency on RenameAs. The only exception is for when the write
+ // implicitly clears the upper portion of the underlying register.
+ // If a write clears its super-registers, then it is renamed as `RenameAs`.
+ const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+ if (RRI.RenameAs && RRI.RenameAs != RegID) {
+ RegID = RRI.RenameAs;
+ const WriteRef &OtherWrite = RegisterMappings[RegID].first;
+
+ if (!WS.clearsSuperRegisters()) {
+ // The processor keeps the definition of `RegID` together with register
+ // `RenameAs`. Since this partial write is not renamed, no physical
+ // register is allocated.
+ ShouldAllocatePhysRegs = false;
+
+ if (OtherWrite.getSourceIndex() != Write.getSourceIndex()) {
+ // This partial write has a false dependency on RenameAs.
+ WS.setDependentWrite(OtherWrite.getWriteState());
+ }
+ }
+ }
+
+ // Update the mapping for register RegID including its sub-registers.
+ RegisterMappings[RegID].first = Write;
for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
RegisterMappings[*I].first = Write;
@@ -141,9 +190,8 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
// hardware. For example, zero-latency data-dependency breaking instructions
// don't consume physical registers.
if (ShouldAllocatePhysRegs)
- allocatePhysRegs(Mapping.second, UsedPhysRegs);
+ allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
- // If this is a partial update, then we are done.
if (!WS.clearsSuperRegisters())
return;
@@ -155,42 +203,50 @@ void RegisterFile::removeRegisterWrite(const WriteState &WS,
MutableArrayRef<unsigned> FreedPhysRegs,
bool ShouldFreePhysRegs) {
unsigned RegID = WS.getRegisterID();
- bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters();
assert(RegID != 0 && "Invalidating an already invalid register?");
- assert(WS.getCyclesLeft() != -512 &&
+ assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
"Invalidating a write of unknown cycles!");
assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
- RegisterMapping &Mapping = RegisterMappings[RegID];
- WriteRef &WR = Mapping.first;
- if (!WR.isValid())
- return;
+
+ unsigned RenameAs = RegisterMappings[RegID].second.RenameAs;
+ if (RenameAs && RenameAs != RegID) {
+ RegID = RenameAs;
+
+ if (!WS.clearsSuperRegisters()) {
+ // Keep the definition of `RegID` together with register `RenameAs`.
+ ShouldFreePhysRegs = false;
+ }
+ }
if (ShouldFreePhysRegs)
- freePhysRegs(Mapping.second, FreedPhysRegs);
+ freePhysRegs(RegisterMappings[RegID].second, FreedPhysRegs);
+ WriteRef &WR = RegisterMappings[RegID].first;
if (WR.getWriteState() == &WS)
WR.invalidate();
for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
- WR = RegisterMappings[*I].first;
- if (WR.getWriteState() == &WS)
- WR.invalidate();
+ WriteRef &OtherWR = RegisterMappings[*I].first;
+ if (OtherWR.getWriteState() == &WS)
+ OtherWR.invalidate();
}
- if (!ShouldInvalidateSuperRegs)
+ if (!WS.clearsSuperRegisters())
return;
for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
- WR = RegisterMappings[*I].first;
- if (WR.getWriteState() == &WS)
- WR.invalidate();
+ WriteRef &OtherWR = RegisterMappings[*I].first;
+ if (OtherWR.getWriteState() == &WS)
+ OtherWR.invalidate();
}
}
void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
unsigned RegID) const {
assert(RegID && RegID < RegisterMappings.size());
+ LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
+ << MRI.getName(RegID) << '\n');
const WriteRef &WR = RegisterMappings[RegID].first;
if (WR.isValid())
Writes.push_back(WR);
@@ -225,7 +281,8 @@ unsigned RegisterFile::isAvailable(ArrayRef<unsigned> Regs) const {
// Find how many new mappings must be created for each register file.
for (const unsigned RegID : Regs) {
- const IndexPlusCostPairTy &Entry = RegisterMappings[RegID].second;
+ const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+ const IndexPlusCostPairTy &Entry = RRI.IndexPlusCost;
if (Entry.first)
NumPhysRegs[Entry.first] += Entry.second;
NumPhysRegs[0] += Entry.second;
@@ -266,10 +323,10 @@ void RegisterFile::dump() const {
const RegisterMapping &RM = RegisterMappings[I];
if (!RM.first.getWriteState())
continue;
- const std::pair<unsigned, unsigned> &IndexPlusCost = RM.second;
- dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << IndexPlusCost.first
- << ", Cost=" << IndexPlusCost.second
- << ", ";
+ const RegisterRenamingInfo &RRI = RM.second;
+ dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << RRI.IndexPlusCost.first
+ << ", Cost=" << RRI.IndexPlusCost.second
+ << ", RenameAs=" << RRI.RenameAs << ", ";
RM.first.dump();
dbgs() << '\n';
}
diff --git a/tools/llvm-mca/RegisterFile.h b/tools/llvm-mca/RegisterFile.h
index 8817b8ba60e..349e9789b6e 100644
--- a/tools/llvm-mca/RegisterFile.h
+++ b/tools/llvm-mca/RegisterFile.h
@@ -61,24 +61,35 @@ class RegisterFile : public HardwareUnit {
// regsiter file #0 specifying command line flag `-register-file-size=<uint>`.
llvm::SmallVector<RegisterMappingTracker, 4> RegisterFiles;
- // This pair is used to identify the owner of a register, as well as
- // the "register cost". Register cost is defined as the number of physical
- // registers required to allocate a user register.
+ // This type is used to propagate information about the owner of a register,
+ // and the cost of allocating it in the PRF. Register cost is defined as the
+ // number of physical registers consumed by the PRF to allocate a user
+ // register.
+ //
// For example: on X86 BtVer2, a YMM register consumes 2 128-bit physical
// registers. So, the cost of allocating a YMM register in BtVer2 is 2.
using IndexPlusCostPairTy = std::pair<unsigned, unsigned>;
+ // Struct RegisterRenamingInfo maps registers to register files.
+ // There is a RegisterRenamingInfo object for every register defined by
+ // the target. RegisteRenamingInfo objects are stored into vector
+ // RegisterMappings, and register IDs can be used to reference them.
+ struct RegisterRenamingInfo {
+ IndexPlusCostPairTy IndexPlusCost;
+ llvm::MCPhysReg RenameAs;
+ };
+
// RegisterMapping objects are mainly used to track physical register
// definitions. There is a RegisterMapping for every register defined by the
// Target. For each register, a RegisterMapping pair contains a descriptor of
// the last register write (in the form of a WriteRef object), as well as a
- // IndexPlusCostPairTy to quickly identify owning register files.
+ // RegisterRenamingInfo to quickly identify owning register files.
//
// This implementation does not allow overlapping register files. The only
// register file that is allowed to overlap with other register files is
// register file #0. If we exclude register #0, every register is "owned" by
// at most one register file.
- using RegisterMapping = std::pair<WriteRef, IndexPlusCostPairTy>;
+ using RegisterMapping = std::pair<WriteRef, RegisterRenamingInfo>;
// This map contains one entry for each register defined by the target.
std::vector<RegisterMapping> RegisterMappings;
@@ -103,13 +114,12 @@ class RegisterFile : public HardwareUnit {
// Consumes physical registers in each register file specified by the
// `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`.
- void allocatePhysRegs(IndexPlusCostPairTy IPC,
+ void allocatePhysRegs(const RegisterRenamingInfo &Entry,
llvm::MutableArrayRef<unsigned> UsedPhysRegs);
- // Releases previously allocated physical registers from the register file(s)
- // referenced by the IndexPlusCostPairTy object. This method is called from
- // `invalidateRegisterMapping()`.
- void freePhysRegs(IndexPlusCostPairTy IPC,
+ // Releases previously allocated physical registers from the register file(s).
+ // This method is called from `invalidateRegisterMapping()`.
+ void freePhysRegs(const RegisterRenamingInfo &Entry,
llvm::MutableArrayRef<unsigned> FreedPhysRegs);
// Create an instance of RegisterMappingTracker for every register file
@@ -140,8 +150,11 @@ public:
// Returns a "response mask" where each bit represents the response from a
// different register file. A mask of all zeroes means that all register
// files are available. Otherwise, the mask can be used to identify which
- // register file was busy. This sematic allows us classify dispatch dispatch
+ // register file was busy. This sematic allows us to classify dispatch
// stalls caused by the lack of register file resources.
+ //
+ // Current implementation can simulate up to 32 register files (including the
+ // special register file at index #0).
unsigned isAvailable(llvm::ArrayRef<unsigned> Regs) const;
void collectWrites(llvm::SmallVectorImpl<WriteRef> &Writes,
unsigned RegID) const;