-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Add software pipeliner support #117546
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Pengcheng Wang (wangpc-pp) ChangesFull diff: https://github.com/llvm/llvm-project/pull/117546.diff 6 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 933e776da47404..0068991ac6e8d5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4136,3 +4136,63 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
return false;
return LHS.getImm() <= RHS.getImm();
}
+
+namespace {
+class RISCVPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+ MachineInstr *Terminator;
+ SmallVector<MachineOperand, 4> Cond;
+
+public:
+ RISCVPipelinerLoopInfo(MachineInstr *Terminator,
+ const SmallVectorImpl<MachineOperand> &Cond)
+ : Terminator(Terminator), Cond(Cond.begin(), Cond.end()) {}
+
+ bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+ // Make the instructions for loop control be placed in stage 0.
+ // The predecessors of PredBranch are considered by the caller.
+ return MI == Terminator;
+ }
+
+ std::optional<bool> createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &CondParam) override {
+ // A branch instruction will be inserted as "if (Cond) goto epilogue".
+ // Cond is normalized for such use.
+ // The predecessors of the branch are assumed to have already been inserted.
+ CondParam = Cond;
+ return {};
+ }
+
+ void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+ void adjustTripCount(int TripCountAdjust) override {}
+
+ void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+RISCVInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (analyzeBranch(*LoopBB, TBB, FBB, Cond, /*AllowModify=*/false))
+ return nullptr;
+
+ // Infinite loops are not supported
+ if (TBB == LoopBB && FBB == LoopBB)
+ return nullptr;
+
+ // Must be conditional branch
+ if (FBB == nullptr)
+ return nullptr;
+
+ assert((TBB == LoopBB || FBB == LoopBB) &&
+ "The Loop must be a single-basic-block loop");
+
+ // Normalization for createTripCountGreaterCondition()
+ if (TBB == LoopBB)
+ reverseBranchCondition(Cond);
+
+ MachineInstr *Terminator = &*LoopBB->getFirstTerminator();
+ return std::make_unique<RISCVPipelinerLoopInfo>(Terminator, Cond);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index c3aa367486627a..8e878f733c75ee 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -293,6 +293,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
unsigned getTailDuplicateSize(CodeGenOptLevel OptLevel) const override;
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
protected:
const RISCVSubtarget &STI;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 03397e1e0d89ee..4df84e9599810d 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -186,6 +186,10 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
bool RISCVSubtarget::enableSubRegLiveness() const { return true; }
+bool RISCVSubtarget::enableMachinePipeliner() const {
+ return getSchedModel().hasInstrSchedModel();
+}
+
void RISCVSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(createMacroFusionDAGMutation(getMacroFusions()));
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index f2c0a3d85c998a..a6ba5fcdc8cdd8 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -304,6 +304,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
&Mutations) const override;
+ bool enableMachinePipeliner() const override;
+
+ bool useDFAforSMS() const override { return false; }
+
bool useAA() const override;
unsigned getCacheLineSize() const override {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index fa507653264ccd..232c92743c82dc 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -114,6 +114,11 @@ static cl::opt<bool>
cl::desc("Enable the RISC-V VL Optimizer pass"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ EnableMachinePipeliner("riscv-enable-pipeliner",
+ cl::desc("Enable Machine Pipeliner for RISC-V"),
+ cl::init(false), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -600,6 +605,9 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVInsertReadWriteCSRPass());
addPass(createRISCVInsertWriteVXRMPass());
addPass(createRISCVLandingPadSetupPass());
+
+ if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner)
+ addPass(&MachinePipelinerID);
}
void RISCVPassConfig::addFastRegAlloc() {
diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
new file mode 100644
index 00000000000000..bad6e8a7abc7da
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mcpu=sifive-p670 -O3 -verify-machineinstrs -riscv-enable-pipeliner=false < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK-NOT-PIPELINED
+; RUN: llc -mtriple=riscv64 -mcpu=sifive-p670 -O3 -verify-machineinstrs -riscv-enable-pipeliner=true < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK-PIPELINED
+
+define void @test_1(ptr noalias %in, ptr noalias %out, i32 signext %cnt) "no-builtins" {
+; CHECK-NOT-PIPELINED-LABEL: test_1:
+; CHECK-NOT-PIPELINED: # %bb.0: # %entry
+; CHECK-NOT-PIPELINED-NEXT: blez a2, .LBB0_3
+; CHECK-NOT-PIPELINED-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NOT-PIPELINED-NEXT: addi a2, a2, -1
+; CHECK-NOT-PIPELINED-NEXT: sh2add.uw a2, a2, a1
+; CHECK-NOT-PIPELINED-NEXT: addi a2, a2, 4
+; CHECK-NOT-PIPELINED-NEXT: .LBB0_2: # %for.body
+; CHECK-NOT-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NOT-PIPELINED-NEXT: lw a3, 0(a1)
+; CHECK-NOT-PIPELINED-NEXT: addi a1, a1, 4
+; CHECK-NOT-PIPELINED-NEXT: sw a3, 0(a0)
+; CHECK-NOT-PIPELINED-NEXT: addi a0, a0, 4
+; CHECK-NOT-PIPELINED-NEXT: bne a1, a2, .LBB0_2
+; CHECK-NOT-PIPELINED-NEXT: .LBB0_3: # %for.end
+; CHECK-NOT-PIPELINED-NEXT: ret
+;
+; CHECK-PIPELINED-LABEL: test_1:
+; CHECK-PIPELINED: # %bb.0: # %entry
+; CHECK-PIPELINED-NEXT: blez a2, .LBB0_6
+; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-PIPELINED-NEXT: lw a3, 0(a1)
+; CHECK-PIPELINED-NEXT: addi a2, a2, -1
+; CHECK-PIPELINED-NEXT: addi a4, a0, 4
+; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1
+; CHECK-PIPELINED-NEXT: addi a1, a1, 4
+; CHECK-PIPELINED-NEXT: addi a6, a6, 4
+; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB0_5
+; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body
+; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
+; CHECK-PIPELINED-NEXT: addi a2, a4, 4
+; CHECK-PIPELINED-NEXT: addi a1, a1, 4
+; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB0_4
+; CHECK-PIPELINED-NEXT: .LBB0_3: # %for.body
+; CHECK-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-PIPELINED-NEXT: sw a3, 0(a0)
+; CHECK-PIPELINED-NEXT: mv a3, a5
+; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
+; CHECK-PIPELINED-NEXT: mv a0, a4
+; CHECK-PIPELINED-NEXT: mv a4, a2
+; CHECK-PIPELINED-NEXT: addi a2, a2, 4
+; CHECK-PIPELINED-NEXT: addi a1, a1, 4
+; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB0_3
+; CHECK-PIPELINED-NEXT: .LBB0_4:
+; CHECK-PIPELINED-NEXT: sw a3, 0(a0)
+; CHECK-PIPELINED-NEXT: mv a0, a4
+; CHECK-PIPELINED-NEXT: mv a3, a5
+; CHECK-PIPELINED-NEXT: .LBB0_5:
+; CHECK-PIPELINED-NEXT: sw a3, 0(a0)
+; CHECK-PIPELINED-NEXT: .LBB0_6: # %for.end
+; CHECK-PIPELINED-NEXT: ret
+entry:
+ %cmp5 = icmp sgt i32 %cnt, 0
+ br i1 %cmp5, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %in.addr.07 = phi ptr [ %incdec.ptr, %for.body ], [ %in, %entry ]
+ %out.addr.06 = phi ptr [ %incdec.ptr1, %for.body ], [ %out, %entry ]
+ %0 = load i32, ptr %out.addr.06, align 4
+ store i32 %0, ptr %in.addr.07, align 4
+ %incdec.ptr = getelementptr inbounds i8, ptr %in.addr.07, i64 4
+ %incdec.ptr1 = getelementptr inbounds i8, ptr %out.addr.06, i64 4
+ %inc = add nuw nsw i32 %i.08, 1
+ %exitcond.not = icmp eq i32 %inc, %cnt
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
|
4dc3cb2
to
a0b5e20
Compare
a0b5e20
to
15ab569
Compare
Out of curiosity, have you seen any benefits of adding and enabling this support? |
I haven't tried current implementation, but we did implement it and saw very small performance gain on an OoO core before (that was about 3 years ago, on LLVM 13). |
|
||
const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); | ||
auto FindRegDef = [&MRI](MachineOperand &Op) -> const MachineInstr * { | ||
if (!Op.isReg()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When would isReg()
be false?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One operand may be imm for BEQIMM/BNEIMM. And we may have standard branches with immediate soon.
bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { | ||
// Make the instructions for loop control be placed in stage 0. | ||
// The predecessors of LHS/RHS are considered by the caller. | ||
if (LHS && MI == LHS) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we want to ignore LHS and RHS? AArch64 only ignores the compare instruction, but on RISC-V the branch is also the compare instruction. So LHS and RHS are inputs to the compare which AArch64 doesn't ignore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is the same, we just need a root SUnit and SMSchedule::computeUnpipelineableNodes
will add all its predecessors.
llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp
Lines 3172 to 3196 in f947d5a
/// Determine transitive dependences of unpipelineable instructions | |
SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes( | |
SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { | |
SmallSet<SUnit *, 8> DoNotPipeline; | |
SmallVector<SUnit *, 8> Worklist; | |
for (auto &SU : SSD->SUnits) | |
if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr())) | |
Worklist.push_back(&SU); | |
while (!Worklist.empty()) { | |
auto SU = Worklist.pop_back_val(); | |
if (DoNotPipeline.count(SU)) | |
continue; | |
LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n"); | |
DoNotPipeline.insert(SU); | |
for (auto &Dep : SU->Preds) | |
Worklist.push_back(Dep.getSUnit()); | |
if (SU->getInstr()->isPHI()) | |
for (auto &Dep : SU->Succs) | |
if (Dep.getKind() == SDep::Anti) | |
Worklist.push_back(Dep.getSUnit()); | |
} | |
return DoNotPipeline; | |
} |
So for AArch64, the LHS/RHS of the compare instruction will also be ignored, but not explicitly be ignored in shouldIgnoreForPipelining
.
In llvm#83875, we changed the type of `Width` to `LocationSize`. To get the cluster bytes, we use `LocationSize::getValue()` to calculate the value. But when `Width` is an unknown size `LocationSize`, an assertion "Getting value from an unknown LocationSize!" will be triggered. This patch simply skips MemOp with unknown size to fix this issue and keep the logic the same as before. This issue was found when implementing the software pipeliner for RISC-V in llvm#117546. The pipeliner may clone some memory operations with `BeforeOrAfterPointer` size.
Accidentally hit the Close with comment button, my bad |
I'm a little surprised to find that this might benefit out-of-order cores -- I thought in-order cores would most likely be the ones that gain improvements from this, if there is any. Theoretically, OoO cores can already look into future loop iterations, so overlapping several iterations via software pipelining doesn't always make a difference. That said, the size of hardware scheduler buffers might be a limiting factor (on how far the HW can look ahead) and maybe that's why AArch64 still sees performance improvement. But I would like to point out that they also saw "significant performance degradation" in some cases without specifying the quantities nor showing the average percentage of performance changes across all benchmarks. So personally I'll be more cautious on this and I agree that we should turn this feature off by default. |
2ba23cd
to
fab23b5
Compare
In #83875, we changed the type of `Width` to `LocationSize`. To get the cluster bytes, we use `LocationSize::getValue()` to calculate the value. But when `Width` is an unknown size `LocationSize`, an assertion "Getting value from an unknown LocationSize!" will be triggered. This patch simply skips MemOp with unknown size to fix this issue and keep the logic the same as before. This issue was found when implementing the software pipeliner for RISC-V in #117546. The pipeliner may clone some memory operations with `BeforeOrAfterPointer` size.
Hi. I am working on First, we don't think it's worthwhile for all loops. As @mshockwave mentioned, OoO is sufficient in many cases. However, there are cases where it is not enough due to HW resource limitations, for example, a loop with a long dependence chain. Our ultimate goal is to apply software pipelining only to such loops and we are now considering introducing a pragma directive to enable software pipelining for specific loops. Therefore, I don't know much about RISCV, but I agree that it should be disabled by default. Also, there are some correctness issues in Finally, in our experience, Modulo Variable Expansion (#65609) tends to generate efficient code, especially when we cannot ignore the cost of register-to-register instructions, which are generated by the default modulo scheduler. We have to implement some virtual functions for |
Thanks! @kasuga-fj
Is this PR right? It refers to a PR of AMDGPU target. :-)
That's also my plan to make it disabled by default. :-)
Well done! I will have a look later!
Yeah, I noticed that before when I read the implementation of AArch64. I had a local branch for that and may post it when it is good enough. Thanks again! |
Sorry, I made a mistake. #79589 is correct one.
Thanks for your interest! We are not original committers of |
@mshockwave Do you agree to merge this first? |
In llvm#83875, we changed the type of `Width` to `LocationSize`. To get the cluster bytes, we use `LocationSize::getValue()` to calculate the value. But when `Width` is an unknown size `LocationSize`, an assertion "Getting value from an unknown LocationSize!" will be triggered. This patch simply skips MemOp with unknown size to fix this issue and keep the logic the same as before. This issue was found when implementing the software pipeliner for RISC-V in llvm#117546. The pipeliner may clone some memory operations with `BeforeOrAfterPointer` size.
Yes, I'm fine with that. |
This patch adds basic support of `MachinePipeliner` and disable it by default. The functionality should be OK and all llvm-test-suite tests have passed.
fab23b5
to
62e49e7
Compare
This patch adds basic support of
MachinePipeliner
and disableit by default.
The functionality should be OK and all llvm-test-suite tests have
passed.