Skip to content

Commit

Permalink
[AMDGPU] Add off-by-default flag to control LiveRegOpt
Browse files Browse the repository at this point in the history
Change-Id: Id939bf74b48b47e5ee2b432956e476fac80e3375
(cherry picked from commit 5b1a599)
  • Loading branch information
jrbyrnes authored and bcahoon committed Nov 5, 2024
1 parent fa5860c commit a565d43
Show file tree
Hide file tree
Showing 16 changed files with 4,160 additions and 411 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
for (auto &BB : reverse(F))
for (Instruction &I : make_early_inc_range(reverse(BB))) {
Changed |= !HasScalarSubwordLoads && visit(I);
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
if (ST.shouldCoerceIllegalTypes())
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}

RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
cl::desc("Number of addresses from which to enable MIMG NSA."),
cl::init(3), cl::Hidden);

// Hidden command-line switch "amdgpu-coerce-illegal-types"; defaults to false.
// The GCNSubtarget constructor copies its value into ShouldCoerceIllegalTypes,
// which is exposed through GCNSubtarget::shouldCoerceIllegalTypes().
static cl::opt<bool>
CoerceIllegal("amdgpu-coerce-illegal-types",
cl::desc("Whether or not to coerce illegal types"),
cl::ReallyHidden, cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
Expand Down Expand Up @@ -199,6 +204,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
InstSelector.reset(new AMDGPUInstructionSelector(
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));

ShouldCoerceIllegalTypes = CoerceIllegal;
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
Expand Down
14 changes: 8 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,8 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
// queries (e.g. get*InstrCost) to decide the proper handling
// of 8 bit vectors.
if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
if (ST->shouldCoerceIllegalTypes() &&
DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
unsigned ElCount = VTy->getElementCount().getFixedValue();
return PowerOf2Ceil(ElCount / 4);
}
Expand Down Expand Up @@ -355,10 +356,10 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;

return (ElemWidth == 8) ? 4
: (ElemWidth == 16) ? 2
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
: 1;
return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
: (ElemWidth == 16) ? 2
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
: 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
Expand Down Expand Up @@ -1154,7 +1155,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
(ScalarSize == 16 || ScalarSize == 8)) {
(ScalarSize == 16 ||
(ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;

// Cached value of the amdgpu-coerce-illegal-types option; assigned in the
// GCNSubtarget constructor and read via shouldCoerceIllegalTypes().
bool ShouldCoerceIllegalTypes = false;

SelectionDAGTargetInfo TSInfo;
private:
SIInstrInfo InstrInfo;
Expand Down Expand Up @@ -1305,6 +1307,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// of sign-extending.
bool hasGetPCZeroExtension() const { return GFX12Insts; }

/// \returns whether or not we should coerce illegal types into vectors of
/// legal types for values that span basic blocks.
bool shouldCoerceIllegalTypes() const { return ShouldCoerceIllegalTypes; }

/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
Expand Down
184 changes: 92 additions & 92 deletions llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Large diffs are not rendered by default.

39 changes: 19 additions & 20 deletions llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,29 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp0_b32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb10
; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8
; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8
; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8
; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9
; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8
; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9
; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_mov_b32_e32 v7, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: .LBB0_3: ; %bb41
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48
; CHECK-NEXT: v_mov_b32_e32 v8, s10
Expand All @@ -48,16 +47,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK-NEXT: v_mov_b32_e32 v19, s21
; CHECK-NEXT: v_mov_b32_e32 v20, s22
; CHECK-NEXT: v_mov_b32_e32 v21, s23
; CHECK-NEXT: flat_store_byte v[8:9], v7
; CHECK-NEXT: flat_store_byte v[10:11], v6
; CHECK-NEXT: flat_store_byte v[12:13], v5
; CHECK-NEXT: flat_store_byte v[14:15], v4
; CHECK-NEXT: flat_store_byte v[16:17], v3
; CHECK-NEXT: flat_store_byte v[18:19], v2
; CHECK-NEXT: flat_store_byte v[20:21], v1
; CHECK-NEXT: flat_store_byte v[8:9], v0
; CHECK-NEXT: flat_store_byte v[10:11], v7
; CHECK-NEXT: flat_store_byte v[12:13], v6
; CHECK-NEXT: flat_store_byte v[14:15], v5
; CHECK-NEXT: flat_store_byte v[16:17], v1
; CHECK-NEXT: flat_store_byte v[18:19], v4
; CHECK-NEXT: flat_store_byte v[20:21], v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; CHECK-NEXT: flat_store_byte v[2:3], v0
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; CHECK-NEXT: flat_store_byte v[0:1], v2
; CHECK-NEXT: s_endpgm
bb:
br i1 %arg, label %bb10, label %bb41
Expand Down
Loading

0 comments on commit a565d43

Please sign in to comment.