Skip to content

Commit

Permalink
CHI-CoupledL2: optimize timing (#191)
Browse files Browse the repository at this point in the history
* CoupledL2:timing fix

* MSHRCtl: fix counter for performance

* Revert "BOP: add clock gating (#169)"

This reverts commit 2199272.

* MSHRCtl: Change to RoundRobin to fix commit timeout

---------

Co-authored-by: Yanqin Li <[email protected]>
  • Loading branch information
yulightenyu and Maxpicca-Li authored Jul 1, 2024
1 parent 7c475a9 commit d9b702d
Show file tree
Hide file tree
Showing 8 changed files with 87 additions and 42 deletions.
10 changes: 5 additions & 5 deletions src/main/scala/coupledL2/RequestArb.scala
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,19 @@ class RequestArb(implicit p: Parameters) extends L2Module {
(if (io.fromTXRSP.isDefined) io.fromTXRSP.get.blockSinkBReqEntrance else false.B)
val block_C = io.fromMSHRCtl.blockC_s1 || io.fromMainPipe.blockC_s1 || io.fromGrantBuffer.blockSinkReqEntrance.blockC_s1

val noFreeWay = Wire(Bool())
// val noFreeWay = Wire(Bool())

val sinkValids = VecInit(Seq(
io.sinkC.valid && !block_C,
io.sinkB.valid && !block_B,
io.sinkA.valid && !block_A && !noFreeWay
io.sinkA.valid && !block_A
)).asUInt

// TODO: A Hint is allowed to enter if !s2_ready for mcp2_stall

val sink_ready_basic = io.dirRead_s1.ready && resetFinish && !mshr_task_s1.valid && s2_ready

io.sinkA.ready := sink_ready_basic && !block_A && !sinkValids(1) && !sinkValids(0) && !noFreeWay // SinkC prior to SinkA & SinkB
io.sinkA.ready := sink_ready_basic && !block_A && !sinkValids(1) && !sinkValids(0) // SinkC prior to SinkA & SinkB
io.sinkB.ready := sink_ready_basic && !block_B && !sinkValids(0) // SinkB prior to SinkA
io.sinkC.ready := sink_ready_basic && !block_C

Expand Down Expand Up @@ -195,13 +195,13 @@ class RequestArb(implicit p: Parameters) extends L2Module {
task_s2.valid := s1_fire
when(s1_fire) { task_s2.bits := task_s1.bits }

val sameSet_s2 = task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask && task_s2.bits.set === A_task.set
/* val sameSet_s2 = task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask && task_s2.bits.set === A_task.set
val sameSet_s3 = RegNext(task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask) &&
RegEnable(task_s2.bits.set, task_s2.valid) === A_task.set
val sameSetCnt = PopCount(VecInit(io.msInfo.map(s => s.valid && s.bits.set === A_task.set && s.bits.fromA) :+
sameSet_s2 :+ sameSet_s3).asUInt)
noFreeWay := sameSetCnt >= cacheParams.ways.U

*/
io.taskToPipe_s2 := task_s2

// MSHR task
Expand Down
20 changes: 17 additions & 3 deletions src/main/scala/coupledL2/RequestBuffer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
val mshrInfo = Vec(mshrsAll, Flipped(ValidIO(new MSHRInfo)))
val aMergeTask = ValidIO(new AMergeTask)
val mainPipeBlock = Input(Vec(2, Bool()))
/* Snoop task from arbiter at stage 2 */
val taskFromArb_s2 = Flipped(ValidIO(new TaskBundle()))

val ATag = Output(UInt(tagBits.W))
val ASet = Output(UInt(setBits.W))
Expand Down Expand Up @@ -143,8 +145,20 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
io.aMergeTask.bits.id := mergeAId
io.aMergeTask.bits.task := in

/*
noFreeWay check: s2 + s3 + mshrs >= ways(L2)
*/
val task_s2 = io.taskFromArb_s2
val sameSet_s2 = task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask && task_s2.bits.set === io.ASet
val sameSet_s3 = RegNext(task_s2.valid && task_s2.bits.fromA && !task_s2.bits.mshrTask) &&
RegEnable(task_s2.bits.set, task_s2.valid) === io.ASet
val sameSetCnt = PopCount(VecInit(io.mshrInfo.map(s => s.valid && s.bits.set === io.ASet && s.bits.fromA) :+
sameSet_s2 :+ sameSet_s3).asUInt)
val noFreeWay = sameSetCnt >= cacheParams.ways.U
dontTouch (noFreeWay)

// flow not allowed when full, or entries might starve
val canFlow = flow.B && !full && !conflict(in) && !chosenQValid && !Cat(io.mainPipeBlock).orR
val canFlow = flow.B && !full && !conflict(in) && !chosenQValid && !Cat(io.mainPipeBlock).orR && !noFreeWay
val doFlow = canFlow && io.out.ready
io.hasLatePF := latePrefetch(in) && io.in.valid && !sameAddr(in, RegNext(in))
io.hasMergeA := mergeA && io.in.valid && !sameAddr(in, RegNext(in))
Expand Down Expand Up @@ -177,7 +191,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete

entry.valid := true.B
// when Addr-Conflict / Same-Addr-Dependent / MainPipe-Block / noFreeWay-in-Set, entry not ready
entry.rdy := !conflict(in) && !mpBlock && !s1Block // && !Cat(depMask).orR
entry.rdy := !conflict(in) && !mpBlock && !s1Block && !noFreeWay// && !Cat(depMask).orR
entry.task := io.in.bits
entry.waitMP := Cat(
s1Block,
Expand Down Expand Up @@ -247,7 +261,7 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
// update info
e.waitMS := waitMSUpdate
// e.depMask := depMaskUpdate
e.rdy := !waitMSUpdate.orR && !e.waitMP && !s1_Block
e.rdy := !waitMSUpdate.orR && !e.waitMP && !s1_Block && !noFreeWay
}
}

Expand Down
10 changes: 5 additions & 5 deletions src/main/scala/coupledL2/prefetch/BestOffsetPrefetch.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

package coupledL2.prefetch

import utility.{GatedValidRegNext, ChiselDB, Constantin, MemReqSource, ParallelPriorityMux, RRArbiterInit, SRAMTemplate}
import utility.{ChiselDB, Constantin, MemReqSource, ParallelPriorityMux, RRArbiterInit, SRAMTemplate}
import org.chipsalliance.cde.config.Parameters
import chisel3.DontCare.:=
import chisel3._
Expand Down Expand Up @@ -177,13 +177,13 @@ class RecentRequestTable(implicit p: Parameters) extends BOPModule {
rrTable.io.r.req.bits.setIdx := idx(rAddr)
rData := rrTable.io.r.resp.data(0)

assert(!GatedValidRegNext(io.w.fire && io.r.req.fire), "single port SRAM should not read and write at the same time")
assert(!RegNext(io.w.fire && io.r.req.fire), "single port SRAM should not read and write at the same time")

io.w.ready := rrTable.io.w.req.ready && !io.r.req.valid
io.r.req.ready := true.B
io.r.resp.valid := GatedValidRegNext(rrTable.io.r.req.fire, false.B)
io.r.resp.bits.ptr := RegEnable(io.r.req.bits.ptr, rrTable.io.r.req.fire)
io.r.resp.bits.hit := rData.valid && rData.tag === RegEnable(tag(rAddr), rrTable.io.r.req.fire)
io.r.resp.valid := RegNext(rrTable.io.r.req.fire, false.B)
io.r.resp.bits.ptr := RegNext(io.r.req.bits.ptr)
io.r.resp.bits.hit := rData.valid && rData.tag === RegNext(tag(rAddr))

}

Expand Down
73 changes: 49 additions & 24 deletions src/main/scala/coupledL2/tl2chi/MSHRCtl.scala
Original file line number Diff line number Diff line change
Expand Up @@ -120,46 +120,68 @@ class MSHRCtl(implicit p: Parameters) extends TL2CHIL2Module {
mshrSelector.io.idle := mshrs.map(m => !m.io.status.valid)
io.toMainPipe.mshr_alloc_ptr := OHToUInt(selectedMSHROH)

/*
The rxrsp path used for PCredit handling is timing-critical, so register it here to break the path
*/
val rxrspValid = RegNext(io.resps.rxrsp.valid)
val rxrspInfo = RegNext(io.resps.rxrsp.respInfo)
val rxrspMshrId = RegNext( io.resps.rxrsp.mshrId)

/*
On PCrdGrant, give the credit to one entry that:
1. has received RetryAck and has not been reissued yet
2. matches the srcID and PCrdType of the grant
3. is picked by the round-robin arbiter when several entries match
*/
val isPCrdGrant = io.resps.rxrsp.valid && (io.resps.rxrsp.respInfo.chiOpcode.get === PCrdGrant)
val isPCrdGrantReg = RegNext(isPCrdGrant)
val waitPCrdInfo = Wire(Vec(mshrsAll, new PCrdInfo))
// val pArb = Module(new RRArbiter(UInt(), mshrsAll))
val timeOutPri = VecInit(Seq.fill(16)(false.B))
val timeOutSel = WireInit(false.B)
val pCrdPri = VecInit(Seq.fill(16)(false.B))
val pArb = Module(new RRArbiter(UInt(), mshrsAll))

val matchPCrdGrant = VecInit(waitPCrdInfo.map(p =>
isPCrdGrant && p.valid &&
p.srcID.get === io.resps.rxrsp.respInfo.srcID.get &&
p.pCrdType.get === io.resps.rxrsp.respInfo.pCrdType.get
))

/* pArb.io.in.zipWithIndex.foreach {
case (in, i) =>
in.valid := matchPCrdGrant(i)
val matchPCrdGrantReg = RegNext(matchPCrdGrant)
pArb.io.in.zipWithIndex.foreach {
case (in, i) =>
in.valid := matchPCrdGrantReg(i)
in.bits := 0.U
}
pArb.io.out.ready := true.B
val pCrdRR = VecInit(UIntToOH(pArb.io.chosen))
val pCrdPri = VecInit((matchPCrdGrant.asUInt & pCrdRR.asUInt).asBools)
//val pCrdPri = VecInit(PriorityEncoderOH(matchPCrdGrant))
val pCrdIsWait = OHToUInt(pCrdPri)
*/

/*
Random arbiter if multi-entry match
*/
val lfsr = LFSR(16, true.B)
val idx = Random(16, lfsr)
val idxOH = VecInit(UIntToOH(idx))
val pCrdOH = VecInit(UIntToOH(pArb.io.chosen).asBools)
val pCrdFixPri = VecInit(pCrdOH zip matchPCrdGrantReg map {case(a,b) => a && b})
//val pCrdFixPri = VecInit(PriorityEncoderOH(matchPCrdGrantReg)) //fix priority arbiter

// Timeout protection: a matching entry whose counter reaches 12 takes priority over the round-robin choice
val counter = RegInit(VecInit(Seq.fill(mshrsAll)(0.U((log2Ceil(mshrsAll)+1).W))))

for(i <- 0 until 16) {
when(matchPCrdGrantReg(i)) {
when(!timeOutSel && pCrdFixPri(i) || timeOutPri(i)) {
counter(i):=0.U
}.otherwise {
counter(i):= counter(i) + 1.U
}
}
}
val timeOutOH = PriorityEncoderOH(counter.map(_>=12.U) zip matchPCrdGrantReg map {case(a,b) => a&&b})
timeOutPri := VecInit(timeOutOH)

timeOutSel := timeOutPri.reduce(_|_)
pCrdPri := Mux(timeOutSel, timeOutPri, pCrdFixPri)

val doubleReq = Fill(2, matchPCrdGrant.asUInt)
val doubleGnt = ~(doubleReq - idxOH.asUInt) & doubleReq
val gnt = doubleGnt(31,16) | doubleGnt(15,0)
val pCrdPri = VecInit(gnt.asBools)
val pCrdIsWait = OHToUInt(pCrdPri)
dontTouch (timeOutPri)
dontTouch (timeOutSel)
dontTouch (pCrdOH)
dontTouch (pCrdFixPri)
dontTouch (pCrdPri)

/* when PCrdGrant come before RetryAck, 16 entry CAM used to:
1. save {srcID, PCrdType}
Expand All @@ -171,7 +193,8 @@ class MSHRCtl(implicit p: Parameters) extends TL2CHIL2Module {
val pCamValids = Cat(pCam.map(_.valid))
val enqIdx = PriorityEncoder(~pCamValids.asUInt)

when (isPCrdGrant && !pCrdIsWait.orR){
// when (isPCrdGrant && !pCrdIsWait.orR){
when (isPCrdGrant){
pCam(enqIdx).valid := true.B
pCam(enqIdx).srcID.get := io.resps.rxrsp.respInfo.srcID.get
pCam(enqIdx).pCrdType.get := io.resps.rxrsp.respInfo.pCrdType.get
Expand Down Expand Up @@ -214,8 +237,10 @@ class MSHRCtl(implicit p: Parameters) extends TL2CHIL2Module {
m.io.resps.rxdat.valid := m.io.status.valid && io.resps.rxdat.valid && io.resps.rxdat.mshrId === i.U
m.io.resps.rxdat.bits := io.resps.rxdat.respInfo

m.io.resps.rxrsp.valid := (m.io.status.valid && io.resps.rxrsp.valid && !isPCrdGrant && io.resps.rxrsp.mshrId === i.U) || (isPCrdGrant && pCrdPri(i))
m.io.resps.rxrsp.bits := io.resps.rxrsp.respInfo
// m.io.resps.rxrsp.valid := (m.io.status.valid && io.resps.rxrsp.valid && !isPCrdGrant && io.resps.rxrsp.mshrId === i.U) || (isPCrdGrant && pCrdPri(i))
// m.io.resps.rxrsp.bits := io.resps.rxrsp.respInfo
m.io.resps.rxrsp.valid := (m.io.status.valid && rxrspValid && !isPCrdGrantReg && rxrspMshrId === i.U) || (isPCrdGrantReg && pCrdPri(i))
m.io.resps.rxrsp.bits := rxrspInfo

m.io.replResp.valid := io.replResp.valid && io.replResp.bits.mshrId === i.U
m.io.replResp.bits := io.replResp.bits
Expand All @@ -226,7 +251,7 @@ class MSHRCtl(implicit p: Parameters) extends TL2CHIL2Module {
m.io.aMergeTask.bits := io.aMergeTask.bits.task

waitPCrdInfo(i) := m.io.waitPCrdInfo
m.io.pCamPri := (pCamPri === i.U) && waitPCrdInfo(i).valid
m.io.pCamPri := 0.U /*(pCamPri === i.U) && waitPCrdInfo(i).valid*/
}
/* Reserve 1 entry for SinkB */
io.waitPCrdInfo <> waitPCrdInfo
Expand Down
1 change: 1 addition & 0 deletions src/main/scala/coupledL2/tl2chi/Slice.scala
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class Slice()(implicit p: Parameters) extends BaseSlice[OuterBundle]
reqBuf.io.mshrInfo := mshrCtl.io.msInfo
reqBuf.io.mainPipeBlock := mainPipe.io.toReqBuf
reqBuf.io.s1Entrance := reqArb.io.s1Entrance
reqBuf.io.taskFromArb_s2 := reqArb.io.taskToPipe_s2

mainPipe.io.taskFromArb_s2 := reqArb.io.taskToPipe_s2
mainPipe.io.taskInfo_s1 := reqArb.io.taskInfo_s1
Expand Down
9 changes: 6 additions & 3 deletions src/main/scala/coupledL2/tl2chi/TXREQ.scala
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,21 @@ class TXREQ(implicit p: Parameters) extends TL2CHIL2Module {
require(chiOpt.isDefined)

// TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later
val queue = Module(new Queue(new CHIREQ, entries = mshrsAll, flow = true))
val queue = Module(new Queue(new CHIREQ, entries = mshrsAll, flow = false))

// Back pressure logic from TXREQ
val queueCnt = queue.io.count
// TODO: this may be imprecise, review this later
val pipeStatus_s1_s5 = io.pipeStatusVec
val pipeStatus_s2_s5 = pipeStatus_s1_s5.tail
val pipeStatus_s1 = pipeStatus_s1_s5.head
val pipeStatus_s2 = pipeStatus_s1_s5(1)
val s2ReturnCredit = pipeStatus_s2.valid && !(pipeStatus_s2.bits.mshrTask && pipeStatus_s2.bits.toTXREQ)
// inflightCnt equals the number of reqs on s2~s5 that may flow into TXREQ soon, plus queueCnt.
// The calculation of inflightCnt might be imprecise and lead to false-positive back pressure.
val inflightCnt = PopCount(Cat(pipeStatus_s2_s5.map(s => s.valid && s.bits.mshrTask && s.bits.toTXREQ))) +
pipeStatus_s1.valid.asUInt +
// pipeStatus_s1.valid.asUInt +
1.U - s2ReturnCredit.asUInt + //Fix Timing: always take credit and s2 return if not take
queueCnt
val noSpace = inflightCnt >= mshrsAll.U

Expand All @@ -73,4 +76,4 @@ class TXREQ(implicit p: Parameters) extends TL2CHIL2Module {
io.out.bits.tgtID := SAM(sam).lookup(io.out.bits.addr)
io.out.bits.size := log2Ceil(blockBytes).U(SIZE_WIDTH.W) // TODO
io.out.bits.addr := restoreAddressUInt(queue.io.deq.bits.addr, io.sliceId)
}
}
4 changes: 2 additions & 2 deletions src/main/scala/coupledL2/tl2chi/TXRSP.scala
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class TXRSP(implicit p: Parameters) extends TL2CHIL2Module {
require(chiOpt.isDefined)

// TODO: an mshrsAll-entry queue is too much, evaluate for a proper size later
val queue = Module(new Queue(new CHIRSP, entries = mshrsAll, flow = true))
val queue = Module(new Queue(new CHIRSP, entries = mshrsAll, flow = false))

// Back pressure logic from TXRSP
val queueCnt = queue.io.count
Expand Down Expand Up @@ -88,4 +88,4 @@ class TXRSP(implicit p: Parameters) extends TL2CHIL2Module {
// TODO: Finish this
rsp
}
}
}
2 changes: 2 additions & 0 deletions src/main/scala/coupledL2/tl2tl/Slice.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class Slice()(implicit p: Parameters) extends BaseSlice[OuterBundle] {
a_reqBuf.io.mshrInfo := mshrCtl.io.msInfo
a_reqBuf.io.mainPipeBlock := mainPipe.io.toReqBuf
a_reqBuf.io.s1Entrance := reqArb.io.s1Entrance
a_reqBuf.io.taskFromArb_s2 := reqArb.io.taskToPipe_s2

sinkB.io.msInfo := mshrCtl.io.msInfo
sinkC.io.msInfo := mshrCtl.io.msInfo

Expand Down

0 comments on commit d9b702d

Please sign in to comment.