From 45bf23865f8ab60874a231ba3da705f3af485399 Mon Sep 17 00:00:00 2001 From: Chen Xi <48302201+Ivyfeather@users.noreply.github.com> Date: Wed, 13 Sep 2023 10:32:50 +0800 Subject: [PATCH] GrantBuf: ensure pftRespQueue will never overflow (#56) * GrantBuf: ensure pftRespQueue will never overflow * GrantBuf: fix pftResp backpressure --- src/main/scala/coupledL2/GrantBuffer.scala | 48 ++++++++++++++-------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala index e34165c7..e1d94d07 100644 --- a/src/main/scala/coupledL2/GrantBuffer.scala +++ b/src/main/scala/coupledL2/GrantBuffer.scala @@ -149,22 +149,26 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { ) // =========== send response to prefetcher =========== - // WARNING: this should never overflow (extremely rare though) - // but a second thought, pftQueue overflow results in no functional correctness bug + val pftRespEntry = new Bundle() { + val tag = UInt(tagBits.W) + val set = UInt(setBits.W) + } + // TODO: this may not need 10 entries, but this does not take much space + val pftQueueLen = 10 + val pftRespQueue = prefetchOpt.map(_ => Module(new Queue(pftRespEntry, entries = pftQueueLen, flow = true))) prefetchOpt.map { _ => - val pftRespQueue = Module(new Queue(new TaskWithData(), entries = 4, flow = true)) - - pftRespQueue.io.enq.valid := io.d_task.valid && dtaskOpcode === HintAck && + pftRespQueue.get.io.enq.valid := io.d_task.valid && dtaskOpcode === HintAck && io.d_task.bits.task.fromL2pft.getOrElse(false.B) - pftRespQueue.io.enq.bits := io.d_task.bits + pftRespQueue.get.io.enq.bits.tag := io.d_task.bits.task.tag + pftRespQueue.get.io.enq.bits.set := io.d_task.bits.task.set val resp = io.prefetchResp.get - resp.valid := pftRespQueue.io.deq.valid - resp.bits.tag := pftRespQueue.io.deq.bits.task.tag - resp.bits.set := pftRespQueue.io.deq.bits.task.set - pftRespQueue.io.deq.ready := resp.ready + resp.valid := pftRespQueue.get.io.deq.valid + resp.bits.tag := pftRespQueue.get.io.deq.bits.tag + resp.bits.set := pftRespQueue.get.io.deq.bits.set + pftRespQueue.get.io.deq.ready := resp.ready - assert(pftRespQueue.io.enq.ready, "pftRespQueue should never be full, no back pressure logic") + assert(pftRespQueue.get.io.enq.ready, "pftRespQueue should never be full, no back pressure logic") } // If no prefetch, there never should be HintAck assert(prefetchOpt.nonEmpty.B || !io.d_task.valid || dtaskOpcode =/= HintAck) @@ -216,23 +220,32 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { // =========== handle blocking - capacity conflict =========== // count the number of valid blocks + those in pipe that might use GrantBuf - // so that GrantBuffer will not exceed capacity - // TODO: we can still allow pft_resps (HintAck) to enter mainpipe + // so that GrantBuffer will not exceed capacity [back pressure] val noSpaceForSinkReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s => s.valid && (s.bits.fromA || s.bits.fromC) }).asUInt) + grantQueueCnt >= mshrsAll.U val noSpaceForMSHRReq = PopCount(VecInit(io.pipeStatusVec.map { case s => s.valid && s.bits.fromA }).asUInt) + grantQueueCnt >= mshrsAll.U + // pftRespQueue also requires back pressure to ensure that it will not exceed capacity + // Ideally, it should only block Prefetch from entering MainPipe + // But since it is extremely rare that pftRespQueue of 10 would be full, we just block all Entrance here, simpler logic + // TODO: consider optimize this + val noSpaceForSinkPft = prefetchOpt.map(_ => PopCount(VecInit(io.pipeStatusVec.tail.map { case s => + s.valid && s.bits.fromA + }).asUInt) + pftRespQueue.get.io.count >= pftQueueLen.U) + val noSpaceForMSHRPft = prefetchOpt.map(_ => PopCount(VecInit(io.pipeStatusVec.map { case s => + s.valid && s.bits.fromA + }).asUInt) + pftRespQueue.get.io.count >= pftQueueLen.U) - io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq + io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq || noSpaceForSinkPft.getOrElse(false.B) io.toReqArb.blockSinkReqEntrance.blockB_s1 := Cat(inflight_grant.map(g => g.valid && g.bits.set === io.fromReqArb.status_s1.b_set && g.bits.tag === io.fromReqArb.status_s1.b_tag)).orR //TODO: or should we still Stall B req? // A-replace related rprobe is handled in SourceB io.toReqArb.blockSinkReqEntrance.blockC_s1 := noSpaceForSinkReq - io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B - io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq + io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B // this is not used + io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B) // =========== generating Hint to L1 =========== // TODO: the following keeps the exact same logic as before, but it needs serious optimization @@ -281,5 +294,8 @@ class GrantBuffer(implicit p: Parameters) extends L2Module { XSPerfHistogram(cacheParams, "grant_grantack_period", t, enable, 0, 12, 1) XSPerfMax(cacheParams, "max_grant_grantack_period", t, enable) } + // pftRespQueue is about to be full, and using back pressure to block All MainPipe Entrance + // which can SERIOUSLY affect performance, should consider less drastic prefetch policy + XSPerfAccumulate(cacheParams, "WARNING_pftRespQueue_about_to_full", noSpaceForSinkPft.getOrElse(false.B)) } }