From 45bf23865f8ab60874a231ba3da705f3af485399 Mon Sep 17 00:00:00 2001
From: Chen Xi <48302201+Ivyfeather@users.noreply.github.com>
Date: Wed, 13 Sep 2023 10:32:50 +0800
Subject: [PATCH] GrantBuf: ensure pftRespQueue will never overflow (#56)

* GrantBuf: ensure pftRespQueue will never overflow

* GrantBuf: fix pftResp backpressure
---
 src/main/scala/coupledL2/GrantBuffer.scala | 48 ++++++++++++++--------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/main/scala/coupledL2/GrantBuffer.scala b/src/main/scala/coupledL2/GrantBuffer.scala
index e34165c7..e1d94d07 100644
--- a/src/main/scala/coupledL2/GrantBuffer.scala
+++ b/src/main/scala/coupledL2/GrantBuffer.scala
@@ -149,22 +149,26 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
   )
 
   // =========== send response to prefetcher ===========
-  // WARNING: this should never overflow (extremely rare though)
-  // but a second thought, pftQueue overflow results in no functional correctness bug
+  val pftRespEntry = new Bundle() {
+    val tag = UInt(tagBits.W)
+    val set = UInt(setBits.W)
+  }
+  // TODO: 10 entries may be more than needed, but each entry only holds tag + set, so the cost is small
+  val pftQueueLen = 10
+  val pftRespQueue = prefetchOpt.map(_ => Module(new Queue(pftRespEntry, entries = pftQueueLen, flow = true)))
   prefetchOpt.map { _ =>
-    val pftRespQueue = Module(new Queue(new TaskWithData(), entries = 4, flow = true))
-
-    pftRespQueue.io.enq.valid := io.d_task.valid && dtaskOpcode === HintAck &&
+    pftRespQueue.get.io.enq.valid := io.d_task.valid && dtaskOpcode === HintAck &&
       io.d_task.bits.task.fromL2pft.getOrElse(false.B)
-    pftRespQueue.io.enq.bits := io.d_task.bits
+    pftRespQueue.get.io.enq.bits.tag := io.d_task.bits.task.tag
+    pftRespQueue.get.io.enq.bits.set := io.d_task.bits.task.set
 
     val resp = io.prefetchResp.get
-    resp.valid := pftRespQueue.io.deq.valid
-    resp.bits.tag := pftRespQueue.io.deq.bits.task.tag
-    resp.bits.set := pftRespQueue.io.deq.bits.task.set
-    pftRespQueue.io.deq.ready := resp.ready
+    resp.valid := pftRespQueue.get.io.deq.valid
+    resp.bits.tag := pftRespQueue.get.io.deq.bits.tag
+    resp.bits.set := pftRespQueue.get.io.deq.bits.set
+    pftRespQueue.get.io.deq.ready := resp.ready
 
-    assert(pftRespQueue.io.enq.ready, "pftRespQueue should never be full, no back pressure logic")
+    assert(pftRespQueue.get.io.enq.ready, "pftRespQueue should never be full, no back pressure logic")
   }
   // If no prefetch, there never should be HintAck
   assert(prefetchOpt.nonEmpty.B || !io.d_task.valid || dtaskOpcode =/= HintAck)
@@ -216,23 +220,32 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
 
   // =========== handle blocking - capacity conflict ===========
   // count the number of valid blocks + those in pipe that might use GrantBuf
-  // so that GrantBuffer will not exceed capacity
-  // TODO: we can still allow pft_resps (HintAck) to enter mainpipe
+  // so that GrantBuffer will not exceed capacity [back pressure]
   val noSpaceForSinkReq = PopCount(VecInit(io.pipeStatusVec.tail.map { case s =>
     s.valid && (s.bits.fromA || s.bits.fromC)
   }).asUInt) + grantQueueCnt >= mshrsAll.U
   val noSpaceForMSHRReq = PopCount(VecInit(io.pipeStatusVec.map { case s =>
     s.valid && s.bits.fromA
   }).asUInt) + grantQueueCnt >= mshrsAll.U
+  // pftRespQueue also requires back pressure to ensure that it never exceeds its capacity.
+  // Ideally, only Prefetch requests should be blocked from entering MainPipe.
+  // But since a pftRespQueue of pftQueueLen (10) entries is very rarely full, we simply block all SinkA and MSHR entrance here (simpler logic).
+  // TODO: consider optimizing this
+  val noSpaceForSinkPft = prefetchOpt.map(_ => PopCount(VecInit(io.pipeStatusVec.tail.map { case s =>
+    s.valid && s.bits.fromA
+  }).asUInt) + pftRespQueue.get.io.count >= pftQueueLen.U)
+  val noSpaceForMSHRPft = prefetchOpt.map(_ => PopCount(VecInit(io.pipeStatusVec.map { case s =>
+    s.valid && s.bits.fromA
+  }).asUInt) + pftRespQueue.get.io.count >= pftQueueLen.U)
 
-  io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq
+  io.toReqArb.blockSinkReqEntrance.blockA_s1 := noSpaceForSinkReq || noSpaceForSinkPft.getOrElse(false.B)
   io.toReqArb.blockSinkReqEntrance.blockB_s1 := Cat(inflight_grant.map(g => g.valid &&
     g.bits.set === io.fromReqArb.status_s1.b_set && g.bits.tag === io.fromReqArb.status_s1.b_tag)).orR
   //TODO: or should we still Stall B req?
   // A-replace related rprobe is handled in SourceB
   io.toReqArb.blockSinkReqEntrance.blockC_s1 := noSpaceForSinkReq
-  io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B
-  io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq
+  io.toReqArb.blockSinkReqEntrance.blockG_s1 := false.B // this is not used
+  io.toReqArb.blockMSHRReqEntrance := noSpaceForMSHRReq || noSpaceForMSHRPft.getOrElse(false.B)
 
   // =========== generating Hint to L1 ===========
   // TODO: the following keeps the exact same logic as before, but it needs serious optimization
@@ -281,5 +294,8 @@ class GrantBuffer(implicit p: Parameters) extends L2Module {
         XSPerfHistogram(cacheParams, "grant_grantack_period", t, enable, 0, 12, 1)
         XSPerfMax(cacheParams, "max_grant_grantack_period", t, enable)
     }
+    // Accumulates noSpaceForSinkPft: pftRespQueue is about to be full, so back pressure is blocking SinkA (and MSHR) MainPipe entrance.
+    // This can SERIOUSLY affect performance; if it shows up, consider a less drastic prefetch policy.
+    XSPerfAccumulate(cacheParams, "WARNING_pftRespQueue_about_to_full", noSpaceForSinkPft.getOrElse(false.B))
   }
 }