From 6c729fe21201081b283c0a5f4c8d1e4b28acdca6 Mon Sep 17 00:00:00 2001 From: XiChen Date: Mon, 19 Aug 2024 15:00:47 +0800 Subject: [PATCH 1/2] Misc: Split Data SRAM in Chisel split into 4 smaller srams (dataWidth/4) --- src/main/scala/coupledL2/DataStorage.scala | 5 +- .../scala/coupledL2/utils/BankedSRAM.scala | 21 +++- .../scala/coupledL2/utils/SplittedSRAM.scala | 115 ++++++++++++++++++ 3 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 src/main/scala/coupledL2/utils/SplittedSRAM.scala diff --git a/src/main/scala/coupledL2/DataStorage.scala b/src/main/scala/coupledL2/DataStorage.scala index 38be2e5a..74c95824 100644 --- a/src/main/scala/coupledL2/DataStorage.scala +++ b/src/main/scala/coupledL2/DataStorage.scala @@ -19,7 +19,7 @@ package coupledL2 import chisel3._ import chisel3.util._ -import coupledL2.utils.{HoldUnless, SRAMTemplate} +import coupledL2.utils.{HoldUnless, SRAMTemplate, SplittedSRAM} import utility.ClockGate import org.chipsalliance.cde.config.Parameters @@ -48,10 +48,11 @@ class DataStorage(implicit p: Parameters) extends L2Module { }) // read data is set MultiCycle Path 2 - val array = Module(new SRAMTemplate( + val array = Module(new SplittedSRAM( gen = new DSBlock, set = blocks, way = 1, + dataSplit = 4, singlePort = true, readMCP2 = true )) diff --git a/src/main/scala/coupledL2/utils/BankedSRAM.scala b/src/main/scala/coupledL2/utils/BankedSRAM.scala index e6273951..2540a779 100644 --- a/src/main/scala/coupledL2/utils/BankedSRAM.scala +++ b/src/main/scala/coupledL2/utils/BankedSRAM.scala @@ -1,3 +1,20 @@ +/** ************************************************************************************* + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + * ************************************************************************************* + */ + package coupledL2.utils import chisel3._ @@ -11,7 +28,7 @@ class BankedSRAM[T <: Data] gen: T, sets: Int, ways: Int, n: Int = 1, shouldReset: Boolean = false, holdRead: Boolean = false, singlePort: Boolean = false, bypassWrite: Boolean = false, - clkDivBy2: Boolean = false + clkDivBy2: Boolean = false, readMCP2: Boolean = false ) extends Module { val io = IO(new Bundle() { val r = Flipped(new SRAMReadBus(gen, sets, ways)) @@ -33,7 +50,7 @@ class BankedSRAM[T <: Data] gen, innerSet, ways, shouldReset = shouldReset, holdRead = holdRead, singlePort = true, bypassWrite = bypassWrite, - clkDivBy2 = clkDivBy2 + clkDivBy2 = clkDivBy2, readMCP2 = readMCP2 )) sram.io.r.req.valid := io.r.req.valid && ren sram.io.r.req.bits.apply(r_setIdx) diff --git a/src/main/scala/coupledL2/utils/SplittedSRAM.scala b/src/main/scala/coupledL2/utils/SplittedSRAM.scala new file mode 100644 index 00000000..874187e2 --- /dev/null +++ b/src/main/scala/coupledL2/utils/SplittedSRAM.scala @@ -0,0 +1,115 @@ +package coupledL2.utils + +import chisel3._ +import chisel3.util._ + +// split SRAM by set/way/data +// 1. use lower-bits of set to select bank +// 2. split ways and parallel access +// 3. 
split data and parallel access +// * a simple graph is shown below + +class SplittedSRAM[T <: Data] +( + gen: T, set: Int, way: Int, + setSplit: Int = 1, waySplit: Int = 1, dataSplit: Int = 1, + shouldReset: Boolean = false, holdRead: Boolean = false, + singlePort: Boolean = true, bypassWrite: Boolean = false, + clkDivBy2: Boolean = false, readMCP2: Boolean = true +) extends Module { + val io = IO(new Bundle() { + val r = Flipped(new SRAMReadBus(gen, set, way)) + val w = Flipped(new SRAMWriteBus(gen, set, way)) + }) + + + require(set % setSplit == 0, "sets must be divisible by setSplit") + val innerSets = set / setSplit + val bankBits = log2Ceil(setSplit) + val innerSetBits = log2Up(set) - bankBits + val r_setIdx = io.r.req.bits.setIdx.head(innerSetBits) + val r_bankSel = if(setSplit == 1) 0.U else io.r.req.bits.setIdx(bankBits - 1, 0) + val w_setIdx = io.w.req.bits.setIdx.head(innerSetBits) + val w_bankSel = if(setSplit == 1) 0.U else io.w.req.bits.setIdx(bankBits - 1, 0) + + require(way % waySplit == 0, "ways must be divisible by waySplit") + val innerWays = way / waySplit + + require(gen.getWidth % dataSplit == 0, "data width must be divisible by dataSplit") + val innerWidth = gen.getWidth / dataSplit + + val array = Seq.fill(setSplit)(Seq.fill(waySplit)(Seq.fill(dataSplit)( + Module(new SRAMTemplate( + UInt(innerWidth.W), innerSets, innerWays, + shouldReset = shouldReset, holdRead = holdRead, + singlePort = singlePort, bypassWrite = bypassWrite, + clkDivBy2 = clkDivBy2, readMCP2 = readMCP2 + )) + ))) + + for (i <- 0 until setSplit) { + val ren = i.U === r_bankSel + val wen = i.U === w_bankSel + + for (j <- 0 until waySplit) { + val waymask = if (way > 1) io.w.req.bits.waymask.get(innerWays * (j+1) - 1, innerWays * j) else 1.U + // if waymask-part is 0.U, we need not set array(i)(j)(_).io.w.req + // TODO: consider whether to add this, which may lower power consumption, but will add burden to timing + // val needWrite = waymask.orR + + for (k <- 0 until 
dataSplit) { + array(i)(j)(k).io.r.req.valid := io.r.req.valid && ren + array(i)(j)(k).io.r.req.bits.apply(r_setIdx) + array(i)(j)(k).io.w.req.valid := io.w.req.valid && wen // && needWrite + array(i)(j)(k).io.w.req.bits.apply( + VecInit(io.w.req.bits.data.slice(innerWays * j, innerWays * (j+1)).map(_.asUInt(innerWidth * (k+1) - 1, innerWidth * k))), + w_setIdx, waymask + ) + } + } + } + + val ren_vec_0 = VecInit((0 until setSplit).map(i => i.U === r_bankSel)) + val ren_vec_1 = RegNext(ren_vec_0, 0.U.asTypeOf(ren_vec_0)) + val ren_vec = if(clkDivBy2){ + RegNext(ren_vec_1, 0.U.asTypeOf(ren_vec_0)) + } else ren_vec_1 + + // only one read/write + assert({PopCount(ren_vec) <= 1.U}) + + // TODO: we should consider the readys of all sram to be accessed, and orR them + // but since waySplitted and dataSplitted smaller srams should have the same behavior + // we just use one of them for ready, for better timing + io.r.req.ready := VecInit((0 until setSplit).map(i => array(i).head.head.io.r.req.ready))(r_bankSel) + io.w.req.ready := VecInit((0 until setSplit).map(i => array(i).head.head.io.w.req.ready))(w_bankSel) + + + // * an example of "setSplit 2, waySplit 2, dataSplit 4" of an SRAM with way 2 * + // ========================================================================================= + // / way 0 -- [data 3] | [data 2] | [data 1] | [data 0] + // set[0] == 0.U -> waySplit 0 |- way 1 -- [data 3] | [data 2] | [data 1] | [data 0] + // ----------------------------------------------------------------------------------------- + // waySplit 1 |- way 0 -- [data 3] | [data 2] | [data 1] | [data 0] + // \ way 1 -- [data 3] | [data 2] | [data 1] | [data 0] + // =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + // / way 0 -- [data 3] | [data 2] | [data 1] | [data 0] + // set[0] == 1.U -> waySplit 0 |- way 1 -- [data 3] | [data 2] | [data 1] | [data 0] + // 
----------------------------------------------------------------------------------------- + // waySplit 1 |- way 0 -- [data 3] | [data 2] | [data 1] | [data 0] + // \ way 1 -- [data 3] | [data 2] | [data 1] | [data 0] + // ========================================================================================= + // 1. aggregate data of the same line first + // 2. collect all data lines in the same `WaySplit` + // 3. use flatMap to collect all `WaySplit`, and we can get the targetData (Vec[T]) + // 4. use ren_vec to select the certain set + val allData = (0 until setSplit).map(i => + VecInit((0 until waySplit).flatMap(j => + (0 until innerWays).map(w => + Cat((0 until dataSplit).map(k => array(i)(j)(k).io.r.resp.data(w)).reverse) + ) + )) + ) + + io.r.resp.data := Mux1H(ren_vec, allData).asTypeOf(Vec(way, gen)) +} From af1b30583aad6282e4425a943843f06c67078657 Mon Sep 17 00:00:00 2001 From: XiChen Date: Mon, 26 Aug 2024 14:38:42 +0800 Subject: [PATCH 2/2] misc: add SplittedSRAM chisel-test --- src/test/scala/TestSplittedSRAM.scala | 72 +++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 src/test/scala/TestSplittedSRAM.scala diff --git a/src/test/scala/TestSplittedSRAM.scala b/src/test/scala/TestSplittedSRAM.scala new file mode 100644 index 00000000..413adb27 --- /dev/null +++ b/src/test/scala/TestSplittedSRAM.scala @@ -0,0 +1,72 @@ +package coupledL2 + +import coupledL2.utils._ +import chisel3._ +import chisel3.util._ +import chiseltest._ +import chiseltest.RawTester.test +import chisel3.experimental._ +import chisel3.testers._ +import org.chipsalliance.cde.config._ +import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage} +import scala.collection.mutable.ArrayBuffer +import chiseltest.WriteVcdAnnotation +import scala.util.Random + +object TestSplittedSRAM extends App { + val dataw = 24 + val nsets = 32 + val nways = 18 + val ntests = 16 + val dumpVcd = false +// println(getVerilogString(new SplittedSRAM(UInt(dataw.W), sets = nsets, 
ways = nways, +// setSplit = 2, waySplit = 3, dataSplit = 4, shouldReset = false, singlePort = true))) + test( + new SplittedSRAM(UInt(dataw.W), set = nsets, way = nways, setSplit = 2, waySplit = 3, dataSplit = 4, + shouldReset = false, singlePort = true), if (dumpVcd) Seq(WriteVcdAnnotation) else Seq()) + { s => + val randomWaymask = new Random(12) + val randomData = new Random(34) + val randomSet = new Random(56) + var correctData = ArrayBuffer.fill(nsets)(ArrayBuffer.fill(nways)(0)) + s.clock.step(2) + + for (i <- 0 until ntests) { + val data = (0 until nways).map(_ => randomData.nextInt(1 << dataw)) + val waymask = randomWaymask.nextInt(1 << nways) + val set = randomSet.nextInt(nsets) + s.io.w.req.valid.poke(true.B) + + s.io.w.req.bits.setIdx.poke(set.U) + s.io.w.req.bits.waymask.map(_.poke(waymask.U)) + (0 until nways).map { + w => + s.io.w.req.bits.data(w).poke(data(w).U) + correctData(set)(w) = if ((waymask & (1 << w)) != 0) data(w) else correctData(set)(w) + } + + s.clock.step(1) + s.io.w.req.valid.poke(false.B) + + s.clock.step(1) + s.io.r.req.valid.poke(true.B) + s.io.r.req.bits.setIdx.poke(set.U) + + s.clock.step(1) + s.io.r.req.valid.poke(false.B) + + (0 until nways).map { + w => s.io.r.resp.data(w).expect(correctData(set)(w).U) + } + s.clock.step(1) + + println(s"passed $i") + } + } +} + +/* +# To build a single test Module +mill: + mill -i CoupledL2.test.runMain coupledL2.TestSplittedSRAM +*/ \ No newline at end of file