Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

misc: Split Data SRAM in Chisel #229

Merged
merged 2 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/main/scala/coupledL2/DataStorage.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package coupledL2

import chisel3._
import chisel3.util._
import coupledL2.utils.{HoldUnless, SRAMTemplate}
import coupledL2.utils.{HoldUnless, SRAMTemplate, SplittedSRAM}
import utility.ClockGate
import org.chipsalliance.cde.config.Parameters

Expand Down Expand Up @@ -48,10 +48,11 @@ class DataStorage(implicit p: Parameters) extends L2Module {
})

// read data is set MultiCycle Path 2
val array = Module(new SRAMTemplate(
val array = Module(new SplittedSRAM(
gen = new DSBlock,
set = blocks,
way = 1,
dataSplit = 4,
singlePort = true,
readMCP2 = true
))
Expand Down
21 changes: 19 additions & 2 deletions src/main/scala/coupledL2/utils/BankedSRAM.scala
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
/** *************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
* *************************************************************************************
*/

package coupledL2.utils

import chisel3._
Expand All @@ -11,7 +28,7 @@ class BankedSRAM[T <: Data]
gen: T, sets: Int, ways: Int, n: Int = 1,
shouldReset: Boolean = false, holdRead: Boolean = false,
singlePort: Boolean = false, bypassWrite: Boolean = false,
clkDivBy2: Boolean = false
clkDivBy2: Boolean = false, readMCP2: Boolean = false
) extends Module {
val io = IO(new Bundle() {
val r = Flipped(new SRAMReadBus(gen, sets, ways))
Expand All @@ -33,7 +50,7 @@ class BankedSRAM[T <: Data]
gen, innerSet, ways,
shouldReset = shouldReset, holdRead = holdRead,
singlePort = true, bypassWrite = bypassWrite,
clkDivBy2 = clkDivBy2
clkDivBy2 = clkDivBy2, readMCP2 = readMCP2
))
sram.io.r.req.valid := io.r.req.valid && ren
sram.io.r.req.bits.apply(r_setIdx)
Expand Down
115 changes: 115 additions & 0 deletions src/main/scala/coupledL2/utils/SplittedSRAM.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package coupledL2.utils

import chisel3._
import chisel3.util._

/** Splits one logical SRAM into `setSplit * waySplit * dataSplit` smaller [[SRAMTemplate]]s.
  *
  * split SRAM by set/way/data
  *   1. use lower-bits of set to select bank
  *   2. split ways and parallel access
  *   3. split data and parallel access
  *   - a simple graph is shown below (see the diagram above `allData`)
  *
  * The external interface is the same `SRAMReadBus` / `SRAMWriteBus` pair that would be
  * built for an unsplit SRAM of `set` sets and `way` ways of `gen`.
  *
  * @param gen         data type stored in each way; its width must be divisible by `dataSplit`
  * @param set         total number of sets; must be divisible by `setSplit`
  * @param way         total number of ways; must be divisible by `waySplit`
  * @param setSplit    number of set banks; must be a positive power of 2 because the bank is
  *                    selected by the low `log2Ceil(setSplit)` bits of the set index
  * @param waySplit    number of groups the ways are divided into (must be positive)
  * @param dataSplit   number of slices each data word is divided into (must be positive)
  * @param shouldReset forwarded unchanged to every inner SRAMTemplate
  * @param holdRead    forwarded unchanged to every inner SRAMTemplate
  * @param singlePort  forwarded unchanged to every inner SRAMTemplate
  * @param bypassWrite forwarded unchanged to every inner SRAMTemplate
  * @param clkDivBy2   forwarded to every inner SRAMTemplate; also adds one extra register
  *                    stage to the read bank-select used to pick the response
  * @param readMCP2    forwarded unchanged to every inner SRAMTemplate
  */
class SplittedSRAM[T <: Data]
(
  gen: T, set: Int, way: Int,
  setSplit: Int = 1, waySplit: Int = 1, dataSplit: Int = 1,
  shouldReset: Boolean = false, holdRead: Boolean = false,
  singlePort: Boolean = true, bypassWrite: Boolean = false,
  clkDivBy2: Boolean = false, readMCP2: Boolean = true
) extends Module {
  val io = IO(new Bundle() {
    val r = Flipped(new SRAMReadBus(gen, set, way))
    val w = Flipped(new SRAMWriteBus(gen, set, way))
  })

  // ---- set split ----
  // The bank is chosen by the low log2Ceil(setSplit) bits of the set index. With a
  // non-power-of-2 setSplit some values of those bits would match no bank at all
  // (e.g. set = 6, setSplit = 3: set index 3 selects "bank 3"), so accesses to those
  // sets would be silently dropped. Reject such configurations at elaboration time.
  require(isPow2(setSplit), s"setSplit must be a positive power of 2, got $setSplit")
  require(set % setSplit == 0, "sets must be divisible by setSplit")
  val innerSets = set / setSplit
  val bankBits = log2Ceil(setSplit)
  val innerSetBits = log2Up(set) - bankBits
  // Upper bits of the set index address a row inside the bank, lower bits pick the bank.
  val r_setIdx = io.r.req.bits.setIdx.head(innerSetBits)
  val r_bankSel = if(setSplit == 1) 0.U else io.r.req.bits.setIdx(bankBits - 1, 0)
  val w_setIdx = io.w.req.bits.setIdx.head(innerSetBits)
  val w_bankSel = if(setSplit == 1) 0.U else io.w.req.bits.setIdx(bankBits - 1, 0)

  // ---- way split ----
  require(waySplit > 0, s"waySplit must be positive, got $waySplit")
  require(way % waySplit == 0, "ways must be divisible by waySplit")
  val innerWays = way / waySplit

  // ---- data split ----
  require(dataSplit > 0, s"dataSplit must be positive, got $dataSplit")
  require(gen.getWidth % dataSplit == 0, "data width must be divisible by dataSplit")
  val innerWidth = gen.getWidth / dataSplit

  // array(i)(j)(k): set bank i, way group j, data slice k.
  val array = Seq.fill(setSplit)(Seq.fill(waySplit)(Seq.fill(dataSplit)(
    Module(new SRAMTemplate(
      UInt(innerWidth.W), innerSets, innerWays,
      shouldReset = shouldReset, holdRead = holdRead,
      singlePort = singlePort, bypassWrite = bypassWrite,
      clkDivBy2 = clkDivBy2, readMCP2 = readMCP2
    ))
  )))

  for (i <- 0 until setSplit) {
    // Only the bank addressed by the low set-index bits sees a valid request.
    val ren = i.U === r_bankSel
    val wen = i.U === w_bankSel

    for (j <- 0 until waySplit) {
      // Slice of the write waymask that belongs to way group j.
      val waymask = if (way > 1) io.w.req.bits.waymask.get(innerWays * (j+1) - 1, innerWays * j) else 1.U
      // if waymask-part is 0.U, we need not set array(i)(j)(_).io.w.req
      // TODO: consider whether to add this, which may lower power consumption, but will add burden to timing
      // val needWrite = waymask.orR

      for (k <- 0 until dataSplit) {
        array(i)(j)(k).io.r.req.valid := io.r.req.valid && ren
        array(i)(j)(k).io.r.req.bits.apply(r_setIdx)
        array(i)(j)(k).io.w.req.valid := io.w.req.valid && wen // && needWrite
        // Each inner SRAM gets data slice k (bits innerWidth*k .. innerWidth*(k+1)-1)
        // of every way that belongs to way group j.
        array(i)(j)(k).io.w.req.bits.apply(
          VecInit(io.w.req.bits.data.slice(innerWays * j, innerWays * (j+1)).map(_.asUInt(innerWidth * (k+1) - 1, innerWidth * k))),
          w_setIdx, waymask
        )
      }
    }
  }

  // One-hot read bank select, registered so that it lines up with the read response:
  // one register stage normally, two stages when clkDivBy2 is set.
  val ren_vec_0 = VecInit((0 until setSplit).map(i => i.U === r_bankSel))
  val ren_vec_1 = RegNext(ren_vec_0, 0.U.asTypeOf(ren_vec_0))
  val ren_vec = if(clkDivBy2){
    RegNext(ren_vec_1, 0.U.asTypeOf(ren_vec_0))
  } else ren_vec_1

  // at most one bank may be selected for the read response
  assert({PopCount(ren_vec) <= 1.U})

  // TODO: we should consider the readys of all sram to be accessed, and orR them
  // but since waySplitted and dataSplitted smaller srams should have the same behavior
  // we just use one of them for ready, for better timing
  io.r.req.ready := VecInit((0 until setSplit).map(i => array(i).head.head.io.r.req.ready))(r_bankSel)
  io.w.req.ready := VecInit((0 until setSplit).map(i => array(i).head.head.io.w.req.ready))(w_bankSel)


  // * an example of "setSplit 2, waySplit 2, dataSplit 4" of an SRAM with way 4 (2 ways per waySplit) *
  // =========================================================================================
  //                               /  way 0 -- [data 3] | [data 2] | [data 1] | [data 0]
  // set[0] == 0.U -> waySplit 0 |-  way 1 -- [data 3] | [data 2] | [data 1] | [data 0]
  // -----------------------------------------------------------------------------------------
  //                  waySplit 1 |-  way 0 -- [data 3] | [data 2] | [data 1] | [data 0]
  //                               \  way 1 -- [data 3] | [data 2] | [data 1] | [data 0]
  // =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  //                               /  way 0 -- [data 3] | [data 2] | [data 1] | [data 0]
  // set[0] == 1.U -> waySplit 0 |-  way 1 -- [data 3] | [data 2] | [data 1] | [data 0]
  // -----------------------------------------------------------------------------------------
  //                  waySplit 1 |-  way 0 -- [data 3] | [data 2] | [data 1] | [data 0]
  //                               \  way 1 -- [data 3] | [data 2] | [data 1] | [data 0]
  // =========================================================================================
  // 1. aggregate data of the same line first
  // 2. collect all data lines in the same `WaySplit`
  // 3. use flatMap to collect all `WaySplit`, and we can get the targetData (Vec[T])
  // 4. use ren_vec to select the certain set
  val allData = (0 until setSplit).map(i =>
    VecInit((0 until waySplit).flatMap(j =>
      (0 until innerWays).map(w =>
        // Cat places its first argument in the most significant bits, so reverse to put
        // data slice 0 at the LSBs, matching how the write path sliced the data.
        Cat((0 until dataSplit).map(k => array(i)(j)(k).io.r.resp.data(w)).reverse)
      )
    ))
  )

  io.r.resp.data := Mux1H(ren_vec, allData).asTypeOf(Vec(way, gen))
}
72 changes: 72 additions & 0 deletions src/test/scala/TestSplittedSRAM.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package coupledL2

import coupledL2.utils._
import chisel3._
import chisel3.util._
import chiseltest._
import chiseltest.RawTester.test
import chisel3.experimental._
import chisel3.testers._
import org.chipsalliance.cde.config._
import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage}
import scala.collection.mutable.ArrayBuffer
import chiseltest.WriteVcdAnnotation
import scala.util.Random

/** Standalone chiseltest driver for `SplittedSRAM`.
  *
  * Each iteration writes a random, way-masked line to a random set, reads that set
  * back, and compares every way against a software reference model (`correctData`).
  * The random generators use fixed seeds, so the stimulus is reproducible.
  */
object TestSplittedSRAM extends App {
  val dataw = 24
  val nsets = 32
  val nways = 18
  val ntests = 16
  val dumpVcd = false
  // println(getVerilogString(new SplittedSRAM(UInt(dataw.W), set = nsets, way = nways,
  //   setSplit = 2, waySplit = 3, dataSplit = 4, shouldReset = false, singlePort = true)))
  test(
    new SplittedSRAM(UInt(dataw.W), set = nsets, way = nways, setSplit = 2, waySplit = 3, dataSplit = 4,
      shouldReset = false, singlePort = true), if (dumpVcd) Seq(WriteVcdAnnotation) else Seq())
  { s =>
    val randomWaymask = new Random(12)
    val randomData = new Random(34)
    val randomSet = new Random(56)
    // Reference model: expected content of every (set, way); starts as all zeros.
    // The buffer is mutated in place, so the binding itself never needs to change.
    val correctData = ArrayBuffer.fill(nsets)(ArrayBuffer.fill(nways)(0))
    s.clock.step(2)

    for (i <- 0 until ntests) {
      val data = (0 until nways).map(_ => randomData.nextInt(1 << dataw))
      val waymask = randomWaymask.nextInt(1 << nways)
      val set = randomSet.nextInt(nsets)

      // Drive a write request for one cycle.
      s.io.w.req.valid.poke(true.B)
      s.io.w.req.bits.setIdx.poke(set.U)
      // waymask is an Option on the bus; poke it only when present.
      s.io.w.req.bits.waymask.foreach(_.poke(waymask.U))
      (0 until nways).foreach { w =>
        s.io.w.req.bits.data(w).poke(data(w).U)
        // Only ways enabled in the mask are updated in the reference model.
        if ((waymask & (1 << w)) != 0) correctData(set)(w) = data(w)
      }

      s.clock.step(1)
      s.io.w.req.valid.poke(false.B)

      // Drive a read request to the same set for one cycle.
      s.clock.step(1)
      s.io.r.req.valid.poke(true.B)
      s.io.r.req.bits.setIdx.poke(set.U)

      s.clock.step(1)
      s.io.r.req.valid.poke(false.B)

      // The response is checked one clock step after the read request was applied.
      (0 until nways).foreach { w =>
        s.io.r.resp.data(w).expect(correctData(set)(w).U)
      }
      s.clock.step(1)

      println(s"passed $i")
    }
  }
}

/*
# To build a single test Module
mill:
mill -i CoupledL2.test.runMain coupledL2.TestSplittedSRAM
*/
Loading