Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Backend, MemBlock): add support for Zacas extension #3958

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion difftest
2 changes: 2 additions & 0 deletions src/main/scala/xiangshan/Parameters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,8 @@ trait HasXSParameter {
def AddrBytes = AddrBits / 8 // unused
def DataBits = XLEN
def DataBytes = DataBits / 8
// Width in bits of a quad word: twice DataBits.
def QuadWordBits = DataBits * 2
// Size in bytes of a quad word (QuadWordBits / 8).
def QuadWordBytes = QuadWordBits / 8
def VDataBytes = VLEN / 8
def HasFPU = coreParams.HasFPU
def HasVPU = coreParams.HasVPU
Expand Down
2 changes: 2 additions & 0 deletions src/main/scala/xiangshan/backend/Bundles.scala
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@ object Bundles {
fuType === FuType.ldu.U && LSUOpType.isHlv(fuOpType) || fuType === FuType.stu.U && LSUOpType.isHsv(fuOpType)
}

// True when fuType is an AMO type and fuOpType is one of the AMOCAS operations.
def isAMOCAS: Bool = FuType.isAMO(fuType) && LSUOpType.isAMOCAS(fuOpType)

def srcIsReady: Vec[Bool] = {
VecInit(this.srcType.zip(this.srcState).map {
case (t, s) => SrcType.isNotReg(t) || SrcState.isReady(s)
Expand Down
13 changes: 7 additions & 6 deletions src/main/scala/xiangshan/backend/MemBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,8 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
io.mem_to_ooo.writebackSta <> storeUnits.map(_.io.stout)
io.mem_to_ooo.writebackStd.zip(stdExeUnits).foreach {x =>
x._1.bits := x._2.io.out.bits
x._1.valid := x._2.io.out.fire
// AMOs do not need to write back std now.
x._1.valid := x._2.io.out.fire && !FuType.storeIsAMO(x._2.io.out.bits.uop.fuType)
}
io.mem_to_ooo.writebackHyuLda <> hybridUnits.map(_.io.ldout)
io.mem_to_ooo.writebackHyuSta <> hybridUnits.map(_.io.stout)
Expand Down Expand Up @@ -1608,7 +1609,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
storeUnits(i).io.stin.valid := false.B

state := s_atomics(i)
assert(!st_atomics.zipWithIndex.filterNot(_._2 == i).unzip._1.reduce(_ || _))
// assert(!st_atomics.zipWithIndex.filterNot(_._2 == i).unzip._1.reduce(_ || _))
}
for (i <- 0 until HyuCnt) when(st_atomics(StaCnt + i)) {
io.ooo_to_mem.issueHya(i).ready := atomicsUnit.io.in.ready
Expand All @@ -1618,17 +1619,17 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
assert(!st_atomics.zipWithIndex.filterNot(_._2 == StaCnt + i).unzip._1.reduce(_ || _))
}
when (atomicsUnit.io.out.valid) {
assert((0 until StaCnt + HyuCnt).map(state === s_atomics(_)).reduce(_ || _))
state := s_normal
}

atomicsUnit.io.in.valid := st_atomics.reduce(_ || _)
atomicsUnit.io.in.bits := Mux1H(Seq.tabulate(StaCnt)(i =>
st_atomics(i) -> io.ooo_to_mem.issueSta(i).bits) ++
Seq.tabulate(HyuCnt)(i => st_atomics(StaCnt+i) -> io.ooo_to_mem.issueHya(i).bits))
atomicsUnit.io.storeDataIn.valid := st_data_atomics.reduce(_ || _)
atomicsUnit.io.storeDataIn.bits := Mux1H(Seq.tabulate(StdCnt)(i =>
st_data_atomics(i) -> stData(i).bits))
atomicsUnit.io.storeDataIn.zipWithIndex.foreach { case (stdin, i) =>
stdin.valid := st_data_atomics(i)
stdin.bits := stData(i).bits
}
atomicsUnit.io.redirect <> redirect

// TODO: complete amo's pmp support
Expand Down
10 changes: 9 additions & 1 deletion src/main/scala/xiangshan/backend/decode/DecodeUnit.scala
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ object XDecode extends DecodeConstants {
AMOMINU_W -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amominu_w, SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOMAX_W -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amomax_w , SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOMAXU_W -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amomaxu_w, SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOCAS_W -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amocas_w, SelImm.X, uopSplitType = UopSplitType.AMO_CAS_W, xWen = T, noSpec = T, blockBack = T),

AMOADD_D -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amoadd_d , SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOXOR_D -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amoxor_d , SelImm.X, xWen = T, noSpec = T, blockBack = T),
Expand All @@ -250,6 +251,9 @@ object XDecode extends DecodeConstants {
AMOMINU_D -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amominu_d, SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOMAX_D -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amomax_d , SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOMAXU_D -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amomaxu_d, SelImm.X, xWen = T, noSpec = T, blockBack = T),
AMOCAS_D -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amocas_d, SelImm.X, uopSplitType = UopSplitType.AMO_CAS_D, xWen = T, noSpec = T, blockBack = T),

AMOCAS_Q -> XSDecode(SrcType.reg, SrcType.reg, SrcType.X, FuType.mou, LSUOpType.amocas_q, SelImm.X, uopSplitType = UopSplitType.AMO_CAS_Q, xWen = T, noSpec = T, blockBack = T),

LR_W -> XSDecode(SrcType.reg, SrcType.imm, SrcType.X, FuType.mou, LSUOpType.lr_w, SelImm.X, xWen = T, noSpec = T, blockBack = T),
LR_D -> XSDecode(SrcType.reg, SrcType.imm, SrcType.X, FuType.mou, LSUOpType.lr_d, SelImm.X, xWen = T, noSpec = T, blockBack = T),
Expand Down Expand Up @@ -872,6 +876,9 @@ class DecodeUnit(implicit p: Parameters) extends XSModule with DecodeUnitConstan
private val isAes64ks1iIllegal =
FuType.FuTypeOrR(decodedInst.fuType, FuType.bku) && (decodedInst.fuOpType === BKUOpType.aes64ks1i) && inst.isRnumIllegal

private val isAmocasQ = FuType.FuTypeOrR(decodedInst.fuType, FuType.mou) && decodedInst.fuOpType === LSUOpType.amocas_q
private val isAmocasQIllegal = isAmocasQ && (inst.RD(0) === 1.U || inst.RS2(0) === 1.U)

private val exceptionII =
decodedInst.selImm === SelImm.INVALID_INSTR ||
vecException.io.illegalInst ||
Expand All @@ -894,7 +901,8 @@ class DecodeUnit(implicit p: Parameters) extends XSModule with DecodeUnitConstan
io.fromCSR.illegalInst.cboZ && isCboZero ||
io.fromCSR.illegalInst.cboCF && (isCboClean || isCboFlush) ||
io.fromCSR.illegalInst.cboI && isCboInval ||
isAes64ks1iIllegal
isAes64ks1iIllegal ||
isAmocasQIllegal

private val exceptionVI =
io.fromCSR.virtualInst.sfenceVMA && FuType.FuTypeOrR(decodedInst.fuType, FuType.fence) && decodedInst.fuOpType === FenceOpType.sfence ||
Expand Down
64 changes: 64 additions & 0 deletions src/main/scala/xiangshan/backend/decode/DecodeUnitComp.scala
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,70 @@ class DecodeUnitComp()(implicit p : Parameters) extends XSModule with DecodeUnit
csBundle(0.U).flushPipe := vstartReg =/= 0.U

switch(typeOfSplit) {
is(UopSplitType.AMO_CAS_W) {
csBundle(0).uopIdx := 0.U
csBundle(0).fuOpType := Cat(0.U(3.W), LSUOpType.amocas_w)
csBundle(0).lsrc(0) := src1
csBundle(0).lsrc(1) := dest
csBundle(0).waitForward := true.B
csBundle(0).blockBackward := false.B

csBundle(1).uopIdx := 1.U
csBundle(1).fuOpType := Cat(1.U(3.W), LSUOpType.amocas_w)
csBundle(1).lsrc(0) := src1
csBundle(1).lsrc(1) := src2
csBundle(1).rfWen := false.B
csBundle(1).waitForward := false.B
csBundle(1).blockBackward := true.B
}
is(UopSplitType.AMO_CAS_D) {
csBundle(0).uopIdx := 0.U
csBundle(0).fuOpType := Cat(0.U(3.W), LSUOpType.amocas_d)
csBundle(0).lsrc(0) := src1
csBundle(0).lsrc(1) := dest
csBundle(0).waitForward := true.B
csBundle(0).blockBackward := false.B

csBundle(1).uopIdx := 1.U
csBundle(1).fuOpType := Cat(1.U(3.W), LSUOpType.amocas_d)
csBundle(1).lsrc(0) := src1
csBundle(1).lsrc(1) := src2
csBundle(1).rfWen := false.B
csBundle(1).waitForward := false.B
csBundle(1).blockBackward := true.B
}
is(UopSplitType.AMO_CAS_Q) {
csBundle(0).uopIdx := 0.U
csBundle(0).fuOpType := Cat(0.U(3.W), LSUOpType.amocas_q)
csBundle(0).lsrc(0) := src1
csBundle(0).lsrc(1) := dest
csBundle(0).waitForward := true.B
csBundle(0).blockBackward := false.B

csBundle(1).uopIdx := 1.U
csBundle(1).fuOpType := Cat(1.U(3.W), LSUOpType.amocas_q)
csBundle(1).lsrc(0) := src1
csBundle(1).lsrc(1) := src2
csBundle(1).rfWen := false.B
csBundle(1).waitForward := false.B
csBundle(1).blockBackward := false.B

csBundle(2).uopIdx := 2.U
csBundle(2).fuOpType := Cat(2.U(3.W), LSUOpType.amocas_q)
csBundle(2).lsrc(0) := src1
csBundle(2).lsrc(1) := Mux(dest === 0.U, 0.U, dest + 1.U)
csBundle(2).ldest := Mux(dest === 0.U, 0.U, dest + 1.U)
csBundle(2).waitForward := false.B
csBundle(2).blockBackward := false.B

csBundle(3).uopIdx := 3.U
csBundle(3).fuOpType := Cat(3.U(3.W), LSUOpType.amocas_q)
csBundle(3).lsrc(0) := src1
csBundle(3).lsrc(1) := Mux(src2 === 0.U, 0.U, src2 + 1.U)
csBundle(3).rfWen := false.B
csBundle(3).waitForward := false.B
csBundle(3).blockBackward := true.B
}
is(UopSplitType.VSET) {
// In simple decoder, rfWen and vecWen are not set
when(isVsetSimple) {
Expand Down
5 changes: 4 additions & 1 deletion src/main/scala/xiangshan/backend/decode/UopInfoGen.scala
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,13 @@ class UopInfoGen (implicit p: Parameters) extends XSModule {
UopSplitType.VEC_US_FF_LD -> (numOfUopVLoadStoreStrided +& 2.U),
UopSplitType.VEC_S_LDST -> (numOfUopVLoadStoreStrided +& 2.U), // with two move instructions
UopSplitType.VEC_I_LDST -> (numOfUopVLoadStoreIndexed +& 1.U),
UopSplitType.AMO_CAS_W -> 2.U,
UopSplitType.AMO_CAS_D -> 2.U,
UopSplitType.AMO_CAS_Q -> 4.U,
))

// number of writebacks
val numOfWB = numOfUop
val numOfWB = Mux(UopSplitType.isAMOCAS(typeOfSplit), numOfUop >> 1, numOfUop)

// A vector instruction's UopSplitType is never SCA_SIM; when it produces only one uop, it can still be regarded as a simple instruction
isComplex := typeOfSplit =/= UopSplitType.SCA_SIM
Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/xiangshan/backend/issue/Scheduler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,8 @@ class SchedulerMemImp(override val wrapper: Scheduler)(implicit params: SchdBloc
d2IqStaOut.zip(staEnqs).zip(stdEnqs).foreach{ case((dp, staIQ), stdIQ) =>
val isAllReady = staIQ.ready && stdIQ.ready
dp.ready := isAllReady
staIQ.valid := dp.valid && isAllReady
val isDropAmocasSta = dp.bits.isAMOCAS && dp.bits.uopIdx(0) === 1.U
staIQ.valid := dp.valid && isAllReady && !isDropAmocasSta
stdIQ.valid := dp.valid && isAllReady && FuType.FuTypeOrR(dp.bits.fuType, FuType.stu, FuType.mou)
}

Expand Down
4 changes: 3 additions & 1 deletion src/main/scala/xiangshan/backend/rename/Rename.scala
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,9 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
uops(i).wfflags := (compressMasksVec(i) & Cat(io.in.map(_.bits.wfflags).reverse)).orR
uops(i).dirtyFs := (compressMasksVec(i) & Cat(io.in.map(_.bits.fpWen).reverse)).orR
// vector instructions' uopSplitType cannot be UopSplitType.SCA_SIM
uops(i).dirtyVs := (compressMasksVec(i) & Cat(io.in.map(_.bits.uopSplitType =/= UopSplitType.SCA_SIM).reverse)).orR
uops(i).dirtyVs := (compressMasksVec(i) & Cat(io.in.map{ input =>
input.bits.uopSplitType =/= UopSplitType.SCA_SIM && !UopSplitType.isAMOCAS(input.bits.uopSplitType)
}.reverse)).orR
// psrc0,psrc1,psrc2 don't require v0ReadPorts because their srcType can distinguish whether they are V0 or not
uops(i).psrc(0) := Mux1H(uops(i).srcType(0)(2, 0), Seq(io.intReadPorts(i)(0), io.fpReadPorts(i)(0), io.vecReadPorts(i)(0)))
uops(i).psrc(1) := Mux1H(uops(i).srcType(1)(2, 0), Seq(io.intReadPorts(i)(1), io.fpReadPorts(i)(1), io.vecReadPorts(i)(1)))
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/xiangshan/backend/rob/Rob.scala
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP
val enqWBNumVec = VecInit(io.enq.req.map(req => req.bits.numWB))

private val enqWriteStdVec: Vec[Bool] = VecInit(io.enq.req.map {
req => FuType.isAMO(req.bits.fuType) || FuType.isStore(req.bits.fuType)
req => FuType.isStore(req.bits.fuType)
})
val fflags_wb = fflagsWBs
val vxsat_wb = vxsatWBs
Expand Down
7 changes: 6 additions & 1 deletion src/main/scala/xiangshan/cache/CacheConstants.scala
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,15 @@ trait MemoryOpConstants {
def M_CLEAN = "b10011".U // write back dirty data and retain R/W permissions
def M_SFENCE = "b10100".U // flush TLB
def M_WOK = "b10111".U // check write permissions but don't perform a write
def M_XA_CASQ = "b11000".U // AMOCAS.Q
def M_XA_CASW = "b11010".U // AMOCAS.W
def M_XA_CASD = "b11011".U // AMOCAS.D

def isAMOLogical(cmd: UInt) = cmd === M_XA_SWAP || cmd === M_XA_XOR || cmd === M_XA_OR || cmd === M_XA_AND
def isAMOArithmetic(cmd: UInt) = cmd === M_XA_ADD || cmd === M_XA_MIN || cmd === M_XA_MAX || cmd === M_XA_MINU || cmd === M_XA_MAXU
def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd)
// True when cmd is any of the AMOCAS command encodings (M_XA_CASW, M_XA_CASD or M_XA_CASQ).
def isAMOCAS(cmd: UInt) = cmd === M_XA_CASW || cmd === M_XA_CASD || cmd === M_XA_CASQ
// True only for the quad-word compare-and-swap command (M_XA_CASQ, i.e. AMOCAS.Q).
def isAMOCASQ(cmd: UInt) = cmd === M_XA_CASQ
def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd) || isAMOCAS(cmd)
def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW
def isRead(cmd: UInt) = cmd === M_XRD || cmd === M_XLR || cmd === M_XSC || isAMO(cmd)
def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_PWR || cmd === M_XSC || isAMO(cmd)
Expand Down
2 changes: 2 additions & 0 deletions src/main/scala/xiangshan/cache/L1Cache.scala
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ trait HasL1CacheParameters extends HasXSParameter
def wordBits = DataBits
def wordBytes = wordBits / 8
def wordOffBits = log2Up(wordBytes)
// Number of address bits needed to index a byte inside a quad word (log2Up of QuadWordBytes).
def quadWordOffBits = log2Up(QuadWordBytes)
// the number of words in a block
def blockWords = blockBytes / wordBytes
def refillWords = refillBytes / wordBytes
Expand All @@ -89,6 +90,7 @@ trait HasL1CacheParameters extends HasXSParameter
def get_beat(addr: UInt) = addr(blockOffBits - 1, beatOffBits)
def get_row(addr: UInt) = addr(blockOffBits - 1, rowOffBits)
def get_word(addr: UInt) = addr(blockOffBits - 1, wordOffBits)
// Extracts addr bits [blockOffBits - 1 : quadWordOffBits], mirroring get_word but at quad-word granularity.
// NOTE(review): presumably this is the quad-word index within a cache block — confirm blockOffBits' meaning.
def get_quad_word(addr: UInt) = addr(blockOffBits - 1, quadWordOffBits)

def beatRows = beatBits/rowBits
def rowWords = rowBits/wordBits
Expand Down
4 changes: 1 addition & 3 deletions src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ class UncacheWordIO(implicit p: Parameters) extends DCacheBundle
class MainPipeResp(implicit p: Parameters) extends DCacheBundle {
//distinguish amo
val source = UInt(sourceTypeWidth.W)
val data = UInt(DataBits.W)
val data = UInt(QuadWordBits.W)
val miss = Bool()
val miss_id = UInt(log2Up(cfg.nMissEntries).W)
val replay = Bool()
Expand Down Expand Up @@ -1362,8 +1362,6 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
io.lsu.atomics.resp.valid := RegNext(atomic_resp_valid)
io.lsu.atomics.resp.bits := RegEnable(mainPipe.io.atomic_resp.bits, atomic_resp_valid)
io.lsu.atomics.block_lr := mainPipe.io.block_lr
// atomicsReplayUnit.io.pipe_resp := RegNext(mainPipe.io.atomic_resp)
// atomicsReplayUnit.io.block_lr <> mainPipe.io.block_lr

// Request
val missReqArb = Module(new TreeArbiter(new MissReq, MissReqPortCount))
Expand Down
42 changes: 0 additions & 42 deletions src/main/scala/xiangshan/cache/dcache/mainpipe/AMOALU.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,48 +22,6 @@ package xiangshan.cache
import chisel3._
import chisel3.util._

/**
  * Helper that derives store-side signals (size, misalignment, mask and
  * replicated data) from an access type, an address and the raw data.
  *
  * @param typ     access type; only its low bits are used, as the size field
  * @param addr    access address; its low log2Up(maxSize) bits steer the mask
  * @param dat     raw store data
  * @param maxSize largest supported access (presumably in bytes, since the data
  *                lanes are sliced as `8 << i` bits — confirm with callers)
  */
class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) {
  // log2Up(maxSize) is used by every member below, so compute it once.
  private val logMaxSize = log2Up(maxSize)

  // Size field: just enough low bits of `typ` to encode 0..logMaxSize.
  val size = typ(log2Up(logMaxSize + 1) - 1, 0)

  /** High when any address bit below the access size is set. */
  def misaligned = {
    val belowSize = ((1.U << size) - 1.U)(logMaxSize - 1, 0)
    (addr & belowSize).orR
  }

  /**
    * Builds the mask by doubling its width once per address bit: at each step
    * the previous mask is placed in the half selected by addr(step), and the
    * upper half is additionally filled with ones when the access size covers
    * that step. (Presumably one mask bit per byte — confirm with users.)
    */
  def mask =
    (0 until logMaxSize).foldLeft(1.U) { (partial, step) =>
      val keptHigh = Mux(addr(step), partial, 0.U)
      val filledHigh = Mux(size >= (step + 1).U, ((BigInt(1) << (1 << step)) - 1).U, 0.U)
      val keptLow = Mux(addr(step), 0.U, partial)
      Cat(keptHigh | filledHigh, keptLow)
    }

  /**
    * Starting from size index `i`, picks the replication that matches `size`:
    * the low (8 << i) bits of `dat` are repeated to fill the full width when
    * size === i, otherwise the next larger size is tried; once `i` reaches
    * log2Up(maxSize) the data is passed through unchanged.
    */
  protected def genData(i: Int): UInt =
    if (i < logMaxSize) {
      val copies = 1 << (logMaxSize - i)
      Mux(size === i.U, Fill(copies, dat((8 << i) - 1, 0)), genData(i + 1))
    } else {
      dat
    }

  // Replication considered for every size, starting at index 0 (8-bit lanes).
  def data = genData(0)
  // Replication considered only from size index 2 (32-bit lanes) upwards.
  def wordData = genData(2)
}

// Load-side data formatter: narrows `dat` one power-of-two step at a time, using the
// address bits to choose which half to keep, and extends the kept part back to the
// full 8*maxSize-bit width according to the access size.
//
// typ     - access type; its size field is decoded the same way StoreGen does
// signed  - when high, the extension bits replicate the top bit of the selected data
// addr    - access address; bit i selects the upper or lower half at step i
// dat     - raw load data (indexed up to bit 8*maxSize-1 below)
// zero    - when high, forces the selected data to zero on the final (i == 0) step
// maxSize - largest supported access (presumably in bytes — confirm with callers)
class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) {
// Reuse StoreGen only for its decoding of the size field out of `typ`.
private val size = new StoreGen(typ, addr, dat, maxSize).size

// Narrows the data from the widest step down to step `logMinSize` (inclusive).
private def genData(logMinSize: Int): UInt = {
var res = dat
for (i <- log2Up(maxSize)-1 to logMinSize by -1) {
// Width in bits of the half that is kept at this step.
val pos = 8 << i
// addr(i) picks the upper or the lower `pos` bits of the current result.
val shifted = Mux(addr(i), res(2*pos-1,pos), res(pos-1,0))
// Zeroing can only take effect on the i == 0 step.
val doZero = (i == 0).B && zero
val zeroed = Mux(doZero, 0.U, shifted)
// If this step matches the access size (or zeroing is active), the upper bits become
// copies of `signed && top bit of the selected data`; otherwise the upper bits of the
// previous result are kept. The selected data always forms the low `pos` bits.
res = Cat(Mux(size === i.U || doZero, Fill(8*maxSize-pos, signed && zeroed(pos-1)), res(8*maxSize-1,pos)), zeroed)
}
res
}

// Narrowing stops at step 2, i.e. at 32-bit (8 << 2) granularity.
def wordData = genData(2)
// Narrowing goes all the way down to step 0, i.e. 8-bit granularity.
def data = genData(0)
}

class AMOALU(operandBits: Int) extends Module
with MemoryOpConstants {
val minXLen = 32
Expand Down
Loading