Bluespec technical deep dive

Download Report

Transcript Bluespec technical deep dive

Realistic Memories and
Caches
Teacher: Yoav Etsion
Taken (with permission) from
Arvind (with Asif Khan), Massachusetts Institute of Technology
Derek Chiou, The University of Texas at Austin
1
Five-Stage SMIPS
Register File
Epoch
PC
Next
Addr
Pred
Inst
Memory
ir
Decode
itr
Execute
scoreboard
er
cr
Data
Memory
L8-2
Five-Stage SMIPS
state declarations
module mkProc(Proc);
// Processor state. Each declaration is paired with its constructor
// (the slide lists types/names and constructors in separate columns).
EHR#(2, Addr)   pc     <- mkEHRU;         // port 0 written by doExecute, port 1 by doFetch1
RFile           rf     <- mkBypassRFile;
Scoreboard#(4)  sb     <- mkScoreboard;
EHR#(2, Bool)   epoch  <- mkEHR(False);   // port 0 toggled by doExecute, port 1 read by doFetch1
NextAddrPred    bpred  <- mkBTB;
CombCache       iCache <- mkICache;
CombCache       dCache <- mkDCache;

// Inter-stage queues. B = bypass FIFO, P = pipeline FIFO.
BFIFO#(TypeFetch2Fetch)     fr     <- mkBypassFIFO;  // doFetch1  -> doFetch2
PFIFO#(TypeFetch2Decode)    ir     <- mkPipeFIFO;    // doFetch2  -> doDecode
PFIFO#(TypeDecode2Execute)  itr    <- mkPipeFIFO;    // doDecode  -> doExecute
PFIFO#(TypeExecute2Memory)  er     <- mkPipeFIFO;    // doExecute -> doMemory1
BFIFO#(TypeMemory2Memory)   mr     <- mkBypassFIFO;  // doMemory1 -> doMemory2
PFIFO#(TypeMemory2Commit)   cr     <- mkPipeFIFO;    // doMemory2 -> doCommit
BFIFO#(TypeNextPC)          nextPC <- mkBypassFIFO;  // not referenced by the rules shown
L13-3
Processor Rule ordering
doCommit <
(doMemory1 < doMemory2) <
doExecute <
doDecode <
(doFetch1 < doFetch2)
 cr, er, itr, ir: first < deq < enq
 mr, fr: enq < first < deq
L13-4
Method ordering for modules
required by processor rules
Cache methods

req < (resp < respDeq)
Register File methods

wr < {rd1, rd2}
Scoreboard methods

remove < (search < insert)
NextAddrPred methods

prediction < update
epoch methods

r0 < w0 < r1 < w1
L13-5
Five-Stage SMIPS
Fetch rules
// First fetch stage: issues the instruction-cache request for the current PC,
// records the PC / predicted PC / epoch in `fr`, and advances the PC to the
// predicted next address.
// Fix: the struct name was misspelled `TypeFecth2Fetch`; `fr` is declared as
// BFIFO#(TypeFetch2Fetch).
// NOTE(review): the request struct is spelled `TypeMemReq` here but `MemReq`
// in doMemory1 and in interface Cache - confirm which name is intended.
rule doFetch1 (fr.notFull);
  iCache.req(TypeMemReq{op:Ld, addr:pc.r[1], data:?});
  let ppc = bpred.prediction(pc.r[1]);
  fr.enq(TypeFetch2Fetch{pc:pc.r[1], ppc:ppc,
                         epoch:epoch.r[1]});
  pc.w[1](ppc);
endrule
// Second fetch stage: pairs the head of `fr` with the instruction-cache
// response and passes the bundle on to decode through `ir`.
rule doFetch2 (fr.notEmpty && ir.notFull);
  let f2f     = fr.first;
  let fetched = iCache.resp;
  iCache.respDeq;
  fr.deq;
  ir.enq(TypeFetch2Decode{pc:f2f.pc, ppc:f2f.ppc,
                          epoch:f2f.epoch, inst:fetched});
endrule
L13-6
Five-Stage SMIPS
Decode rule
// Decode stage: decodes the instruction at the head of `ir`, checks the
// scoreboard for pending writers of its sources, and - if there is no
// hazard - reads the register file, reserves the destination in the
// scoreboard and forwards the instruction to execute. On a stall nothing is
// dequeued, so the same instruction is retried.
// Fix: `fromMaybe` takes the default value as its first argument; the
// original one-argument call does not type-check. `?` (don't care) is used
// as the default for an Invalid source.
rule doDecode (ir.notEmpty && itr.notFull);
  let irpc    = ir.first.pc;
  let irppc   = ir.first.ppc;
  let irepoch = ir.first.epoch;
  let inst    = ir.first.inst;
  let dInst   = decode(inst);
  let stall   = sb.search(dInst.src1, dInst.src2);
  if (!stall) begin
    let rVal1 = rf.rd1(fromMaybe(?, dInst.src1));
    let rVal2 = rf.rd2(fromMaybe(?, dInst.src2));
    itr.enq(TypeDecode2Execute{pc:irpc, ppc:irppc,
                               epoch:irepoch, dInst:dInst,
                               rVal1:rVal1, rVal2:rVal2});
    sb.insert(dInst.rDst);
    ir.deq;
  end
endrule
L13-7
Five-Stage SMIPS
Execute rule
// Execute stage. An instruction whose epoch matches the current one is
// executed; on a misprediction the PC is redirected, the epoch is flipped and
// the predictor is trained. An instruction from a stale epoch is replaced by
// a Nop so that later stages still see one entry per decoded instruction.
rule doExecute (itr.notEmpty && er.notFull);
  let d2e = itr.first;
  itr.deq;
  if (d2e.epoch != epoch.r[0]) begin
    // Wrong-path instruction: squash it.
    ExecInst eInst = ?;
    eInst.iType = Nop;
    er.enq(TypeExecute2Memory{eInst:eInst, memData:?});
  end
  else begin
    let eInst = execute(d2e.dInst, d2e.rVal1, d2e.rVal2, d2e.pc, d2e.ppc);
    er.enq(TypeExecute2Memory{eInst:eInst, memData:?});
    if (eInst.misprediction) begin
      let npc = eInst.brTaken ? eInst.addr : d2e.pc+4;
      pc.w[0](npc);
      epoch.w[0](!epoch.r[0]);
      bpred.update(d2e.pc, npc);
    end
  end
endrule
L13-8
Five-Stage SMIPS
Data Memory rules
// First memory stage: sends a data-cache request for memory-type
// instructions and forwards every instruction to doMemory2 through `mr`.
rule doMemory1 (er.notEmpty && mr.notFull);
  let e = er.first.eInst;
  er.deq;
  mr.enq(TypeMemory2Memory{eInst:e, memData:?});
  if (memType(e.iType)) begin
    dCache.req(MemReq{op:e.iType==Ld ? Ld : St,
                      addr:e.addr, data:e.data});
  end
endrule
// Second memory stage: for loads, collects the data-cache response; every
// instruction is then forwarded to commit through `cr`.
// Fix: `md` was declared with `let` inside the `if` block and then used
// outside it, where it is out of scope. It is now declared before the `if`
// with a don't-care default for non-load instructions.
// NOTE(review): MemResp is the return type of `resp` in interface Cache;
// confirm that CombCache uses the same type.
rule doMemory2 (mr.notEmpty && cr.notFull);
  let eInst = mr.first.eInst;
  MemResp md = ?;
  if (eInst.iType==Ld) begin
    md = dCache.resp;
    dCache.respDeq;
  end
  cr.enq(TypeMemory2Commit{eInst:eInst, memData:md});
  mr.deq;
endrule
L13-9
Five-Stage SMIPS
Commit rule
// Commit stage: hands the oldest instruction and its memory data to
// regUpdate together with the register file, and removes one scoreboard
// entry.
rule doCommit (cr.notEmpty);
  let done = cr.first;
  cr.deq;
  regUpdate(done.eInst, done.memData, rf);
  sb.remove;
endrule
endmodule
L13-10
Blocking Cache Interface
req
missReq
Processor
cache
resp
respDeq
mReqQ
hitQ
mRespQ
mReq
DRAM
mResp
// Cache interface. The first three methods face the processor and the last
// two face memory (DRAM in the slide diagram).
interface Cache;
// Submit a memory request to the cache.
method Action req(MemReq r);
// Value of the oldest available response (does not consume it).
method MemResp resp;
// Consume the response returned by `resp`.
method Action respDeq;
// Next request the cache wants to send to memory.
method ActionValue#(MemReq) mReq;
// Deliver a memory response to the cache.
method Action mResp(MemResp r);
endinterface
L13-11
Direct mapped caches
L13-12
Blocking Cache
typedefs
// Blocking-cache type definitions (each typedef keyword paired with its body;
// `tyepdef` typo fixed).
typedef 32 AddrSz;
typedef 256 Rows;
typedef Bit#(AddrSz) Addr;
typedef Bit#(TLog#(Rows)) Index;
// Tag = address bits left after removing the index bits and the 2 low bits.
typedef Bit#(TSub#(AddrSz, TAdd#(TLog#(Rows), 2))) Tag;
typedef 32 DataSz;
typedef Bit#(DataSz) Data;
typedef enum {Rdy, WrBack, FillReq, FillResp, FillHit}
  CacheStatus deriving(Bits, Eq);
L13-13
L12-13
Blocking Cache
code structure
// Outline of the blocking cache module; "…" marks bodies shown on later
// slides. Each method is paired with its own endmethod, the instantiation
// arrow is written `<-`, and the memory-side methods use the names declared
// in interface Cache (mReq / mResp).
module mkCache(Cache);
  // --- state declarations
  Vector#(Rows, Reg#(Bool)) vArray <- replicateM(mkReg(False));
  …
  rule doMiss … endrule;
  method Action req(MemReq r) … endmethod;
  method MemResp resp … endmethod;
  method Action respDeq … endmethod;
  method ActionValue#(MemReq) mReq … endmethod;
  method Action mResp(MemResp r) … endmethod;
endmodule
L13-14
Blocking Cache
state declarations
// Blocking-cache state.
// Fix: the instantiation arrow must be `<-`; `<replicateM` is not valid.
Vector#(Rows, Reg#(Bool)) vArray    <- replicateM(mkReg(False)); // valid bits, all False after reset
Vector#(Rows, Reg#(Tag))  tagArray  <- replicateM(mkRegU);
Vector#(Rows, Reg#(Data)) dataArray <- replicateM(mkRegU);
BFIFO#(MemReq)            hitQ      <- mkBypassFIFO;             // requests whose data is in the cache
Reg#(MemReq)              missReq   <- mkRegU;                   // the request currently being serviced as a miss
Reg#(CacheStatus)         status    <- mkReg(Rdy);
FIFOF#(MemReq)            mReqQ     <- mkFIFOF;                  // requests to memory
FIFOF#(MemResp)           mRespQ    <- mkFIFOF;                  // responses from memory
L13-15
Blocking Cache
memory-side methods
// Memory side: dequeues and returns the next request destined for memory.
// Fix: named `mReq` to match the method declared in interface Cache
// (the original text used `mReqDeq`).
method ActionValue#(MemReq) mReq;
  mReqQ.deq;
  return mReqQ.first;
endmethod
// Memory side: accepts a response from memory and queues it for the miss
// handling rules.
// Fix: named `mResp` to match the method declared in interface Cache
// (the original text used `memResp`).
method Action mResp(MemResp r);
  mRespQ.enq(r);
endmethod
L13-16
Blocking I-Cache
processor-side methods
// I-cache request. Accepted only while the cache is Rdy. A hit (valid line
// with a matching tag) is queued in hitQ; anything else is saved in missReq
// and starts the fill sequence.
method Action req(MemReq r) if (status==Rdy);
  Index line = truncate(r.addr>>2);
  Tag   want = truncateLSB(r.addr);
  Bool  hit  = vArray[line] && tagArray[line]==want;
  if (!hit) begin
    missReq <= r;
    status  <= FillReq;
  end
  else
    hitQ.enq(r);
endmethod
// I-cache response: the data word of the line addressed by the request at
// the head of hitQ.
method MemResp resp;
  Index line = truncate(hitQ.first.addr>>2);
  return dataArray[line];
endmethod
// Consumes the response at the head of hitQ.
// Fix: the method type `Action` was missing (interface Cache declares
// `method Action respDeq`).
method Action respDeq;
  hitQ.deq;
endmethod
L13-17
Blocking I-Cache
Rules to process a cache miss
// Miss handling, step 1: request the missing word from memory, then wait for
// the response.
rule doFillReq (status==FillReq);
  status <= FillResp;
  mReqQ.enq(MemReq{op:Ld, addr:missReq.addr, data:?});
endrule
// Miss handling, step 2: install the memory response in the line selected
// by the missed address and mark the line valid.
rule doFillResp (status==FillResp);
  Index line = truncate(missReq.addr>>2);
  Tag   tag  = truncateLSB(missReq.addr);
  let   fill = mRespQ.first;
  mRespQ.deq;
  dataArray[line] <= fill;
  tagArray[line]  <= tag;
  vArray[line]    <= True;
  status          <= FillHit;
endrule
// Miss handling, step 3: replay the saved request as a hit and return the
// cache to Rdy.
rule doFillHit (status==FillHit);
  status <= Rdy;
  hitQ.enq(missReq);
endrule
L13-18
Blocking D-Cache
processor-side methods
// D-cache request. Accepted only while the cache is Rdy. A hit is queued in
// hitQ. On a miss the request is saved in missReq; if the selected line is
// valid it is written back first (WrBack), otherwise the fill starts
// directly (FillReq).
method Action req(MemReq r) if (status==Rdy);
  Index line    = truncate(r.addr>>2);
  Tag   want    = truncateLSB(r.addr);
  Bool  present = vArray[line];
  Bool  sameTag = tagArray[line]==want;
  if (!(present && sameTag)) begin
    missReq <= r;
    if (present) status <= WrBack;
    else         status <= FillReq;
  end
  else
    hitQ.enq(r);
endmethod
// D-cache response, available only when the head of hitQ is a load: the
// data word of the line addressed by that load.
method MemResp resp if (hitQ.first.op==Ld);
  Index line = truncate(hitQ.first.addr>>2);
  return dataArray[line];
endmethod
// Consumes the load at the head of hitQ (stores are drained by rule doStore).
method Action respDeq if (hitQ.first.op==Ld);
  hitQ.deq();
endmethod
// Store hit: writes the store data into the addressed line and retires the
// store from hitQ.
// Fix: the field is `addr`; the original read `r.add`.
rule doStore (hitQ.first.op==St);
  let r = hitQ.first;
  Index idx = truncate(r.addr>>2);
  dataArray[idx] <= r.data;
  hitQ.deq;
endrule
L13-19
Blocking D-Cache
Rules to process a cache miss
// Miss handling with a valid victim: writes the line currently stored at the
// missed index back to memory, then proceeds to the fill request. The
// victim's address is rebuilt from its stored tag, the index and two zero
// low-order bits.
// Fix: the sized literal used a typographic quote (2’b00); it must be the
// ASCII apostrophe.
rule doWrBack (status==WrBack);
  Index idx = truncate(missReq.addr>>2);
  mReqQ.enq(MemReq{op:St,
                   addr:{tagArray[idx],idx,2'b00},
                   data:dataArray[idx]});
  status <= FillReq;
endrule
// Requests the missed address from memory with a load, whatever the op of
// the saved request, then waits for the response.
rule doFillReq (status==FillReq);
  status <= FillResp;
  mReqQ.enq(MemReq{op:Ld, addr:missReq.addr, data:?});
endrule
Both load miss and store miss
generate a memory load request
L13-20
Blocking D-Cache
Rules to process a cache miss
// Installs the memory response in the line selected by the missed address
// and marks the line valid.
rule doFillResp (status==FillResp);
  Index line = truncate(missReq.addr>>2);
  Tag   tag  = truncateLSB(missReq.addr);
  let   fill = mRespQ.first;
  mRespQ.deq;
  dataArray[line] <= fill;
  tagArray[line]  <= tag;
  vArray[line]    <= True;
  status          <= FillHit;
endrule
// Replays the saved request as a hit and returns the cache to Rdy.
rule doFillHit (status==FillHit);
  status <= Rdy;
  hitQ.enq(missReq);
endrule
same as I-Cache
L13-21
Hit and miss behaviors
Hit

Combinational read/write, i.e. 0-cycle response
Miss


No writeback: memory load latency plus combinational
read/write
Writeback: memory store followed by memory load
latency plus combinational read/write
L13-22
Non-blocking cache, V1
req
proc
req
Processor
resp
respDeq
cbuf
mReqQ
cache
mRespQ
mReq
DRAM
mResp
Completion buffer controls the entries of
requests and ensures that responses are in
order even if loads complete out-of-order
23
Non-blocking cache, V2
req
req
Processor
resp
cbuf
resp
respDeq
req
proc
mReqQ
cache
mRespQ
mReq
mResp
respDeq
FIFO responses
inputs are tagged;
responses can be OOO
Completion buffer controls the entries of
requests and ensures that departures take
place in order even if loads complete out-of-order
24
Completion buffer: Interface
getToken
getResult
cbuf
deqResult
put (result & token)
// Completion buffer holding results of type t.
// Required method ordering within a cycle (stated on the slide):
//   getToken < put < getResult < deqResult
interface CBuffer#(type t);
// Reserve a slot and return the token that identifies it.
method ActionValue#(Token) getToken;
// Store result d in the slot identified by tok.
method Action put(Token tok, t d);
// Value of the next result to leave the buffer (does not consume it).
method t getResult;
// Consume the result returned by getResult.
method Action deqResult;
endinterface
Concurrency requirement
getToken < put < getResult < deqResult
25
Non-blocking FIFO Cache
// Wraps a tagged non-blocking cache with a completion buffer: each request
// gets a token from the buffer, the cache's responses are filed under their
// tag, and the processor reads responses from the buffer.
// Fixes:
//  - removed diagram labels (req / cbuf / resp / respDeq) that had been
//    interleaved with the code;
//  - added the missing `;` after `return cBuf.getResult`;
//  - the tagged response field is `tag` (see TaggedMemReq{req, tag} below and
//    TaggedMemResp{tag, resp} in the non-blocking cache), not `tok`;
//  - CBuffer is parameterised by its result type, which is MemResp here.
// NOTE(review): interface Cache also declares mReq / mResp; they are not
// shown in this module - confirm how the memory side is exposed.
module mkNBFifoCache(Cache);
  CBuffer#(MemResp) cBuf    <- mkCompletionBuffer;
  NBCache           nbCache <- mkNBtaggedCache;

  // File each cache response in the completion buffer under its token.
  rule nbCacheResponse;
    cBuf.put(nbCache.resp.tag, nbCache.resp.resp);
    nbCache.respDeq;
  endrule

  method Action req(MemReq x);
    let tok <- cBuf.getToken;
    nbCache.req(TaggedMemReq{req:x, tag:tok});
  endmethod

  method MemResp resp;
    return cBuf.getResult;
  endmethod

  method Action respDeq;
    cBuf.deqResult;
  endmethod
endmodule
L24-26
Completion buffer:
Implementation
A circular buffer with two pointers
iidx and ridx, and a counter cnt
iidx
ridx
Elements are of Maybe type
cnt
I
I
V
I
V
I
buf
// Completion buffer: a circular buffer of Maybe entries with an insert
// pointer (iidx), a remove pointer (ridx) and an occupancy counter (cnt).
// Fixes:
//  - EHR takes a port count as its first parameter (as in EHR#(2, Addr)):
//    the methods use ports 0..2 of `cb` entries and ports 0..1 of `cnt`;
//  - the placeholder line for the methods is now a comment.
// NOTE(review): the interface is declared as CBuffer#(type t) elsewhere in
// the slides, while this header names CompletionBuffer#(size); `t` and
// `size` need to be bound by the real header/provisos - confirm.
module mkCompletionBuffer(CompletionBuffer#(size));
  Vector#(size, EHR#(3, Maybe#(t))) cb
      <- replicateM(mkEHR(Invalid));
  Reg#(Bit#(TAdd#(TLog#(size),1)))    iidx <- mkReg(0);
  Reg#(Bit#(TAdd#(TLog#(size),1)))    ridx <- mkReg(0);
  EHR#(2, Bit#(TAdd#(TLog#(size),1))) cnt  <- mkEHR(0);
  Integer vsize = valueOf(size);
  Bit#(TAdd#(TLog#(size),1)) sz = fromInteger(vsize);
  // rules and methods...
endmodule
27
Completion Buffer cont
// Reserves the slot at iidx: clears it, advances iidx (wrapping at sz),
// bumps the occupancy count and returns the slot index as the token.
// Blocked while the buffer is full.
// Fixes: return type is Token (as in interface CBuffer), not t; `!==`
// replaced by `!=`; EHR write ports are methods, so they are called rather
// than assigned to.
method ActionValue#(Token) getToken() if (cnt.r[0] != sz);
  cb[iidx].w[0](Invalid);
  iidx <= iidx==sz-1 ? 0 : iidx + 1;
  cnt.w[0](cnt.r[0] + 1);
  return iidx;
endmethod
// Stores `data` as the valid result of slot `idx`.
// Fix: the EHR write port is a method and must be called; the original
// used `=` assignment.
method Action put(Token idx, t data);
  cb[idx].w[1](Valid (data));
endmethod
// Returns the result in the slot at ridx. Ready only when the buffer is
// non-empty and that slot holds a Valid value.
// Fixes: balanced the guard's parentheses; `!==` replaced by `!=`; declared
// as a value method returning t, as in interface CBuffer (the body performs
// no action).
method t getResult()
    if (cnt.r[1] != 0 &&&
        cb[ridx].r[2] matches tagged Valid .x);
  return x;
endmethod
// Frees the slot at ridx: invalidates it, advances ridx (wrapping at sz)
// and decrements the occupancy count.
// Fix: the subtraction used an en-dash (–) instead of ASCII `-`.
method Action deqResult if (cnt.r[1] != 0);
  cb[ridx].w[2](Invalid);
  ridx <= ridx==sz-1 ? 0 : ridx + 1;
  cnt.w[1](cnt.r[1] - 1);
endmethod
getToken < put < getResult < deqResult
L24-28
Non-blocking Cache
req
resp
respDeq
hitQ
St Tag Data
St
Buff
wbQ
Ld
Buff
mReqQ
Wait
Buff
mRespQ
29
Store Buffer
module mkStBuff(StBuff#(sz));
// Store-buffer state: sz optional requests, an insert pointer, a remove
// pointer and an occupancy counter. The pointers and the counter are one bit
// wider than log2(sz).
Vector#(sz, EHR2#(Maybe#(MemReq))) buff <- replicateM(mkEHR2(Invalid));
Reg#(Bit#(TAdd#(TLog#(sz),1)))     iidx <- mkReg(0);
EHR2#(Bit#(TAdd#(TLog#(sz),1)))    ridx <- mkEHR2(0);
EHR2#(Bit#(TAdd#(TLog#(sz),1)))    cnt  <- mkEHR2(0);
// Removes and returns the request at the remove pointer: invalidates the
// slot, advances ridx (wrapping at sz) and decrements the count.
// Blocked while the buffer is empty.
method ActionValue#(MemReq) remove if (cnt.r[0] != 0);
  let head = ridx.r[0];
  let last = fromInteger(valueOf(sz)) - 1;
  buff[head].w[0](Invalid);
  ridx.w[0](head == last ? 0 : head + 1);
  cnt.w[0](cnt.r[0] - 1);
  return unJust(buff[head].r[0]);
endmethod
30
Store Buffer
cont
// Looks for a buffered store to address `a`. All sz slots are visited
// starting at the remove pointer, and a later match overwrites an earlier
// one, so the data of the last matching slot in scan order is returned.
// Returns Invalid when no valid slot matches.
// Fix: the scan index now wraps at sz. The original incremented it without
// wrapping, so a scan that started anywhere but slot 0 indexed past the end
// of the buffer and skipped the slots before ridx.
method Maybe#(Data) search(Addr a);
  Maybe#(Data) m = Invalid;
  let idx = ridx.r[1];
  for (Integer i = 0; i < valueOf(sz); i = i + 1) begin
    if (isValid(buff[idx].r[1]) &&
        unJust(buff[idx].r[1]).addr == a)
      m = Valid (unJust(buff[idx].r[1]).data);
    idx = (idx == fromInteger(valueOf(sz)) - 1) ? 0 : idx + 1;
  end
  return m;
endmethod
// Appends request r at the insert pointer, advances iidx (wrapping at sz)
// and increments the count. Blocked while the buffer is full.
method Action insert(MemReq r)
    if (cnt.r[1] != fromInteger(valueOf(sz)));
  let last = fromInteger(valueOf(sz)) - 1;
  buff[iidx].w[1](Valid (r));
  cnt.w[1](cnt.r[1] + 1);
  iidx <= (iidx == last) ? 0 : iidx + 1;
endmethod
endmodule
31
Load Buffer
module mkLdBuff(LdBuff#(sz));
// Load-buffer state: a status and a tagged request per slot, plus an
// occupancy counter. Statuses start Invalid; request contents are
// uninitialised.
Vector#(sz, EHR3#(LdStatus))     st   <- replicateM(mkEHR3(Invalid));
Vector#(sz, EHR3#(TaggedMemReq)) buff <- replicateM(mkEHR3U);
EHR3#(Bit#(TAdd#(TLog#(sz),1)))  cnt  <- mkEHR3(0);
// Removes the buffered load for address `a` and returns its tagged request:
// the slot's status becomes Invalid and the count is decremented.
// Blocked while the buffer is empty. If several slots match, the
// highest-numbered one is chosen; if none matches, slot 0 is used.
// Fix: a slot is only considered when its status is not Invalid. `buff` is
// created with mkEHR3U and keeps its old contents after removal, so the
// original address-only comparison could select a free slot holding a stale
// or uninitialised address. `update` and `search` already check the status.
method ActionValue#(TaggedMemReq) remove(Addr a)
    if (cnt.r[0] != 0);
  Bit#(TLog#(sz)) idx = 0;
  for (Integer i = 0; i < valueOf(sz); i = i + 1)
    if (st[i].r[0] != Invalid &&
        buff[i].r[0].req.addr == a)
      idx = fromInteger(i);
  st[idx].w[0](Invalid);
  cnt.w[0](cnt.r[0] - 1);
  return buff[idx].r[0];
endmethod
32
Load Buffer
cont
// Sets the status of every occupied slot whose request address is `a`.
// Fixes:
//  - the status argument was named `st`, which hides the module's status
//    vector `st` inside the method, so `st[i]` no longer referred to the
//    vector; the argument is now `newSt` (arguments are positional, so
//    callers are unaffected);
//  - the loop bound used `valueOf(size)`, but the module's size parameter is
//    `sz`.
method Action update(Addr a, LdStatus newSt);
  for (Integer i = 0; i < valueOf(sz); i = i + 1)
    if (st[i].r[2] != Invalid &&
        buff[i].r[2].req.addr == a)
      st[i].w[2](newSt);
endmethod
// Finds a slot that still needs work, i.e. one whose status is WrBack or
// FillReq, and reports its status and address. `valid` is False when there
// is none; when several qualify, the highest-numbered slot is reported.
// Fix: the loop bound used `valueOf(size)`, but the module's size parameter
// is `sz`.
method TypeUpdate usearch;
  TypeUpdate u = TypeUpdate{valid:False, st:?,
                            addr:?};
  for (Integer i = 0; i < valueOf(sz); i = i + 1)
    if (st[i].r[2]==WrBack || st[i].r[2]==FillReq)
      u = TypeUpdate{valid:True, st:st[i].r[2],
                     addr:buff[i].r[2].req.addr};
  return u;
endmethod
33
Load Buffer
cont
// True when some occupied slot (status not Invalid) holds a request for
// address `a`.
method Bool search(Addr a);
  Bool found = False;
  for (Integer k = 0; k < valueOf(sz); k = k + 1) begin
    Bool occupied = st[k].r[1] != Invalid;
    if (occupied && buff[k].r[1].req.addr == a)
      found = True;
  end
  return found;
endmethod
// Stores request r with the given status in a free slot (the
// highest-numbered slot whose status is Invalid) and increments the count.
// Blocked while the buffer is full.
// Fix: the status argument was named `st`, which hides the module's status
// vector `st`, so `st[i].r[1]` and `st[idx].w[1](st)` no longer referred to
// the vector. The argument is now `newSt` (arguments are positional, so
// callers are unaffected).
method Action insert(TaggedMemReq r, LdStatus newSt)
    if (cnt.r[1] != fromInteger(valueOf(sz)));
  Bit#(TLog#(sz)) idx = 0;
  for (Integer i = 0; i < valueOf(sz); i = i + 1)
    if (st[i].r[1] == Invalid)
      idx = fromInteger(i);
  buff[idx].w[1](r);
  st[idx].w[1](newSt);
  cnt.w[1](cnt.r[1] + 1);
endmethod
34
Non-blocking Cache
state declaration
module mkNBCache(NBCache);
// Non-blocking cache state. Each declaration is paired with its constructor
// (the slide lists types/names and constructors in separate columns).
RegFile#(Index, LineStatus) sArray    <- mkRegFileFull;  // per-line status
RegFile#(Index, Tag)        tagArray  <- mkRegFileFull;
RegFile#(Index, Data)       dataArray <- mkRegFileFull;
StBuff#(StBuffSz)           stBuff    <- mkStBuff;       // store misses
LdBuff#(LdBuffSz)           ldBuff    <- mkLdBuff;       // outstanding load misses
PipeFIFO#(TaggedMemReq)     waitBuff  <- mkPipeFIFO;     // requests to an address already in ldBuff

FIFOF#(MemReq)              wbQ       <- mkFIFOF;        // stores / write-backs to memory
FIFOF#(MemReq)              mReqQ     <- mkFIFOF;        // fill requests to memory
FIFOF#(MemResp)             mRespQ    <- mkFIFOF;        // fill responses from memory
EHRBypassReg#(TypeHit)      hitQ      <- mkEHRBypassReg; // request whose data is available
Reg#(Bool)                  procWaitBuff <- mkReg(False); // True while waitBuff is being drained
35
Non-blocking Cache
memory-side methods
// Memory side: dequeues and returns the next entry of wbQ.
method ActionValue#(MemReq) wb;
  let out = wbQ.first;
  wbQ.deq;
  return out;
endmethod
// Memory side: dequeues and returns the next entry of mReqQ.
method ActionValue#(MemReq) mReq;
  let out = mReqQ.first;
  mReqQ.deq;
  return out;
endmethod
// Memory side: queues a response from memory in mRespQ.
method Action mResp(MemResp r);
  mRespQ.enq(r);
endmethod
endmodule
36
Non-blocking Cache
processor-side methods
// Processor request, accepted only while the wait buffer is not being
// drained. Cases are tried in this order:
//   1. cache hit                       -> hitQ (data read from the cache later)
//   2. address already in load buffer  -> waitBuff
//   3. store                           -> store buffer
//   4. load matching a buffered store  -> hitQ with the store's data
//   5. otherwise                       -> load buffer; WrBack if the line is
//                                         Dirty, else FillReq
method Action req(TaggedMemReq r) if (!procWaitBuff);
  let   mreq     = r.req;
  Index idx      = truncate(mreq.addr>>2);
  Tag   tag      = truncateLSB(mreq.addr);
  let   lnSt     = sArray.sub(idx);
  Bool  tagMatch = tagArray.sub(idx)==tag;
  Bool  cacheHit = lnSt!=Invalid && tagMatch;
  let   sbMatch  = stBuff.search(mreq.addr);
  let   lbMatch  = ldBuff.search(mreq.addr);
  if (cacheHit) begin
    hitQ.enq(TypeHit{tag:r.tag, req:mreq, data:Invalid});
  end
  else if (lbMatch) begin
    waitBuff.enq(r);
  end
  else if (mreq.op==St) begin
    stBuff.insert(mreq);
  end
  else if (mreq.op==Ld && isValid(sbMatch)) begin
    hitQ.enq(TypeHit{tag:r.tag, req:mreq, data:sbMatch});
  end
  else begin
    ldBuff.insert(r, lnSt==Dirty ? WrBack : FillReq);
  end
endmethod
37
Non-blocking Cache
processor-side methods
// Tagged response for the load at the head of hitQ. Data carried with the
// hit entry is used when present; otherwise the data array is read.
method TaggedMemResp resp if (hitQ.first.req.op==Ld);
  let   hit = hitQ.first;
  Index idx = truncate(hit.req.addr>>2);
  Data  d   = fromMaybe(dataArray.sub(idx), hit.data);
  return TaggedMemResp{tag:hit.tag, resp:d};
endmethod
// Consumes the load at the head of hitQ (stores are drained by rule
// storeHit).
method Action respDeq if (hitQ.first.req.op==Ld);
  hitQ.deq();
endmethod
38
Non-blocking Cache
store rules
// Store hit: writes the store data into the cache, marks the line Dirty and
// retires the store from hitQ.
rule storeHit (hitQ.first.req.op==St);
  let   st_req = hitQ.first.req;
  Index line   = truncate(st_req.addr>>2);
  hitQ.deq;
  sArray.upd(line, Dirty);
  dataArray.upd(line, st_req.data);
endrule
// Drains the store buffer: moves its oldest store into wbQ.
rule stBuffRemove;
  let oldest <- stBuff.remove;
  wbQ.enq(oldest);
endrule
39
Non-blocking Cache
load rule
// Advances one pending load-buffer entry. An entry in FillReq sends the fill
// request; an entry in any other reported state first writes the victim
// line back (address rebuilt from its tag, the index and two zero bits) and
// then sends the fill request as well. Either way the entry moves to
// FillResp.
rule ldBuffUpdate;
  let u = ldBuff.usearch;
  if (u.valid) begin
    if (u.st != FillReq) begin
      Index idx = truncate(u.addr>>2);
      wbQ.enq(MemReq{op:St,
                     addr:{tagArray.sub(idx),idx,2'b00},
                     data:dataArray.sub(idx)});
    end
    mReqQ.enq(MemReq{op:Ld, addr:u.addr, data:?});
    ldBuff.update(u.addr, FillResp);
  end
endrule
40
Non-blocking Cache
memory response rule
// Handles a fill response: installs the line as Clean, removes the matching
// load from the load buffer and queues its response with the fill data.
// If the wait buffer holds requests to the same address, the response is
// left at the head of mRespQ and procWaitBuff is raised so that
// goThroughWaitBuff can serve them; otherwise the response is dequeued.
// Fix: the rule is now guarded by !procWaitBuff. Without the guard it could
// fire again on the same, still-queued response while the wait buffer was
// being drained, removing a second load-buffer entry and enqueuing a
// duplicate hit.
rule mRespAvailable (!procWaitBuff);
  let data = mRespQ.first.data;
  let addr = mRespQ.first.addr;
  Index idx = truncate(addr>>2);
  Tag tag = truncateLSB(addr);
  sArray.upd(idx, Clean);
  tagArray.upd(idx, tag);
  dataArray.upd(idx, data);
  let r <- ldBuff.remove(addr);
  hitQ.enq(TypeHit{tag:r.tag, req:r.req,
                   data:Valid (data)});
  if (waitBuff.search(addr))
    procWaitBuff <= True;
  else
    mRespQ.deq;
endrule
41
Non-blocking Cache
wait buffer rule
// Drains the wait buffer for the fill response at the head of mRespQ. Each
// firing takes the head of waitBuff: if it targets the filled address it is
// answered with the fill data, otherwise it is put back in waitBuff. When
// waitBuff.search no longer reports the address, draining ends and the
// response is dequeued.
rule goThroughWaitBuff (procWaitBuff);
  let fill = mRespQ.first;
  let head = waitBuff.first;
  waitBuff.deq;
  if (head.req.addr != fill.addr)
    waitBuff.enq(head);
  else
    hitQ.enq(TypeHit{tag:head.tag, req:head.req,
                     data:Valid (fill.data)});
  if (!waitBuff.search(fill.addr)) begin
    mRespQ.deq;
    procWaitBuff <= False;
  end
endrule
42