L17-BranchPrediction-2

Download Report

Transcript L17-BranchPrediction-2

Computer Architecture: A Constructive Approach
Branch Prediction - 2
Arvind
Computer Science & Artificial Intelligence Lab.
Massachusetts Institute of Technology
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-1
Two-Stage pipeline
PC
nextPC
fEpoch
Bypass
FIFO
+4
ir
eEpoch
A robust two-rule solution
Register File
Decode
Execute
Pipeline
FIFO
Data
Memory
Inst
Memory
Either fifo can be a normal (>1 element) fifo
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-2
Decoupled Fetch and Execute
<updated pc>
Fetch
nextPC
Execute
ir
<instructions,
pc, epoch>
Properly decoupled systems permit greater
freedom in independent refinement of blocks
FIFOs must permit concurrent enq and deq
For pipelined behavior ir behavior must be
deq<enq
For proper scheduling nextPC behavior must
be enq<deq (deq < enq would be just wrong)
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-3
Three one-element FIFOs
enq
notFull
deq
enq
notEmpty
notFull
deq
or
Ordinary
FIFO
Ordinary: No concurrent
enq/deq
Pipeline: deq before enq,
combinational path
Bypass: enq before deq,
combinational path
Pipeline and Bypass fifos can
create combinational cycles in
the presence of feedback
April 11, 2012
notEmpty
Pipeline
FIFO
enq
notFull
http://csg.csail.mit.edu/6.S078
deq
notEmpty
or
Bypass
FIFO
L17-4
Multi-element FIFOs
Normal FIFO



Permits concurrent enq and deq when notFull and
notEmpty
Unlike a pipeline FIFO, does not permit enq when
full, even if there is a concurrent deq
Unlike a bypass FIFO, does not permit deq when
empty, even if there is a concurrent enq
Normal FIFO implementations have at least
two elements, but they do not have
combinational paths => make it easier to
reduce critical paths at the expense of area
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-5
A decoupled solution using
epoch
Add fEpoch and eEpoch registers to the
processor state; initialize them to the same
value
The epoch changes whenever Execute
determines that the pc prediction is wrong.
This change is reflected immediately in eEpoch
and eventually in fEpoch via nextPC FIFO
Associate the fEpoch with every instruction
when it is fetched
In the execute stage, reject, i.e., kill, the
instruction if its epoch does not match eEpoch
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-6
Two-stage pipeline
Decoupled
// Two-stage processor with decoupled Fetch and Execute stages (state overview).
// The bodies of the two rules are elided on this slide.
module mkProc(Proc);
   Reg#(Addr)  pc     <- mkRegU;
   RFile       rf     <- mkRFile;
   IMemory     iMem   <- mkIMemory;
   DMemory     dMem   <- mkDMemory;

   // Fetch -> Execute channel carrying {pc, epoch, inst} records.
   PipeReg#(TypeFetch2Decode) ir <- mkPipeReg;

   // Epoch bits kept by Fetch and Execute; both start at the same value.
   Reg#(Bool)  fEpoch <- mkReg(False);
   Reg#(Bool)  eEpoch <- mkReg(False);

   // Execute -> Fetch redirect channel carrying (new pc, new epoch).
   // Fixed: the element type is Bool (capitalised), not bool.
   FIFOF#(Tuple2#(Addr, Bool)) nextPC <- mkBypassFIFOF;

   rule doFetch … endrule
   rule doExecute … endrule
endmodule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-7
Two-stage pipeline
doFetch rule
explicit guard
// Fetch stage: reads the instruction at pc, tags it with the current fetch
// epoch and enqueues it into ir. If Execute has posted a redirect in nextPC,
// the pc and fEpoch are taken from it; otherwise the pc advances by 4.
rule doFetch (ir.notFull);
   let inst = iMem(pc);
   ir.enq(TypeFetch2Decode{pc:pc, epoch:fEpoch, inst:inst});
   if (nextPC.notEmpty) begin
      match {.ipc, .epoch} = nextPC.first;
      pc <= ipc; fEpoch <= epoch; nextPC.deq;
   end
   // simple branch prediction
   else pc <= pc + 4;
endrule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-8
Two-stage pipeline
doExecute rule
// Execute stage: processes the instruction at the head of ir only when its
// epoch matches eEpoch; otherwise the instruction is dropped. On a taken
// branch the epoch is advanced and the branch target is sent to Fetch.
rule doExecute (ir.notEmpty);
   let irpc = ir.first.pc; let inst = ir.first.inst;
   if (ir.first.epoch == eEpoch) begin
      let eInst = decodeExecute(irpc, inst, rf);
      let memData <- dMemAction(eInst, dMem);
      regUpdate(eInst, memData, rf);
      if (eInst.brTaken) begin
         // Fixed: declare nepoch with `let` and derive it from eEpoch
         // (the original referenced an undefined name `epoch`).
         // NOTE(review): `next` is not defined in this excerpt; for a Bool
         // epoch it is presumably negation -- confirm.
         let nepoch = next(eEpoch);
         eEpoch <= nepoch;
         // Fixed: closing parenthesis of enq(...) was missing.
         nextPC.enq(tuple2(eInst.addr, nepoch));
      end
   end
   // The head of ir is consumed whether it was executed or killed.
   ir.deq;
endrule
endmodule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-9
ir
+
PC
Branch
Predictor
eEpoch
nextPC
fEpoch
Two-Stage pipeline with a
Branch Predictor
Register File
Decode
Execute
ppc
Data
Memory
Inst
Memory
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-10
Branch Predictor Interface
// Interface shared by the next-address predictors in these notes
// (mkNeverTaken, mkBTB).
interface NextAddressPredictor;
// Returns the address predicted to follow the instruction at pc.
method Addr prediction(Addr pc);
// Informs the predictor that the instruction at pc was followed by target.
method Action update(Addr pc,
Addr target);
endinterface
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-11
Example
Null Branch Prediction
// Null predictor: always predicts the fall-through address and ignores
// all update information.
module mkNeverTaken(NextAddressPredictor);
   // The prediction is always the next sequential word.
   method Addr prediction(Addr pc) = pc + 4;

   // Nothing is recorded.
   method Action update(Addr pc, Addr target);
      noAction;
   endmethod
endmodule
Replaces PC+4 with …

Already implemented in the pipeline
Right most of the time

April 11, 2012
Why?
http://csg.csail.mit.edu/6.S078
L17-12
Example
Branch Target Prediction (BTB)
// Branch target buffer: a table indexed by the pc (shifted right by 2 and
// truncated to LineIdx) that stores the full pc as a tag alongside the
// target last recorded for it.
module mkBTB(NextAddressPredictor);
   RegFile#(LineIdx, Addr) tagArr    <- mkRegFileFull;
   RegFile#(LineIdx, Addr) targetArr <- mkRegFileFull;

   // Table line selected by an address.
   function LineIdx lineOf(Addr a) = truncate(a >> 2);

   // Returns the stored target when the stored tag equals pc,
   // otherwise pc+4.
   method Addr prediction(Addr pc);
      let line = lineOf(pc);
      Bool hit = (tagArr.sub(line) == pc);
      return hit ? targetArr.sub(line) : (pc + 4);
   endmethod

   // Overwrites the line selected by pc with the new tag and target.
   method Action update(Addr pc, Addr target);
      let line = lineOf(pc);
      targetArr.upd(line, target);
      tagArr.upd(line, pc);
   endmethod
endmodule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-13
Two-stage pipeline + BP
module mkProc(Proc);
Reg#(Addr)
pc <- mkRegU;
RFile
rf <- mkRFile;
IMemory
iMem <- mkIMemory;
DMemory
dMem <- mkDMemory;
PipeReg#(TypeFetch2Decode) ir <- mkPipeReg;
Reg#(Bool)
fEpoch <- mkReg(False);
Reg#(Bool)
eEpoch <- mkReg(False);
FIFOF#(Tuple3#(Addr,Addr,Bool))
nextPC <- mkBypassFIFOF;
NextAddressPredictor bpred <- mkNeverTaken; // Some target predictor
// The definition of TypeFetch2Decode is changed to include predicted pc
// Record enqueued into ir by doFetch:
//   pc    - address the instruction was fetched from
//   ppc   - predicted next pc (from bpred.prediction(pc))
//   epoch - value of fEpoch when the instruction was fetched
//   inst  - the fetched instruction word
typedef struct {
Addr pc; Addr ppc; Bool epoch; Data inst;
} TypeFetch2Decode deriving (Bits, Eq);
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-14
Two-stage pipeline + BP
Fetch rule
// Fetch stage with a branch predictor: the fetched instruction is enqueued
// together with the predicted next pc. A pending redirect in nextPC
// overrides the prediction and is also used to train the predictor.
rule doFetch (ir.notFull);
   let predicted = bpred.prediction(pc);
   let f2d = TypeFetch2Decode{pc: pc, ppc: predicted,
                              epoch: fEpoch, inst: iMem(pc)};
   ir.enq(f2d);

   if (nextPC.notEmpty) begin
      match {.brPc, .brTarget, .newEpoch} = nextPC.first;
      nextPC.deq;
      bpred.update(brPc, brTarget);
      pc     <= brTarget;
      fEpoch <= newEpoch;
   end
   else begin
      pc <= predicted;
   end
endrule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-15
Two-stage pipeline + BP
Execute rule
// Execute stage with a branch predictor: an instruction whose epoch matches
// eEpoch is executed; on a misprediction the epoch is advanced and
// (pc, correct next pc, new epoch) is sent to Fetch through nextPC.
rule doExecute (ir.notEmpty);
   let irpc = ir.first.pc; let inst = ir.first.inst;
   let irppc = ir.first.ppc;
   if (ir.first.epoch == eEpoch) begin
      // Requires changes in decodeExecute to return missPrediction
      // as opposed to brTaken information.
      let eInst = decodeExecute(irpc, irppc, inst, rf);
      let memData <- dMemAction(eInst, dMem);
      regUpdate(eInst, memData, rf);
      if (eInst.missPrediction) begin
         // Fixed: nepoch must be declared with `let`.
         let nepoch = next(eEpoch); eEpoch <= nepoch;
         // Fixed: a misplaced ')' closed tuple3 after its second element,
         // leaving nepoch outside the tuple.
         nextPC.enq(tuple3(irpc,
                           eInst.brTaken ? eInst.addr : irpc+4,
                           nepoch));
      end
   end
   // The head of ir is consumed whether it was executed or killed.
   ir.deq;
endrule
endmodule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-16
Execute Function
// Executes a decoded instruction.
//   dInst        - decoded instruction
//   rVal1, rVal2 - source operand values
//   pc           - address of the instruction
//   ppc          - next pc predicted at fetch time
// Returns an ExecInst whose missPrediction field is set when ppc differs
// from the actual next pc (the branch address if taken, pc+4 otherwise).
function ExecInst exec(DecodedInst dInst, Data rVal1,
                       Data rVal2, Addr pc, Addr ppc);
   ExecInst einst = ?;
   // Fixed: the statement was missing its terminating ';'.
   let aluVal2 = (dInst.immValid) ? dInst.imm : rVal2;
   let aluRes  = alu(rVal1, aluVal2, dInst.aluFunc);
   let brAddr  = brAddrCal(pc, rVal1, dInst.iType,
                           dInst.imm);
   // Fixed: brTaken is now a named local; the original used the bare name
   // `brTaken` below without ever declaring it.
   let brTaken = aluBr(rVal1, aluVal2, dInst.brComp);

   // NOTE(review): the field is spelled `itype` here but `iType` in
   // DecodedInst -- confirm against the ExecInst definition.
   einst.itype = dInst.iType;
   // Fixed: removed the unbalanced opening parenthesis.
   einst.addr = memType(dInst.iType) ? aluRes : brAddr;
   einst.data = dInst.iType==St ? rVal2 : aluRes;
   einst.brTaken = brTaken;
   einst.missPrediction = brTaken ? (brAddr != ppc)
                                  : ((pc+4) != ppc);
   einst.rDst = dInst.rDst;
   return einst;
endfunction
http://csg.csail.mit.edu/6.S078
L17-17
April 11, 2012
Multiple predictors
For multiple predictors to make sense, we first
need a pipeline with more than two stages
With a slightly different (even a 2-stage)
pipeline we also need to resolve data hazards
simultaneously
Plan


Present a different two stage pipeline with data
hazards
Present a three stage pipeline with
 One branch predictor
 Two branch predictors
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-18
fEpoch
A different 2-Stage pipeline
nextPC
Register File
PC
Execute
Decode
Branch
Predictor
Inst
Memory
April 11, 2012
eEpoch
itr
stall
http://csg.csail.mit.edu/6.S078
Data
Memory
L17-19
TypeDecode2Execute
// Record passed from the fetch/decode stage to Execute through itr.
// The source operands travel as Data values (rVal1, rVal2).
typedef struct {
   Addr pc; Addr ppc; Bool epoch;
   // Fixed: the last member was missing its terminating ';'.
   DecodedInst dInst; Data rVal1; Data rVal2;
} TypeDecode2Execute deriving (Bits, Eq);
value instead of register names
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-20
The stall function
src1, src2 and rDst in DecodedInst are changed
from Rindx to Maybe#(Rindx) to determine the stall condition
// Returns True when itr is not empty, the instruction at its head has a
// valid destination register, and that register equals a valid src1 or a
// valid src2.
function Bool stall(Maybe#(Rindx) src1,
                    Maybe#(Rindx) src2,
                    PipeReg#(TypeDecode2Execute) itr);
   // Fixed: dst must be declared; the original assigned it without `let`.
   let dst = itr.first.dInst.rDst;
   return (itr.notEmpty && isValid(dst)
           && ((validValue(dst)==validValue(src1) &&
                isValid(src1))
               ||
               (validValue(dst)==validValue(src2) &&
                isValid(src2))));
endfunction
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-21
A different 2-Stage pipeline
// State of the "different" two-stage pipeline. Reconstructed from a
// column-garbled extraction: each instance name is paired, in order, with
// its constructor.
module mkProc(Proc);
   Reg#(Addr)  pc   <- mkRegU;
   RFile       rf   <- mkConfigRFile;
   IMemory     iMem <- mkIMemory;
   DMemory     dMem <- mkDMemory;

   // Channel from the fetch/decode stage to Execute.
   PipeReg#(TypeDecode2Execute) itr <- mkConfigPipeReg;

   Reg#(Bool)  fEpoch <- mkReg(False);
   Reg#(Bool)  eEpoch <- mkReg(False);

   // Execute -> Fetch redirect channel: (branch pc, correct next pc, epoch).
   FIFOF#(Tuple3#(Addr,Addr,Bool)) nextPC <- mkBypassFIFOF;
   NextAddressPredictor bpred <- mkNeverTaken;
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-22
A different 2-Stage pipeline
doFetch rule
// Combined fetch/decode stage: fetches and decodes the instruction at pc.
// Unless stall() reports a dependence on the instruction at the head of
// itr, it reads the register file, enqueues the record into itr and
// updates pc from a pending redirect or from the prediction.
rule doFetch (itr.notFull);
   let dInst = decode(iMem(pc));
   Bool mustStall = stall(dInst.src1, dInst.src2, itr);

   if (!mustStall) begin
      let ppc = bpred.prediction(pc);
      let d2e = TypeDecode2Execute{
                   pc: pc, ppc: ppc, epoch: fEpoch, dInst: dInst,
                   rVal1: rf.rd1(validValue(dInst.src1)),
                   rVal2: rf.rd2(validValue(dInst.src2))};
      itr.enq(d2e);

      if (nextPC.notEmpty) begin
         match {.brPc, .brTarget, .newEpoch} = nextPC.first;
         nextPC.deq;
         bpred.update(brPc, brTarget);
         pc     <= brTarget;
         fEpoch <= newEpoch;
      end
      else begin
         pc <= ppc;
      end
   end
endrule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-23
A different 2-Stage pipeline
doExecute rule
// Execute stage of the "different" two-stage pipeline: operands arrive
// already read in itr. An instruction with a stale epoch is dropped; a
// misprediction advances the epoch and sends a redirect to Fetch.
rule doExecute (itr.notEmpty);
   let itrpc  = itr.first.pc;
   let dInst  = itr.first.dInst;
   let itrppc = itr.first.ppc; let rVal1 = itr.first.rVal1;
   let rVal2  = itr.first.rVal2;
   if (itr.first.epoch == eEpoch) begin
      // Fixed: pass the predicted pc; it was read but never used, and
      // missPrediction cannot be computed without it.
      // NOTE(review): assumes `execute` has the same parameter list as the
      // `exec` function shown earlier (..., pc, ppc) -- confirm.
      let eInst = execute(dInst, rVal1, rVal2, itrpc, itrppc);
      let memData <- dMemAction(eInst, dMem);
      regUpdate(eInst, memData, rf);
      if (eInst.missPrediction) begin
         // Fixed: declare nepoch and derive it from eEpoch
         // (the original referenced an undefined name `epoch`).
         let nepoch = next(eEpoch); eEpoch <= nepoch;
         // Fixed: tuple3 was closed after two elements and the comma
         // before nepoch was missing.
         nextPC.enq(tuple3(itrpc,
                           eInst.brTaken ? eInst.addr : itrpc+4,
                           nepoch));
      end
   end
   // The head of itr is consumed whether it was executed or killed.
   itr.deq;
endrule
endmodule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-24
Concurrency analysis
nextPC bypass fifo functionality: enq < deq

Hence doExecute happens before doFetch every cycle
itr pipeline fifo functionality: deq < enq

Hence doExecute happens before doFetch every cycle
itr pipeline fifo functionality: first < deq


Hence doFetch happens before doExecute every cycle to
determine the stall condition
Use config pipeline fifo to remove scheduling constraint
mkRFile functionality: {rd1, rd2} < wr


April 11, 2012
Hence doFetch happens before doExecute every cycle
Use mkConfigRFile to remove scheduling constraint
http://csg.csail.mit.edu/6.S078
L17-25
fEpoch
3-Stage pipeline – 1 predictor
PC
dEpoch
Inst
Memory
eEpoch
Execute
Decode
Branch
Predictor
April 11, 2012
nextPC
nextPC
Register File
itr
ir
stall
http://csg.csail.mit.edu/6.S078
Data
Memory
L17-26
3-Stage pipeline – 1 predictor
module mkProc(Proc);
Reg#(Addr)
pc <- mkRegU;
RFile
rf <- mkConfigRFile;
IMemory
iMem <- mkIMemory;
DMemory
dMem <- mkDMemory;
PipeReg#(TypeFetch2Decode)
ir <- mkPipeReg;
PipeReg#(TypeDecode2Execute) itr <- mkConfigPipeReg;
Reg#(Bool)
fEpoch <- mkReg(False);
Reg#(Bool)
dEpoch <- mkReg(False);
Reg#(Bool)
eEpoch <- mkReg(False);
FIFOF#(Tuple2#(Addr,Addr)) nextPCE2D <-mkBypassFIFOF;
FIFOF#(Tuple2#(Addr,Addr)) nextPCD2F <-mkBypassFIFOF;
NextAddressPredictor bpred <- mkNeverTaken;
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-27
3-Stage pipeline – 1 predictor
// Fetch stage of the three-stage pipeline: enqueues the fetched instruction
// with its predicted next pc. A redirect arriving from Decode through
// nextPCD2F replaces the pc, flips fEpoch and trains the predictor.
rule doFetch (ir.notFull);
   let inst = iMem(pc);
   let ppc = bpred.prediction(pc);
   ir.enq(TypeFetch2Decode{
      pc:pc, ppc:ppc, epoch:fEpoch, inst:inst});
   if (nextPCD2F.notEmpty) begin
      match {.ipc, .ippc} = nextPCD2F.first;
      pc <= ippc; fEpoch <= !fEpoch; nextPCD2F.deq;
      bpred.update(ipc, ippc);
   end
   else pc <= ppc;
   // Fixed: removed a stray `end` that had no matching `begin`.
endrule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-28
3-Stage pipeline – 1
predictor
// Decode stage of the three-stage pipeline. Three cases:
//   1. A redirect is pending from Execute: flip dEpoch, forward the
//      redirect to Fetch and drop the instruction at the head of ir.
//   2. The head of ir carries the current dEpoch: decode it and, unless
//      stall() holds it back, read its operands and pass it on to itr.
//   3. Otherwise the instruction is stale and is dropped.
rule doDecode (itr.notFull && ir.notEmpty);
   let f2d = ir.first;

   if (nextPCE2D.notEmpty) begin
      nextPCD2F.enq(nextPCE2D.first);
      nextPCE2D.deq;
      dEpoch <= !dEpoch;
      ir.deq;
   end
   else if (f2d.epoch == dEpoch) begin
      let dInst = decode(f2d.inst);
      Bool mustStall = stall(dInst.src1, dInst.src2, itr);
      if (!mustStall) begin
         let d2e = TypeDecode2Execute{
                      pc: f2d.pc, ppc: f2d.ppc, epoch: dEpoch, dInst: dInst,
                      rVal1: rf.rd1(validValue(dInst.src1)),
                      rVal2: rf.rd2(validValue(dInst.src2))};
         itr.enq(d2e);
         ir.deq;
      end
   end
   else begin
      ir.deq;
   end
endrule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-29
3-Stage pipeline – 1 predictor
// Execute stage of the three-stage pipeline: an instruction with a stale
// epoch is dropped; a misprediction flips eEpoch and sends
// (pc, correct next pc) to Decode through nextPCE2D.
rule doExecute (itr.notEmpty);
   let itrpc  = itr.first.pc;
   let dInst  = itr.first.dInst;
   let itrppc = itr.first.ppc; let rVal1 = itr.first.rVal1;
   let rVal2  = itr.first.rVal2;
   if (itr.first.epoch == eEpoch) begin
      // Fixed: pass the predicted pc; it was read but never used, and
      // missPrediction cannot be computed without it.
      // NOTE(review): assumes `execute` has the same parameter list as the
      // `exec` function shown earlier (..., pc, ppc) -- confirm.
      let eInst = execute(dInst, rVal1, rVal2, itrpc, itrppc);
      let memData <- dMemAction(eInst, dMem);
      regUpdate(eInst, memData, rf);
      if (eInst.missPrediction) begin
         nextPCE2D.enq(tuple2(itrpc,
                              eInst.brTaken ? eInst.addr : itrpc+4));
         eEpoch <= !eEpoch;
      end
   end
   // The head of itr is consumed whether it was executed or killed.
   itr.deq;
endrule
endmodule
April 11, 2012
http://csg.csail.mit.edu/6.S078
L17-30