Merge pull request #105 from StanfordVLSI/fpga_experiment

Emulation updates
StanfordVLSI · Jun 29, 2020 · d07ec12 · d07ec12
2 parents c932e74 + c170c95
commit d07ec12
Show file tree

Hide file tree

Showing 20 changed files with 793 additions and 113 deletions.
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -62,6 +62,10 @@ steps:
       python3.7 -m venv venv
       source venv/bin/activate
 
+      # scale down the size of the FFE and MLSD to fit on the regression FPGA
+      sed -i 's/\&ffe_length [[:digit:]]\+/\&ffe_length 4/g' config/system.yml
+      sed -i 's/\&estimate_depth [[:digit:]]\+/\&estimate_depth 4/g' config/system.yml
+
       # run regression script
       source regress.sh
 

diff --git a/codecov.yml b/codecov.yml
@@ -0,0 +1,14 @@
+# configuration related to pull request comments
+comment: no # do not comment PR with the result
+
+coverage:
+  range: 50..90 # coverage below 50% is red, above 90% is green, with a color gradient in between
+
+  status:
+    project: # settings affecting project coverage
+      default:
+        target: auto # auto % coverage target
+        threshold: 5%  # allow for 5% reduction of coverage without failing
+
+    # do not run coverage on patch nor changes
+    patch: false
diff --git a/config/fpga/system_fpga.yml b/config/fpga/system_fpga.yml
diff --git a/conftest.py b/conftest.py
@@ -13,6 +13,18 @@ def pytest_addoption(parser):
         '--ser_port', default='/dev/ttyUSB2', type=str, help='USB serial path.'
     )
 
+    parser.addoption(
+        '--ffe_length', default=4, type=int, help='Number of FFE coefficients per channel.'
+    )
+
+    parser.addoption(
+        '--emu_clk_freq', default=5.0e6, type=float, help='Frequency of emulator clock (Hz)'
+    )
+
+    parser.addoption(
+        '--prbs_test_dur', default=10.0, type=float, help='Length of time of the PRBS emulation test.'
+    )
+
 @pytest.fixture
 def dump_waveforms(request):
     return request.config.getoption('--dump_waveforms')
@@ -23,4 +35,16 @@ def board_name(request):
 
 @pytest.fixture
 def ser_port(request):
-    return request.config.getoption('--ser_port')
+    return request.config.getoption('--ser_port')
+
+@pytest.fixture
+def ffe_length(request):
+    return request.config.getoption('--ffe_length')
+
+@pytest.fixture
+def emu_clk_freq(request):
+    return request.config.getoption('--emu_clk_freq')
+
+@pytest.fixture
+def prbs_test_dur(request):
+    return request.config.getoption('--prbs_test_dur')
diff --git a/dragonphy/fpga_models/chan_core.py b/dragonphy/fpga_models/chan_core.py
@@ -33,7 +33,7 @@ def __init__(self, filename=None, **system_values):
         view = system_values['view']
 
         # read in the channel data
-        chan = Filter.from_file(get_file('build/fpga_models/adapt_fir/chan.npy'))
+        chan = Filter.from_file(get_file('build/chip_src/adapt_fir/chan.npy'))
 
         # create a function
         domain = [chan.t_vec[0], chan.t_vec[-1]]

diff --git a/dragonphy/views.py b/dragonphy/views.py
@@ -14,7 +14,10 @@ def remove_dup(seq):
 def find_preferred_impl(cell_name, view_order, override):
     # if there is a specific view desired for this cell, use it instead of the view order
     if cell_name in override:
-        view_order = [override[cell_name]]
+        if isinstance(override[cell_name], Path):
+            return override[cell_name]
+        else:
+            view_order = [override[cell_name]]
 
     # walk through the view names in order, checking to see if there are any matches in each
     for view_name in view_order:
@@ -175,9 +178,12 @@ def get_deps_asic(cell_name=None, impl_file=None, process='tsmc16'):
     # Return the dependencies
     return deps
 
-def get_deps_cpu_sim(cell_name=None, impl_file=None):
-    deps = []
+def get_deps_cpu_sim(cell_name=None, impl_file=None, override=None):
+    # set defaults
+    if override is None:
+        override = {}
 
+    deps = []
     deps += get_deps(
         cell_name=cell_name,
         impl_file=impl_file,
@@ -190,7 +196,8 @@ def get_deps_cpu_sim(cell_name=None, impl_file=None):
             'DAVE_TIMEUNIT': '1fs',
             'NCVLOG': None,
             'SIMULATION': None  # for MDLL simulation
-        }
+        },
+        override=override
     )
 
     return deps

diff --git a/experiments/cpu_emu_comparison/V2T_clock_gen_S2D.sv b/experiments/cpu_emu_comparison/V2T_clock_gen_S2D.sv
@@ -0,0 +1,14 @@
+// simple model used for performance comparison with emulation
+
+`timescale 1s/1fs
+
+module V2T_clock_gen_S2D (
+    input wire logic in,        // input signal
+    output wire logic out,      // delayed output signal (+)
+    output reg outb             // delayed output signal (-)
+);
+
+    assign out = in;
+    assign outb = ~in;
+
+endmodule
diff --git a/experiments/cpu_emu_comparison/experiment.py b/experiments/cpu_emu_comparison/experiment.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+from dragonphy import *
+
+THIS_DIR = Path(__file__).parent.resolve()
+BUILD_DIR = THIS_DIR / 'build'
+
+deps = get_deps_cpu_sim(
+    impl_file=THIS_DIR / 'test.sv',
+    override={
+        'snh': THIS_DIR / 'snh.sv',
+        'V2T_clock_gen_S2D': THIS_DIR / 'V2T_clock_gen_S2D.sv',
+        'stochastic_adc_PR': THIS_DIR / 'stochastic_adc_PR.sv',
+        'phase_interpolator': THIS_DIR / 'phase_interpolator.sv',
+        'input_divider': THIS_DIR / 'input_divider.sv',
+        'output_buffer': THIS_DIR / 'output_buffer.sv',
+        'mdll_r1_top': 'chip_stubs'
+    }
+)
+print(deps)
+
+DragonTester(
+    ext_srcs=deps,
+    directory=BUILD_DIR,
+    dump_waveforms=False
+).run()
diff --git a/experiments/cpu_emu_comparison/input_divider.sv b/experiments/cpu_emu_comparison/input_divider.sv
@@ -0,0 +1,26 @@
+// simple model used for performance comparison with emulation
+
+`timescale 1s/1fs
+
+module input_divider (
+    input wire logic in,
+    input wire logic in_mdll,
+    input wire logic sel_clk_source,
+    input wire logic en,
+    input wire logic en_meas,
+    input wire logic [2:0] ndiv,
+    input wire logic bypass_div,
+    input wire logic bypass_div2,
+    output wire logic out,
+    output wire logic out_meas
+);
+    logic div_state = 1'b0;
+    always @(posedge in) begin
+        div_state <= ~div_state;
+    end
+
+    assign out = div_state;
+
+    // out_meas is unused
+    assign out_meas = 1'b0;
+endmodule
diff --git a/experiments/cpu_emu_comparison/output_buffer.sv b/experiments/cpu_emu_comparison/output_buffer.sv
@@ -0,0 +1,24 @@
+// simple model used for performance comparison with emulation
+
+module output_buffer (
+    input [15:0] bufferend_signals,
+    input [3:0] sel_outbuff,
+    input [3:0] sel_trigbuff,
+    input en_outbuff,
+    input en_trigbuff,
+    input bypass_out_div,
+    input bypass_trig_div,
+    input [2:0] Ndiv_outbuff,
+    input [2:0] Ndiv_trigbuff,
+    output clock_out_p,
+    output clock_out_n,
+    output trigg_out_p,
+    output trigg_out_n
+);
+
+    assign clock_out_p = 1'b0;
+    assign clock_out_n = 1'b0;
+    assign trigg_out_p = 1'b0;
+    assign trigg_out_n = 1'b0;
+
+endmodule
diff --git a/experiments/cpu_emu_comparison/phase_interpolator.sv b/experiments/cpu_emu_comparison/phase_interpolator.sv
@@ -0,0 +1,67 @@
+// simple model used for performance comparison with emulation
+
+`timescale 1s/1fs
+
+`include "iotype.sv"
+
+module phase_interpolator #(
+    parameter Nbit = 9,
+    parameter Nctl_dcdl = 2,
+    parameter Nunit = 32,
+    parameter Nblender = 4
+)(
+    input rstb,
+    input clk_in,
+    input clk_async,
+    input clk_encoder,
+    input disable_state,
+    input en_arb,
+    input en_cal,
+    input en_clk_sw,
+    input en_delay,
+    input en_ext_Qperi,
+    input en_gf,
+    input ctl_valid,
+    input [Nbit-1:0]  ctl,
+    input [Nctl_dcdl-1:0] ctl_dcdl_sw,
+    input [Nctl_dcdl-1:0] ctl_dcdl_slice,
+    input [Nctl_dcdl-1:0] ctl_dcdl_clk_encoder,
+    input [Nunit-1:0]  inc_del,
+    input [$clog2(Nunit)-1:0] ext_Qperi,
+    input [1:0] sel_pm_sign,
+    input en_pm,
+
+    output cal_out,
+    output reg clk_out_slice=1'b0,
+    output clk_out_sw,
+    output del_out,
+
+    output [$clog2(Nunit)-1:0] Qperi,
+    output [$clog2(Nunit)-1:0] max_sel_mux,
+    output cal_out_dmm,
+    output [19:0]  pm_out
+);
+
+    // delay clk_in to clk_out_slice
+
+    real delay_s;
+    always @(clk_in) begin
+        // compute the delay
+        delay_s = ((1.0*ctl)/(2.0**(Nbit)))*(250.0e-12);
+
+        // apply the delay
+        clk_out_slice <= #(delay_s*1s) clk_in;
+    end
+
+    // outputs that are not modeled
+
+    assign cal_out = 0;
+    assign clk_out_sw = 0;
+    assign del_out = 0;
+    assign Qperi = 0;
+    assign max_sel_mux = 0;
+    assign cal_out_dmm = 0;
+    assign pm_out = 0;
+
+endmodule
+
diff --git a/experiments/cpu_emu_comparison/results.md b/experiments/cpu_emu_comparison/results.md
@@ -0,0 +1,24 @@
+Jun 22, 2020
+* Simulation with 4x channels:
+  * PRBS test took 27.614053 seconds.
+  * Total bits: 600000
+  * Throughput: 21.7 kb/s
+
+Jun 24, 2020
+* Emulation with 16x channels on ZC706:
+  * PRBS test took 30.073444843292236 seconds.
+  * Total bits: 150258080
+  * Throughput: 4.996 Mb/s
+  * Slice LUTs: 58678 / 218600
+  * Slice Registers: 24928 / 437200
+  * Slice: 19436 / 54650
+  * DSP: 299 / 900
+  * BRAM: 42.5 / 545
+  * Build time: 30m 35.161s with Vivado 2020.1 on Intel(R) Core(TM) i5-2320 CPU @ 3.00GHz, Ubuntu 18.04.2 LTS, 6 GB RAM
+    * use `cat /proc/cpuinfo`, `cat /proc/meminfo`, `lsb_release -a`
+* Simulation with 16x channels:
+  * PRBS test took 42.509094 seconds.
+  * Total bits: 608192
+  * Throughput: 14.3 kb/s
+  * r7cad-generic processor, CentOS Linux release 7.7.1908 (Core), 128 GB RAM
+    * /proc/cpuinfo did not display the real CPU information since r7cad-generic is a VM
diff --git a/experiments/cpu_emu_comparison/snh.sv b/experiments/cpu_emu_comparison/snh.sv
@@ -0,0 +1,24 @@
+// simple model used for performance comparison with emulation
+
+`timescale 1s/1fs
+
+`include "iotype.sv"
+
+module snh import const_pack::Nout; (
+    input wire logic [Nout-1:0] clk,        // sampling clocks of the first s&h sw group
+    input wire logic [Nout-1:0] clkb,       // complement of clk (~clk)
+    input `pwl_t in_p,                      // + signal input
+    input `pwl_t in_n,                      // - signal input
+    output `pwl_t out_p [Nout-1:0],         // sampled (+) outputs
+    output `pwl_t out_n [Nout-1:0]          // sampled (-) outputs
+);
+
+    genvar i;
+    generate
+        for (i=0; i<Nout; i=i+1) begin
+            assign out_p[i] = in_p;
+            assign out_n[i] = in_n;
+        end
+    endgenerate
+
+endmodule