From afb0a64953cbe7a0ac8bca96a12cf5920df350e6 Mon Sep 17 00:00:00 2001 From: Stefan Holst Date: Mon, 3 Jul 2023 00:48:58 +0900 Subject: [PATCH] wsa accumulation in wavesim --- src/kyupy/logic_sim.py | 8 +++--- src/kyupy/sim.py | 18 +++++++----- src/kyupy/wave_sim.py | 64 +++++++++++++++++++++++++++--------------- tests/test_wave_sim.py | 2 +- 4 files changed, 58 insertions(+), 34 deletions(-) diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py index 92150fa..c3a3455 100644 --- a/src/kyupy/logic_sim.py +++ b/src/kyupy/logic_sim.py @@ -73,7 +73,7 @@ class LogicSim(sim.SimOps): if inject_cb is None: _prop_cpu(self.ops, self.c_locs, self.c[...,:nbytes]) else: - for op, o0, i0, i1, i2, i3 in self.ops: + for op, o0, i0, i1, i2, i3 in self.ops[:,:6]: o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)] if op == sim.BUF1: self.c[o0]=self.c[i0] elif op == sim.INV1: self.c[o0] = ~self.c[i0] @@ -107,7 +107,7 @@ class LogicSim(sim.SimOps): else: print(f'unknown op {op}') inject_cb(o0, self.s[o0]) elif self.m == 4: - for op, o0, i0, i1, i2, i3 in self.ops: + for op, o0, i0, i1, i2, i3 in self.ops[:,:6]: o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)] if op == sim.BUF1: self.c[o0]=self.c[i0] elif op == sim.INV1: logic.bp4v_not(self.c[o0], self.c[i0]) @@ -168,7 +168,7 @@ class LogicSim(sim.SimOps): logic.bp4v_or(self.c[o0], self.c[self.c_locs[self.tmp_idx]], self.c[self.c_locs[self.tmp2_idx]]) else: print(f'unknown op {op}') else: - for op, o0, i0, i1, i2, i3 in self.ops: + for op, o0, i0, i1, i2, i3 in self.ops[:,:6]: o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)] if op == sim.BUF1: self.c[o0]=self.c[i0] elif op == sim.INV1: logic.bp8v_not(self.c[o0], self.c[i0]) @@ -263,7 +263,7 @@ class LogicSim(sim.SimOps): @numba.njit def _prop_cpu(ops, c_locs, c): - for op, o0, i0, i1, i2, i3 in ops: + for op, o0, i0, i1, i2, i3 in ops[:,:6]: o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)] if op == sim.BUF1: c[o0]=c[i0] elif op == sim.INV1: c[o0] = ~c[i0] diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py index bd9a549..bdb4d95 100644 --- a/src/kyupy/sim.py +++ b/src/kyupy/sim.py @@ -146,13 +146,17 @@ class SimOps: :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces memory footprint, but intermediate signal waveforms become unaccessible after a propagation. """ - def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False): + def __init__(self, circuit, c_caps=1, c_caps_min=1, a_ctrl=None, c_reuse=False, strip_forks=False): self.circuit = circuit self.s_len = len(circuit.s_nodes) if isinstance(c_caps, int): c_caps = [c_caps] * len(circuit.lines) + if a_ctrl is None: + a_ctrl = np.zeros((len(circuit.lines), 3), dtype=np.int32) + a_ctrl[:,0] = -1 + # special locations and offsets in c_locs/c_caps self.zero_idx = len(circuit.lines) self.tmp_idx = self.zero_idx + 1 @@ -168,14 +172,14 @@ class SimOps: if n in interface_dict: inp_idx = self.ppi_offset + interface_dict[n] if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI - ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) + ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[0]])) if 'dff' in n.kind.lower(): # second output of DFF is inverted if len(n.outs) > 1 and n.outs[1] is not None: - ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) + ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[1]])) else: # if not DFF, no output is inverted. for o_line in n.outs[1:]: if o_line is not None: - ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) + ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[o_line])) continue # regular node, not PI/PPI or PO/PPO o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx @@ -188,7 +192,7 @@ class SimOps: if not strip_forks: for o_line in n.outs: if o_line is not None: - ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx)) + ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o_line])) continue sp = None for prefix, prims in kind_prefixes.items(): @@ -202,7 +206,7 @@ class SimOps: if sp is None: print('unknown cell type', kind) else: - ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx)) + ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o0_idx])) self.ops = np.asarray(ops, dtype='int32') @@ -299,7 +303,7 @@ class SimOps: self.c_len = h.max_size d = defaultdict(int) - for op, _, _, _, _, _ in self.ops: d[names[op]] += 1 + for op in self.ops[:,0]: d[names[op]] += 1 self.prim_counts = dict(d) self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py index 45b4e49..c9f25e6 100644 --- a/src/kyupy/wave_sim.py +++ b/src/kyupy/wave_sim.py @@ -47,8 +47,8 @@ class WaveSim(sim.SimOps): :param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces memory footprint, but intermediate signal waveforms become unaccessible after a propagation. """ - def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): - super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks) + def __init__(self, circuit, delays, sims=8, c_caps=16, a_ctrl=None, c_reuse=False, strip_forks=False): + super().__init__(circuit, c_caps=c_caps, c_caps_min=4, a_ctrl=a_ctrl, c_reuse=c_reuse, strip_forks=strip_forks) self.sims = sims if delays.ndim == 3: delays = np.expand_dims(delays, axis=0) self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) @@ -78,6 +78,9 @@ class WaveSim(sim.SimOps): final values in the waveforms are still valid. """ + self.abuf_len = self.ops[:,6].max() + 1 + self.abuf = np.zeros((self.abuf_len, sims), dtype=np.int32) if self.abuf_len > 0 else np.zeros((1, 1), dtype=np.int32) + self.simctl_int = np.zeros((2, sims), dtype=np.int32) """Per-simulation delay configuration. @@ -113,7 +116,7 @@ class WaveSim(sim.SimOps): """ sims = min(sims or self.sims, self.sims) for op_start, op_stop in zip(self.level_starts, self.level_stops): - level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims, self.delays, self.simctl_int, seed) + level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed) def c_to_s(self, time=TMAX, sd=0.0, seed=1): """Simulates a capture operation at all sequential elements and primary outputs. @@ -141,9 +144,16 @@ class WaveSim(sim.SimOps): self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] -def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): +def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): overflows = int(0) + lut = op[0] + z_idx = op[1] + a_idx = op[2] + b_idx = op[3] + c_idx = op[4] + d_idx = op[5] + if len(delays) > 1: if simctl_int[1] == 0: delays = delays[seed] @@ -240,22 +250,26 @@ def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim # generate or propagate overflow flag cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) + nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN)) + nfall = z_cur // 2 -_wave_eval_cpu = numba.njit(_wave_eval) + return nrise, nfall -@numba.njit -def wave_eval_cpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): - lut, z_idx, a_idx, b_idx, c_idx, d_idx = op - _wave_eval_cpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed) +wave_eval_cpu = numba.njit(_wave_eval) @numba.njit -def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, sim_start, sim_stop, delays, simctl_int, seed): +def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): for op_idx in range(op_start, op_stop): op = ops[op_idx] for sim in range(sim_start, sim_stop): - wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) + nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) + a_loc = op[6] + a_wr = op[7] + a_wf = op[8] + if a_loc >= 0: + abuf[a_loc, sim] += nrise*a_wr + nfall*a_wf @numba.njit @@ -311,8 +325,8 @@ class WaveSimCuda(WaveSim): All internal memories are mirrored into GPU memory upon construction. Some operations like access to single waveforms can involve large communication overheads. """ - def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): - super().__init__(circuit, delays, sims, c_caps, c_reuse, strip_forks) + def __init__(self, circuit, delays, sims=8, c_caps=16, a_ctrl=None, c_reuse=False, strip_forks=False): + super().__init__(circuit, delays, sims, c_caps, a_ctrl=a_ctrl, c_reuse=c_reuse, strip_forks=strip_forks) self.c = cuda.to_device(self.c) self.s = cuda.to_device(self.s) @@ -321,6 +335,7 @@ class WaveSimCuda(WaveSim): self.c_caps = cuda.to_device(self.c_caps) self.delays = cuda.to_device(self.delays) self.simctl_int = cuda.to_device(self.simctl_int) + self.abuf = cuda.to_device(self.abuf) self._block_dim = (32, 16) @@ -333,6 +348,7 @@ class WaveSimCuda(WaveSim): state['c_caps'] = np.array(self.c_caps) state['delays'] = np.array(self.delays) state['simctl_int'] = np.array(self.simctl_int) + state['abuf'] = np.array(self.abuf) return state def __setstate__(self, state): @@ -344,6 +360,7 @@ class WaveSimCuda(WaveSim): self.c_caps = cuda.to_device(self.c_caps) self.delays = cuda.to_device(self.delays) self.simctl_int = cuda.to_device(self.simctl_int) + self.abuf = cuda.to_device(self.abuf) def s_to_c(self): grid_dim = self._grid_dim(self.sims, self.s_len) @@ -355,7 +372,7 @@ class WaveSimCuda(WaveSim): sims = min(sims or self.sims, self.sims) for op_start, op_stop in zip(self.level_starts, self.level_stops): grid_dim = self._grid_dim(sims, op_stop - op_start) - wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0), + wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0), sims, self.delays, self.simctl_int, seed) cuda.synchronize() @@ -397,21 +414,24 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True) @cuda.jit() -def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, sim_start, sim_stop, delays, simctl_int, seed): +def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): x, y = cuda.grid(2) sim = sim_start + x op_idx = op_start + y if sim >= sim_stop: return if op_idx >= op_stop: return - lut = ops[op_idx, 0] - z_idx = ops[op_idx, 1] - a_idx = ops[op_idx, 2] - b_idx = ops[op_idx, 3] - c_idx = ops[op_idx, 4] - d_idx = ops[op_idx, 5] + op = ops[op_idx] + a_loc = op[6] + a_wr = op[7] + a_wf = op[8] + + nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) - _wave_eval_gpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) + # accumulate WSA into abuf + if a_loc >= 0: + #abuf[a_loc, sim] += nrise*a_wr + nfall*a_wf + cuda.atomic.add(abuf, (a_loc, sim), nrise*a_wr + nfall*a_wf) @cuda.jit() diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py index b07f683..407b90d 100644 --- a/tests/test_wave_sim.py +++ b/tests/test_wave_sim.py @@ -6,7 +6,7 @@ from kyupy import logic, bench, sim from kyupy.logic import mvarray def test_nand_delays(): - op = (sim.NAND4, 4, 0, 1, 2, 3) + op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0) #op = (0b0111, 4, 0, 1) c = np.full((5*16, 1), TMAX) # 5 waveforms of capacity 16 c_locs = np.zeros((5,), dtype='int')