From 4c55dcec60038033e199272a05e8cbed9bb68e63 Mon Sep 17 00:00:00 2001 From: Stefan Holst Date: Mon, 8 Apr 2024 00:29:12 +0900 Subject: [PATCH] delta sim for improving fault sim performance --- src/kyupy/wave_sim.py | 51 +++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py index 8a25e54..7bee0a2 100644 --- a/src/kyupy/wave_sim.py +++ b/src/kyupy/wave_sim.py @@ -99,7 +99,7 @@ class WaveSim(sim.SimOps): self.simctl_int[0] = range(sims) # unique seed for each sim by default, zero this to pick same delays for all sims. self.simctl_int[1] = 2 # random picking by default. - self.e = np.zeros((self.c_locs_len, sims), dtype=np.uint8) # aux data for each line and sim + self.e = np.zeros((self.c_locs_len, sims, 2), dtype=np.uint8) # aux data for each line and sim self.error_counts = np.zeros(self.s_len, dtype=np.uint32) # number of capture errors by PPO self.lsts = np.zeros(self.s_len, dtype=np.float32) # LST by PPO @@ -123,7 +123,7 @@ class WaveSim(sim.SimOps): self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX]) self.c[self.pippi_c_locs+2] = TMAX - def c_prop(self, sims=None, seed=1): + def c_prop(self, sims=None, seed=1, delta=0): """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs. :param sims: Number of parallel simulations to execute. If None, all available simulations are performed. @@ -131,7 +131,7 @@ class WaveSim(sim.SimOps): """ sims = min(sims or self.sims, self.sims) for op_start, op_stop in zip(self.level_starts, self.level_stops): - level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, seed) + level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, seed, delta) def c_to_s(self, time=TMAX, sd=0.0, seed=1): """Simulates a capture operation at all sequential elements and primary outputs. @@ -159,7 +159,7 @@ class WaveSim(sim.SimOps): self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] -def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): +def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed, delta): overflows = int(0) lut = op[0] @@ -169,6 +169,18 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): c_idx = op[4] d_idx = op[5] + input_epoch = (ebuf[a_idx, sim, 1]| + ebuf[b_idx, sim, 1]| + ebuf[c_idx, sim, 1]| + ebuf[d_idx, sim, 1]) + + output_epoch = ebuf[z_idx, sim, 1] + + if (delta): + if input_epoch == 0 and output_epoch == 0: return 0, 0 + + out_changed = output_epoch + if len(delays) > 1: if simctl_int[1] == 0: delays = delays[seed] @@ -242,6 +254,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): or (current_t - previous_t) > thresh # -OR- the generated hazard is wider than pulse threshold. ): if z_cur < (z_cap - 1): # enough space in z_mem? + if delta and (cbuf[z_mem + z_cur, sim] != current_t): + out_changed = 1 cbuf[z_mem + z_cur, sim] = current_t previous_t = current_t z_cur += 1 @@ -262,6 +276,9 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): current_t = min(a, b, c, d) + if delta and (cbuf[z_mem + z_cur, sim] != TMAX): + out_changed = 1 + # generate or propagate overflow flag cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) @@ -272,7 +289,9 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): e |= z_val # final value e |= (nrise + nfall)<<2 # number of transitions - ebuf[z_idx, sim] = e + ebuf[z_idx, sim, 0] = e + + ebuf[z_idx, sim, 1] = input_epoch & out_changed return nrise, nfall @@ -281,11 +300,11 @@ wave_eval_cpu = numba.njit(_wave_eval) @numba.njit -def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): +def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed, delta): for op_idx in range(op_start, op_stop): op = ops[op_idx] for sim in range(sim_start, sim_stop): - nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed) + nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed, delta) a_loc = op[6] a_wr = op[7] a_wf = op[8] @@ -363,7 +382,7 @@ class WaveSimCuda(WaveSim): self.lsts = cuda.to_device(self.lsts) self.overflows = cuda.to_device(self.overflows) - self.retval_int = cuda.to_device(np.array([0], dtype=np.int32)) + self.aux = cuda.to_device(np.zeros(8*1024, dtype=np.int32)) self._block_dim = (32, 16) @@ -381,7 +400,7 @@ class WaveSimCuda(WaveSim): state['error_counts'] = np.array(self.error_counts) state['lsts'] = np.array(self.lsts) state['overflows'] = np.array(self.overflows) - state['retval_int'] = np.array(self.retval_int) + state['aux'] = np.array(self.aux) return state def __setstate__(self, state): @@ -398,7 +417,7 @@ class WaveSimCuda(WaveSim): self.error_counts = cuda.to_device(self.error_counts) self.lsts = cuda.to_device(self.lsts) self.overflows = cuda.to_device(self.overflows) - self.retval_int = cuda.to_device(self.retval_int) + self.aux = cuda.to_device(self.aux) def s_to_c(self): grid_dim = self._grid_dim(self.sims, self.s_len) @@ -406,23 +425,23 @@ class WaveSimCuda(WaveSim): def _grid_dim(self, x, y): return cdiv(x, self._block_dim[0]), cdiv(y, self._block_dim[1]) - def c_prop(self, sims=None, seed=1, op_from=0, op_to=None): + def c_prop(self, sims=None, seed=1, op_from=0, op_to=None, delta=0): sims = min(sims or self.sims, self.sims) for op_start, op_stop in zip(self.level_starts, self.level_stops): if op_from > op_start: continue if op_to is not None and op_to <= op_start: break grid_dim = self._grid_dim(sims, op_stop - op_start) wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0), - sims, self.delays, self.simctl_int, seed) + sims, self.delays, self.simctl_int, seed, delta) cuda.synchronize() - def c_prop_level(self, level, sims=None, seed=1): + def c_prop_level(self, level, sims=None, seed=1, delta=0): sims = min(sims or self.sims, self.sims) op_start = self.level_starts[level] op_stop = self.level_stops[level] grid_dim = self._grid_dim(sims, op_stop - op_start) wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0), - sims, self.delays, self.simctl_int, seed) + sims, self.delays, self.simctl_int, seed, delta) def c_to_s(self, time=TMAX, sd=0.0, seed=1): grid_dim = self._grid_dim(self.sims, self.s_len) @@ -533,7 +552,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True) @cuda.jit() -def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): +def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed, delta): x, y = cuda.grid(2) sim = sim_start + x op_idx = op_start + y @@ -545,7 +564,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_ a_wr = op[7] a_wf = op[8] - nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed) + nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed, delta) # accumulate WSA into abuf if a_loc >= 0: