From 70caea065ee6e4ce2f9f4499d72b11328fda3b34 Mon Sep 17 00:00:00 2001
From: Stefan Holst
Date: Mon, 20 Mar 2023 14:14:06 +0900
Subject: [PATCH] more cleanup

---
 src/kyupy/wave_sim.py | 136 +++++++-----------------------------------
 1 file changed, 23 insertions(+), 113 deletions(-)

diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index 647872f..006b52e 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -16,7 +16,7 @@ import math
 
 import numpy as np
 
-from . import numba, cuda, hr_bytes, sim
+from . import numba, cuda, hr_bytes, sim, cdiv
 
 
 TMAX = np.float32(2 ** 127)
@@ -97,17 +97,15 @@ class WaveSim(sim.SimOps):
         self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
         self.c[self.pippi_c_locs+2] = TMAX
 
-    def c_prop(self, sims=None, sd=0.0, seed=1):
+    def c_prop(self, sims=None, seed=1):
         """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
 
         :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
-        :param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
         :param seed: Random seed for delay variations.
         """
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims,
-                           self.delays, self.params, sd, seed)
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims, self.delays, self.params, seed)
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         """Simulates a capture operation at all sequential elements and primary outputs.
@@ -135,11 +133,7 @@ class WaveSim(sim.SimOps):
         self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
 
 
-@numba.njit
-def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
-    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
-
-    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
+def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, st_idx, delays, param, seed=0):
     overflows = int(0)
 
     if len(delays) > 1:
@@ -234,13 +228,21 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=
     cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
 
 
+_wave_eval_cpu = numba.njit(_wave_eval)
+
+
 @numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
-    overflows = 0
+def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, seed=0):
+    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+    _wave_eval_cpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, st_idx, delays, param, seed)
+
+
+@numba.njit
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, seed):
     for op_idx in range(op_start, op_stop):
         op = ops[op_idx]
         for st_idx in range(st_start, st_stop):
-            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, delays, params[st_idx], sd, seed)
+            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, delays, params[st_idx], seed)
 
 
 @numba.njit
@@ -313,17 +315,14 @@ class WaveSimCuda(WaveSim):
         grid_dim = self._grid_dim(self.sims, self.s_len)
         wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.ppi_offset)
 
-    def _grid_dim(self, x, y):
-        gx = math.ceil(x / self._block_dim[0])
-        gy = math.ceil(y / self._block_dim[1])
-        return gx, gy
+    def _grid_dim(self, x, y): return cdiv(x, self._block_dim[0]), cdiv(y, self._block_dim[1])
 
-    def c_prop(self, sims=None, sd=0.0, seed=1):
+    def c_prop(self, sims=None, seed=1):
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             grid_dim = self._grid_dim(sims, op_stop - op_start)
             wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0),
-                                                     sims, self.delays, self.params, sd, seed)
+                                                     sims, self.delays, self.params, seed)
             cuda.synchronize()
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
@@ -360,8 +359,11 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset):
     c[c_loc+2, x] = TMAX
 
 
+_wave_eval_gpu = cuda.jit(_wave_eval, device=True)
+
+
 @cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, seed):
     x, y = cuda.grid(2)
     st_idx = st_start + x
     op_idx = op_start + y
@@ -377,99 +379,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_sto
 
     param = param[st_idx]
 
-    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
-    overflows = int(0)
-
-    if len(delays) > 1:
-        _rnd = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-        for _ in range(4):
-            _rnd = int(0xDEECE66D) * _rnd + 0xB
-        delays = delays[_rnd % len(delays)]
-    else:
-        delays = delays[0]
-
-    a_mem = c_locs[a_idx]
-    b_mem = c_locs[b_idx]
-    c_mem = c_locs[c_idx]
-    d_mem = c_locs[d_idx]
-    z_mem = c_locs[z_idx]
-    z_cap = c_caps[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    c_cur = int(0)
-    d_cur = int(0)
-    z_cur = lut & 1
-    if z_cur == 1:
-        cbuf[z_mem, st_idx] = TMIN
-
-    z_val = z_cur
-
-    a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
-    b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
-    c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
-    d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
-
-    previous_t = TMIN
-
-    current_t = min(a, b, c, d)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        if a == current_t:
-            a_cur += 1
-            inputs ^= 1
-            thresh = delays[a_idx, 0, z_val]
-            a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
-            next_t = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val ^ 1]
-        elif b == current_t:
-            b_cur += 1
-            inputs ^= 2
-            thresh = delays[b_idx, 0, z_val]
-            b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
-            next_t = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val ^ 1]
-        elif c == current_t:
-            c_cur += 1
-            inputs ^= 4
-            thresh = delays[c_idx, 0, z_val]
-            c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
-            next_t = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val ^ 1]
-        else:
-            d_cur += 1
-            inputs ^= 8
-            thresh = delays[d_idx, 0, z_val]
-            d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
-            next_t = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val ^ 1]
-
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate an edge in z_mem, if ...
-            if (z_cur == 0  # it is the first edge in z_mem ...
-                or next_t < current_t  # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
-                or (current_t - previous_t) > thresh  # -OR- the generated hazard is wider than pulse threshold.
-                ):
-                if z_cur < (z_cap - 1):  # enough space in z_mem?
-                    cbuf[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
-
-            # output value of cell changed. update all delayed inputs.
-            z_val = z_val ^ 1
-            a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
-            b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
-            c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
-            d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
-
-        current_t = min(a, b, c, d)
-
-    # generate or propagate overflow flag
-    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
+    _wave_eval_gpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, st_idx, delays, param, seed)
 
 
 @cuda.jit()
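
The refactoring in this patch uses a common Numba idiom: a single pure-Python body (_wave_eval) is compiled twice, once with numba.njit for the CPU simulator and once with cuda.jit(device=True) as a CUDA device function called from the GPU kernel, so the waveform-evaluation logic exists only once. A minimal, self-contained sketch of the same idiom, assuming a trivial element-wise operation; the saxpy names below are illustrative placeholders, not part of kyupy:

    from numba import njit, cuda

    def _saxpy(i, a, x, y, out):
        # shared element-wise body, free of CPU/GPU-specific code (illustrative sketch)
        out[i] = a * x[i] + y[i]

    _saxpy_cpu = njit(_saxpy)                   # CPU-compiled version
    _saxpy_gpu = cuda.jit(_saxpy, device=True)  # CUDA device function

    @njit
    def saxpy_cpu(a, x, y, out):
        for i in range(x.shape[0]):
            _saxpy_cpu(i, a, x, y, out)

    @cuda.jit
    def saxpy_gpu(a, x, y, out):
        i = cuda.grid(1)
        if i < x.shape[0]:
            _saxpy_gpu(i, a, x, y, out)

    # A launch sizes the grid by ceiling division, e.g.
    # saxpy_gpu[(n + 255) // 256, 256](2.0, x_dev, y_dev, out_dev),
    # which is what the cdiv-based _grid_dim above computes per dimension.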