diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py index 6ed703b..32b7459 100644 --- a/src/kyupy/sim.py +++ b/src/kyupy/sim.py @@ -187,7 +187,6 @@ class SimOps: levels = [] ppio2idx = dict((n, i) for i, n in enumerate(circuit.s_nodes)) - pis = set([n for n in circuit.s_nodes if len(n.ins) == 0]) ppos = set([n for n in circuit.s_nodes if len(n.ins) > 0]) readers = np.array([1 if l.reader in ppos else len(l.reader.outs) for l in circuit.lines], dtype=np.int32) # for ref-counting forks @@ -311,6 +310,15 @@ class SimOps: if len(n.ins) > 0: self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]] + # line use information + self.line_use_start = np.full(self.c_locs_len, -1, dtype=np.int32) + self.line_use_stop = np.full(self.c_locs_len, len(self.levels), dtype=np.int32) + for i, lv in enumerate(self.levels): + for op in lv: + self.line_use_start[op[1]] = i + for x in [2, 3, 4, 5]: + self.line_use_stop[op[x]] = i + self.c_len = h.max_size d = defaultdict(int) diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py index 36866c4..e02cbed 100644 --- a/src/kyupy/wave_sim.py +++ b/src/kyupy/wave_sim.py @@ -13,10 +13,11 @@ Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the deriv """ import math +from collections import defaultdict import numpy as np -from . import numba, cuda, sim, cdiv, eng +from . import log, numba, cuda, sim, cdiv, eng TMAX = np.float32(2 ** 127) @@ -59,8 +60,8 @@ class WaveSim(sim.SimOps): self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) self.delays[:, :delays.shape[1]] = delays - self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX - self.s = np.zeros((11, self.s_len, sims), dtype=np.float32) + self.c = np.full((self.c_len, self.sims), TMAX, dtype=np.float32) + self.s = np.zeros((11, self.s_len, self.sims), dtype=np.float32) """Information about the logic values and transitions around the sequential elements (flip-flops) and ports. The first 3 values are read by :py:func:`s_to_c`. @@ -98,7 +99,26 @@ class WaveSim(sim.SimOps): self.simctl_int[0] = range(sims) # unique seed for each sim by default, zero this to pick same delays for all sims. self.simctl_int[1] = 2 # random picking by default. - self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.simctl_int)]) + # flat array for line use information + line_use = defaultdict(list) + for lidx in range(len(self.circuit.lines)): + if self.line_use_start[lidx] < 0: continue + if self.line_use_stop[lidx] < 0: + log.warn(f'line {lidx} never read?') + for i in range(self.line_use_start[lidx], self.line_use_stop[lidx]): + line_use[i].append(lidx) + + self.line_use_counts = np.array([len(line_use[i]) for i in range(len(self.levels))], dtype=np.int32) + self.line_use_offsets = np.zeros_like(self.line_use_counts) + self.line_use_offsets[1:] = self.line_use_counts.cumsum()[:-1] + self.line_use = np.hstack([line_use[i] for i in range(len(self.levels))]) + + self.h = np.zeros((self.c_locs_len, sims), dtype=np.float32) # hashes of generated waveforms + self.h_base = np.zeros_like(self.h) # base hashes to compare to + + self.error_counts = np.zeros(self.s_len, dtype=np.uint32) # number of capture errors by PPO + + self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.h, self.c_locs, self.c_caps, self.ops, self.simctl_int)]) def __repr__(self): dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU' @@ -124,7 +144,7 @@ class WaveSim(sim.SimOps): """ sims = min(sims or self.sims, self.sims) for op_start, op_stop in zip(self.level_starts, self.level_stops): - level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed) + level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, 0, sims, self.delays, self.simctl_int, seed) def c_to_s(self, time=TMAX, sd=0.0, seed=1): """Simulates a capture operation at all sequential elements and primary outputs. @@ -152,7 +172,7 @@ class WaveSim(sim.SimOps): self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] -def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): +def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed): overflows = int(0) lut = op[0] @@ -182,6 +202,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): z_mem = c_locs[z_idx] z_cap = c_caps[z_idx] + h = np.float32(0) + a_cur = int(0) b_cur = int(0) c_cur = int(0) @@ -229,6 +251,7 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1] if (z_cur & 1) != ((lut >> inputs) & 1): + h += h*3 + max(current_t, -10) # hash based on generated transitions before filtering # we generate an edge in z_mem, if ... if (z_cur == 0 # it is the first edge in z_mem ... or next_t < current_t # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ... @@ -240,8 +263,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): z_cur += 1 else: overflows += 1 - previous_t = cbuf[z_mem + z_cur - 1, sim] z_cur -= 1 + previous_t = cbuf[z_mem + z_cur, sim] else: z_cur -= 1 previous_t = cbuf[z_mem + z_cur - 1, sim] if z_cur > 0 else TMIN @@ -258,6 +281,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): # generate or propagate overflow flag cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) + hbuf[z_idx, sim] = h + nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN)) nfall = z_cur // 2 @@ -268,11 +293,11 @@ wave_eval_cpu = numba.njit(_wave_eval) @numba.njit -def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): +def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): for op_idx in range(op_start, op_stop): op = ops[op_idx] for sim in range(sim_start, sim_stop): - nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) + nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed) a_loc = op[6] a_wr = op[7] a_wf = op[8] @@ -345,12 +370,18 @@ class WaveSimCuda(WaveSim): self.delays = cuda.to_device(self.delays) self.simctl_int = cuda.to_device(self.simctl_int) self.abuf = cuda.to_device(self.abuf) + self.h = cuda.to_device(self.h) + self.h_base = cuda.to_device(self.h_base) + self.line_use = cuda.to_device(self.line_use) + self.error_counts = cuda.to_device(self.error_counts) + + self.retval_int = cuda.to_device(np.array([0], dtype=np.int32)) self._block_dim = (32, 16) def __getstate__(self): state = self.__dict__.copy() - state['c'] = np.array(self.c) + del state['c'] state['s'] = np.array(self.s) state['ops'] = np.array(self.ops) state['c_locs'] = np.array(self.c_locs) @@ -358,11 +389,16 @@ class WaveSimCuda(WaveSim): state['delays'] = np.array(self.delays) state['simctl_int'] = np.array(self.simctl_int) state['abuf'] = np.array(self.abuf) + state['h'] = np.array(self.h) + state['h_base'] = np.array(self.h_base) + state['line_use'] = np.array(self.line_use) + state['error_counts'] = np.array(self.error_counts) + state['retval_int'] = np.array(self.retval_int) return state def __setstate__(self, state): self.__dict__.update(state) - self.c = cuda.to_device(self.c) + self.c = cuda.to_device(np.full((self.c_len, self.sims), TMAX, dtype=np.float32)) self.s = cuda.to_device(self.s) self.ops = cuda.to_device(self.ops) self.c_locs = cuda.to_device(self.c_locs) @@ -370,6 +406,11 @@ class WaveSimCuda(WaveSim): self.delays = cuda.to_device(self.delays) self.simctl_int = cuda.to_device(self.simctl_int) self.abuf = cuda.to_device(self.abuf) + self.h = cuda.to_device(self.h) + self.h_base = cuda.to_device(self.h_base) + self.line_use = cuda.to_device(self.line_use) + self.error_counts = cuda.to_device(self.error_counts) + self.retval_int = cuda.to_device(self.retval_int) def s_to_c(self): grid_dim = self._grid_dim(self.sims, self.s_len) @@ -383,10 +424,18 @@ class WaveSimCuda(WaveSim): if op_from > op_start: continue if op_to is not None and op_to <= op_start: break grid_dim = self._grid_dim(sims, op_stop - op_start) - wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0), + wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0), sims, self.delays, self.simctl_int, seed) cuda.synchronize() + def c_prop_level(self, level, sims=None, seed=1): + sims = min(sims or self.sims, self.sims) + op_start = self.level_starts[level] + op_stop = self.level_stops[level] + grid_dim = self._grid_dim(sims, op_stop - op_start) + wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0), + sims, self.delays, self.simctl_int, seed) + def c_to_s(self, time=TMAX, sd=0.0, seed=1): grid_dim = self._grid_dim(self.sims, self.s_len) wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset, @@ -396,6 +445,52 @@ class WaveSimCuda(WaveSim): grid_dim = self._grid_dim(self.sims, self.s_len) ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset) + def set_base_hashes(self): + nitems = self.h_base.shape[0] * self.h_base.shape[1] + grid_dim = cdiv(nitems, 256) + memcpy_gpu[grid_dim, 256](self.h, self.h_base, nitems) + + def compare_hashes_level(self, lv): + self.retval_int[0] = 0 + grid_dim = self._grid_dim(self.sims, self.line_use_counts[lv]) + diff_hash_gpu[grid_dim, self._block_dim](self.h, self.h_base, self.line_use, self.line_use_offsets[lv], + self.line_use_counts[lv], self.retval_int) + return self.retval_int[0] + + def calc_error_counts(self, sims=None): + sims = min(sims or self.sims, self.sims) + grid_dim = cdiv(self.s_len, 256) + calc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts) + return np.array(self.error_counts) + + +@cuda.jit() +def memcpy_gpu (src, dst, nitems): + tid = cuda.grid(1) + stride = cuda.gridDim.x * cuda.blockDim.x + for i in range(tid, nitems, stride): + dst.flat[i] = src.flat[i] + + +@cuda.jit() +def diff_hash_gpu(hbuf1, hbuf2, h_locs, h_locs_offset, h_locs_cnt, differs): + x, y = cuda.grid(2) + if x >= hbuf1.shape[1]: return + if y >= h_locs_cnt: return + h_loc = h_locs[h_locs_offset+y] + if hbuf1[h_loc, x] != hbuf2[h_loc, x]: + differs[0] = 1 + + +@cuda.jit() +def calc_error_counts_gpu(s, sims, error_counts): + x = cuda.grid(1) + if x >= s.shape[1]: return + cnt = 0 + for i in range(sims): + cnt += (s[6,x,i] != s[8,x,i]) + error_counts[x] = cnt + @cuda.jit() def wave_assign_gpu(c, s, c_locs, ppi_offset): @@ -425,7 +520,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True) @cuda.jit() -def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): +def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): x, y = cuda.grid(2) sim = sim_start + x op_idx = op_start + y @@ -437,7 +532,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, a_wr = op[7] a_wf = op[8] - nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) + nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed) # accumulate WSA into abuf if a_loc >= 0: diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py index d09cfc1..996ca54 100644 --- a/tests/test_wave_sim.py +++ b/tests/test_wave_sim.py @@ -8,9 +8,10 @@ from kyupy.logic import mvarray def test_xnor2_delays(): op = (sim.XNOR2, 2, 0, 1, 3, 3, -1, 0, 0) #op = (0b0111, 4, 0, 1) - c = np.full((4*16, 1), TMAX) # 4 waveforms of capacity 16 + c = np.full((4*16, 1), TMAX, dtype=np.float32) # 4 waveforms of capacity 16 c_locs = np.zeros((4,), dtype='int') c_caps = np.zeros((4,), dtype='int') + h = np.zeros((4, 1), dtype=np.float32) for i in range(4): c_locs[i], c_caps[i] = i*16, 16 # 1:1 mapping delays = np.zeros((1, 4, 2, 2)) @@ -27,7 +28,7 @@ def test_xnor2_delays(): def wave_assert(inputs, output): for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i - wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, simctl_int) + wave_eval_cpu(op, c, c_locs, c_caps, h, 0, delays, simctl_int, 0) for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[2,i], v) wave_assert([[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # XNOR(1,1) => 1 @@ -40,9 +41,10 @@ def test_xnor2_delays(): def test_nand_delays(): op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0) #op = (0b0111, 4, 0, 1) - c = np.full((5*16, 1), TMAX) # 5 waveforms of capacity 16 + c = np.full((5*16, 1), TMAX, dtype=np.float32) # 5 waveforms of capacity 16 c_locs = np.zeros((5,), dtype='int') c_caps = np.zeros((5,), dtype='int') + h = np.zeros((5, 1), dtype=np.float32) for i in range(5): c_locs[i], c_caps[i] = i*16, 16 # 1:1 mapping @@ -64,7 +66,7 @@ def test_nand_delays(): def wave_assert(inputs, output): for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i - wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, simctl_int) + wave_eval_cpu(op, c, c_locs, c_caps, h, 0, delays, simctl_int, 0) for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v) wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1