From f04f1b0012e5680791b06c1f7ba47bc00befc456 Mon Sep 17 00:00:00 2001 From: Stefan Holst Date: Mon, 20 Mar 2023 10:31:55 +0900 Subject: [PATCH] cleanup --- src/kyupy/logic_sim.py | 30 +---- src/kyupy/sim.py | 29 ++++- src/kyupy/wave_sim.py | 288 +---------------------------------------- 3 files changed, 28 insertions(+), 319 deletions(-) diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py index 8eace75..b6d20a4 100644 --- a/src/kyupy/logic_sim.py +++ b/src/kyupy/logic_sim.py @@ -34,21 +34,6 @@ class LogicSim(sim.SimOps): self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8) self.s[:,:,1,:] = 255 # unassigned - self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) - self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0) - self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes)) - - self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs]) - self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs]) - - self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs] - self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs] - self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs] - self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs] - - self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) - self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) - def __repr__(self): return f'' @@ -81,15 +66,6 @@ class LogicSim(sim.SimOps): """Propagate the input values towards the outputs (Perform all logic operations in topological order). If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle. - Multiple clock cycles are simulated by a assign-propagate-capture loop: - - .. code-block:: python - - # initial state in state_bp - for cycle in range(10): # simulate 10 clock cycles - sim.assign(state_bp) - sim.propagate() - sim.capture(state_bp) :param inject_cb: A callback function for manipulating intermediate signal values. This function is called with a line and its new logic values (in bit-parallel format) after @@ -113,7 +89,7 @@ class LogicSim(sim.SimOps): elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1]) elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1] elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1]) - else: print(f'unknown sim {op}') + else: print(f'unknown op {op}') inject_cb(o0, self.s[o0]) elif self.m == 4: pass @@ -128,7 +104,7 @@ class LogicSim(sim.SimOps): elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0]) elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]) elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0]) - else: print(f'unknown sim {op}') + else: print(f'unknown op {op}') if inject_cb is not None: inject_cb(o0, self.s[o0]) def s_ppo_to_ppi(self): @@ -169,4 +145,4 @@ def _prop_cpu(ops, c_locs, c): elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1]) elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1] elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1]) - else: print(f'unknown sim {op}') + else: print(f'unknown op {op}') diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py index 29da8fc..6bc5961 100644 --- a/src/kyupy/sim.py +++ b/src/kyupy/sim.py @@ -145,7 +145,7 @@ class SimOps: :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces memory footprint, but intermediate signal waveforms become unaccessible after a propagation. """ - def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False): + def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False): self.circuit = circuit dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()] latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()] @@ -205,7 +205,7 @@ class SimOps: print('unknown gate type', kind) else: ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx)) - + self.ops = np.asarray(ops, dtype='int32') # create a map from fanout lines to stem lines for fork stripping @@ -244,19 +244,19 @@ class SimOps: # state allocation table. maps line and interface indices to self.state memory locations self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32) self.c_caps = np.zeros((self.c_locs_len,), dtype=np.int32) - + h = Heap() # allocate and keep memory for special fields - self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1 - self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1 + self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(c_caps_min), c_caps_min + self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(c_caps_min), c_caps_min ref_count[self.zero_idx] += 1 ref_count[self.tmp_idx] += 1 # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later) for i, n in enumerate(self.s_nodes): if len(n.outs) > 0: - self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1 + self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(c_caps_min), c_caps_min ref_count[self.ppi_offset + i] += 1 if len(n.ins) > 0: i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0] @@ -280,7 +280,7 @@ class SimOps: if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx]) if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx]) o_idx = op[1] - cap = c_caps[o_idx] + cap = max(c_caps_min, c_caps[o_idx]) self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap if not keep_signals: for loc in free_list: @@ -301,3 +301,18 @@ class SimOps: from collections import defaultdict self.prim_counts = defaultdict(int) for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1 + + self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) + self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0) + self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes)) + + self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs]) + self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs]) + + self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs] + self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs] + self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs] + self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs] + + self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) + self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py index a6feffd..647872f 100644 --- a/src/kyupy/wave_sim.py +++ b/src/kyupy/wave_sim.py @@ -48,14 +48,8 @@ class WaveSim(sim.SimOps): memory footprint, but intermediate signal waveforms become unaccessible after a propagation. """ def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): - assert c_caps > 0 and c_caps % 4 == 0 - super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks) + super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks) self.sims = sims - - self.c_len *= 4 - self.c_locs[...] *= 4 - self.c_caps[...] *= 4 - self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) self.delays[:, :delays.shape[1]] = delays @@ -87,21 +81,6 @@ class WaveSim(sim.SimOps): self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)]) - self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) - self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0) - self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes)) - - self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs]) - self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs]) - - self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs] - self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs] - self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs] - self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs] - - self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) - self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) - def __repr__(self): return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \ f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>' @@ -156,131 +135,6 @@ class WaveSim(sim.SimOps): self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] -@numba.njit -def rand_gauss_cpu(seed, sd): - clamp = 0.5 - if sd <= 0.0: - return 1.0 - while True: - x = -6.0 - for _ in range(12): - seed = int(0xDEECE66D) * seed + 0xB - x += float((seed >> 8) & 0xffffff) / float(1 << 24) - x *= sd - if abs(x) <= clamp: - break - return x + 1.0 - - -@numba.njit -def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0): - lut, z_idx, a_idx, b_idx, c_idx, d_idx = op - - # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>> - overflows = int(0) - - _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) - - a_mem = c_locs[a_idx] - b_mem = c_locs[b_idx] - c_mem = c_locs[c_idx] - d_mem = c_locs[d_idx] - z_mem = c_locs[z_idx] - z_cap = c_caps[z_idx] - - a_cur = int(0) - b_cur = int(0) - c_cur = int(0) - d_cur = int(0) - z_cur = lut & 1 - if z_cur == 1: - cbuf[z_mem, st_idx] = TMIN - - a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0] - if int(param[1]) == a_idx: a += param[2+z_cur] - b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0] - if int(param[1]) == b_idx: b += param[2+z_cur] - c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0] - if int(param[1]) == c_idx: c += param[2+z_cur] - d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0] - if int(param[1]) == d_idx: d += param[2+z_cur] - - previous_t = TMIN - - current_t = min(a, b, c, d) - inputs = int(0) - - while current_t < TMAX: - z_val = z_cur & 1 - if a == current_t: - a_cur += 1 - a = cbuf[a_mem + a_cur, st_idx] - a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0] - if int(param[1]) == a_idx: - a += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 1 - next_t = a - - elif b == current_t: - b_cur += 1 - b = cbuf[b_mem + b_cur, st_idx] - b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0] - if int(param[1]) == b_idx: - b += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 2 - next_t = b - - elif c == current_t: - c_cur += 1 - c = cbuf[c_mem + c_cur, st_idx] - c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0] - if int(param[1]) == c_idx: - c += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 4 - next_t = c - - else: - d_cur += 1 - d = cbuf[d_mem + d_cur, st_idx] - d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0] - if int(param[1]) == d_idx: - d += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 8 - next_t = d - - if (z_cur & 1) != ((lut >> inputs) & 1): - # we generate a toggle in z_mem, if: - # ( it is the first toggle in z_mem OR - # following toggle is earlier OR - # pulse is wide enough ) AND enough space in z_mem. - if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh: - if z_cur < (z_cap - 1): - cbuf[z_mem + z_cur, st_idx] = current_t - previous_t = current_t - z_cur += 1 - else: - overflows += 1 - previous_t = cbuf[z_mem + z_cur - 1, st_idx] - z_cur -= 1 - else: - z_cur -= 1 - previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN - - current_t = min(a, b, c, d) - - # generate or propagate overflow flag - cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) - - - @numba.njit def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0): lut, z_idx, a_idx, b_idx, c_idx, d_idx = op @@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed= # generate or propagate overflow flag cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) + @numba.njit def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed): overflows = 0 @@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset): c[c_loc+2, x] = TMAX -@cuda.jit(device=True) -def rand_gauss_gpu(seed, sd): - clamp = 0.5 - if sd <= 0.0: - return 1.0 - while True: - x = -6.0 - for _ in range(12): - seed = int(0xDEECE66D) * seed + 0xB - x += float((seed >> 8) & 0xffffff) / float(1 << 24) - x *= sd - if abs(x) <= clamp: - break - return x + 1.0 - - -@cuda.jit() -def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed): - x, y = cuda.grid(2) - st_idx = st_start + x - op_idx = op_start + y - if st_idx >= st_stop: return - if op_idx >= op_stop: return - - lut = ops[op_idx, 0] - z_idx = ops[op_idx, 1] - a_idx = ops[op_idx, 2] - b_idx = ops[op_idx, 3] - c_idx = ops[op_idx, 4] - d_idx = ops[op_idx, 5] - - param = param[st_idx] - - # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>> - overflows = int(0) - - _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) - - a_mem = c_locs[a_idx] - b_mem = c_locs[b_idx] - c_mem = c_locs[c_idx] - d_mem = c_locs[d_idx] - z_mem = c_locs[z_idx] - z_cap = c_caps[z_idx] - - a_cur = int(0) - b_cur = int(0) - c_cur = int(0) - d_cur = int(0) - z_cur = lut & 1 - if z_cur == 1: - cbuf[z_mem, st_idx] = TMIN - - a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0] - if int(param[1]) == a_idx: a += param[2+z_cur] - b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0] - if int(param[1]) == b_idx: b += param[2+z_cur] - c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0] - if int(param[1]) == c_idx: c += param[2+z_cur] - d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0] - if int(param[1]) == d_idx: d += param[2+z_cur] - - previous_t = TMIN - - current_t = min(a, b, c, d) - inputs = int(0) - - while current_t < TMAX: - z_val = z_cur & 1 - if a == current_t: - a_cur += 1 - a = cbuf[a_mem + a_cur, st_idx] - a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0] - if int(param[1]) == a_idx: - a += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 1 - next_t = a - - elif b == current_t: - b_cur += 1 - b = cbuf[b_mem + b_cur, st_idx] - b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0] - if int(param[1]) == b_idx: - b += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 2 - next_t = b - - elif c == current_t: - c_cur += 1 - c = cbuf[c_mem + c_cur, st_idx] - c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0] - if int(param[1]) == c_idx: - c += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 4 - next_t = c - - else: - d_cur += 1 - d = cbuf[d_mem + d_cur, st_idx] - d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0] - thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0] - if int(param[1]) == d_idx: - d += param[2+(z_val^1)] - thresh += param[2+z_val] - inputs ^= 8 - next_t = d - - if (z_cur & 1) != ((lut >> inputs) & 1): - # we generate a toggle in z_mem, if: - # ( it is the first toggle in z_mem OR - # following toggle is earlier OR - # pulse is wide enough ) AND enough space in z_mem. - if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh: - if z_cur < (z_cap - 1): - cbuf[z_mem + z_cur, st_idx] = current_t - previous_t = current_t - z_cur += 1 - else: - overflows += 1 - previous_t = cbuf[z_mem + z_cur - 1, st_idx] - z_cur -= 1 - else: - z_cur -= 1 - previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN - - current_t = min(a, b, c, d) - - # generate or propagate overflow flag - cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) - - @cuda.jit() def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed): x, y = cuda.grid(2) @@ -759,7 +477,7 @@ def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed): x, y = cuda.grid(2) if ppo_offset + y >= len(c_locs): return line = c_locs[ppo_offset + y] - tdim = c_caps[ppo_offset + y] + tdim = c_caps[ppo_offset + y] if line < 0: return if x >= c.shape[-1]: return vector = x