Browse Source

cleanup

devel
Stefan Holst 2 years ago
parent
commit
f04f1b0012
  1. 30
      src/kyupy/logic_sim.py
  2. 29
      src/kyupy/sim.py
  3. 288
      src/kyupy/wave_sim.py

30
src/kyupy/logic_sim.py

@ -34,21 +34,6 @@ class LogicSim(sim.SimOps): @@ -34,21 +34,6 @@ class LogicSim(sim.SimOps):
self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
self.s[:,:,1,:] = 255 # unassigned
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
def __repr__(self):
    """Return a one-line summary of this simulator.

    The summary lists the circuit name, ``sims``, ``m`` and the byte size of
    ``self.c`` as formatted by ``hr_bytes``.
    """
    state_mem = hr_bytes(self.c.nbytes)
    return f'<LogicSim {self.circuit.name} sims={self.sims} m={self.m} state_mem={state_mem}>'
@ -81,15 +66,6 @@ class LogicSim(sim.SimOps): @@ -81,15 +66,6 @@ class LogicSim(sim.SimOps):
"""Propagate the input values towards the outputs (Perform all logic operations in topological order).
If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle.
Multiple clock cycles are simulated by an assign-propagate-capture loop:
.. code-block:: python
# initial state in state_bp
for cycle in range(10): # simulate 10 clock cycles
sim.assign(state_bp)
sim.propagate()
sim.capture(state_bp)
:param inject_cb: A callback function for manipulating intermediate signal values.
This function is called with a line and its new logic values (in bit-parallel format) after
@ -113,7 +89,7 @@ class LogicSim(sim.SimOps): @@ -113,7 +89,7 @@ class LogicSim(sim.SimOps):
elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
else: print(f'unknown sim {op}')
else: print(f'unknown op {op}')
inject_cb(o0, self.s[o0])
elif self.m == 4:
pass
@ -128,7 +104,7 @@ class LogicSim(sim.SimOps): @@ -128,7 +104,7 @@ class LogicSim(sim.SimOps):
elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
else: print(f'unknown sim {op}')
else: print(f'unknown op {op}')
if inject_cb is not None: inject_cb(o0, self.s[o0])
def s_ppo_to_ppi(self):
@ -169,4 +145,4 @@ def _prop_cpu(ops, c_locs, c): @@ -169,4 +145,4 @@ def _prop_cpu(ops, c_locs, c):
elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1])
elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1]
elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1])
else: print(f'unknown sim {op}')
else: print(f'unknown op {op}')

29
src/kyupy/sim.py

@ -145,7 +145,7 @@ class SimOps: @@ -145,7 +145,7 @@ class SimOps:
:param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
"""
def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False):
def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False):
self.circuit = circuit
dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()]
latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()]
@ -205,7 +205,7 @@ class SimOps: @@ -205,7 +205,7 @@ class SimOps:
print('unknown gate type', kind)
else:
ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx))
self.ops = np.asarray(ops, dtype='int32')
# create a map from fanout lines to stem lines for fork stripping
@ -244,19 +244,19 @@ class SimOps: @@ -244,19 +244,19 @@ class SimOps:
# state allocation table. maps line and interface indices to self.state memory locations
self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32)
self.c_caps = np.zeros((self.c_locs_len,), dtype=np.int32)
h = Heap()
# allocate and keep memory for special fields
self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1
self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1
self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(c_caps_min), c_caps_min
self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(c_caps_min), c_caps_min
ref_count[self.zero_idx] += 1
ref_count[self.tmp_idx] += 1
# allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
for i, n in enumerate(self.s_nodes):
if len(n.outs) > 0:
self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1
self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(c_caps_min), c_caps_min
ref_count[self.ppi_offset + i] += 1
if len(n.ins) > 0:
i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
@ -280,7 +280,7 @@ class SimOps: @@ -280,7 +280,7 @@ class SimOps:
if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx])
if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx])
o_idx = op[1]
cap = c_caps[o_idx]
cap = max(c_caps_min, c_caps[o_idx])
self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
if not keep_signals:
for loc in free_list:
@ -301,3 +301,18 @@ class SimOps: @@ -301,3 +301,18 @@ class SimOps:
from collections import defaultdict
self.prim_counts = defaultdict(int)
for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])

288
src/kyupy/wave_sim.py

@ -48,14 +48,8 @@ class WaveSim(sim.SimOps): @@ -48,14 +48,8 @@ class WaveSim(sim.SimOps):
memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
"""
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
assert c_caps > 0 and c_caps % 4 == 0
super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks)
self.sims = sims
self.c_len *= 4
self.c_locs[...] *= 4
self.c_caps[...] *= 4
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
self.delays[:, :delays.shape[1]] = delays
@ -87,21 +81,6 @@ class WaveSim(sim.SimOps): @@ -87,21 +81,6 @@ class WaveSim(sim.SimOps):
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)])
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
def __repr__(self):
    """Return a one-line summary of this simulator.

    The summary lists the concrete class name, the circuit name, ``sims``, the
    number of ops, the number of entries in ``level_starts`` and ``self.nbytes``
    as formatted by ``hr_bytes``.
    """
    cls_name = type(self).__name__
    n_ops = len(self.ops)
    n_levels = len(self.level_starts)
    mem = hr_bytes(self.nbytes)
    return (f'<{cls_name} {self.circuit.name} sims={self.sims} ops={n_ops} '
            f'levels={n_levels} mem={mem}>')
@ -156,131 +135,6 @@ class WaveSim(sim.SimOps): @@ -156,131 +135,6 @@ class WaveSim(sim.SimOps):
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
@numba.njit
def rand_gauss_cpu(seed, sd):
    """Return a deterministic pseudo-random factor centred on 1.0.

    Twelve pseudo-random values in [0, 1) are summed and shifted by -6.0,
    which approximates a standard normal deviate. The sum is scaled by ``sd``
    and rejected (and redrawn from the advanced seed) while its magnitude
    exceeds 0.5, so the result always lies in [0.5, 1.5].

    :param seed: Integer start state of the generator; the same seed always
        yields the same result.
    :param sd: Scale applied to the deviate. For ``sd <= 0`` no randomness is
        applied and exactly 1.0 is returned.
    :return: ``1.0 + deviate`` as a float.
    """
    if sd <= 0.0:
        return 1.0
    limit = 0.5
    while True:
        deviate = -6.0
        for _ in range(12):
            # linear congruential update of the generator state
            seed = int(0xDEECE66D) * seed + 0xB
            # bits 8..31 of the state give a 24-bit fraction in [0, 1)
            deviate += float((seed >> 8) & 0xffffff) / float(1 << 24)
        deviate *= sd
        if abs(deviate) <= limit:
            return deviate + 1.0
@numba.njit
def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
# Evaluate a single op for one column (st_idx) of cbuf: the time entries of the
# four input lines a..d are merged in ascending time order and the resulting
# entries of output line z are written back into cbuf. Nothing is returned.
#
# op         -- six values (lut, z_idx, a_idx, b_idx, c_idx, d_idx). Bit i of lut is
#               the output value for input bitmask i (see the lookup in the loop).
# cbuf       -- 2-D buffer indexed [row, st_idx]; a line's entries occupy consecutive
#               rows starting at c_locs[line].
# c_locs     -- start row in cbuf for every line index.
# c_caps     -- number of rows reserved for every line index (only read for z here).
# line_times -- indexed [line, 0, v] for the time added to an input entry and
#               [line, 1, v] for the pulse-width threshold, v being an output value.
# param      -- param[0] scales all line_times values; int(param[1]) selects one line
#               index that additionally gets param[2 + v] added.
# sd, seed   -- passed on to rand_gauss_cpu() to vary each line_times value.
#
# NOTE(review): TMIN, TMAX and TMAX_OVL are defined outside this view; presumably they
# are the "before all events", "no more events" and "overflowed" markers -- confirm.
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
# number of output entries that did not fit into z's reserved rows
overflows = int(0)
# base seed that differs per seed, output line and st_idx
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
# start rows of the involved lines in cbuf and the capacity of the output
a_mem = c_locs[a_idx]
b_mem = c_locs[b_idx]
c_mem = c_locs[c_idx]
d_mem = c_locs[d_idx]
z_mem = c_locs[z_idx]
z_cap = c_caps[z_idx]
# read cursors into the input lines
a_cur = int(0)
b_cur = int(0)
c_cur = int(0)
d_cur = int(0)
# z_cur is the write cursor of the output; its lowest bit is used as the current
# output value. It starts at lut bit 0, i.e. the output value for inputs == 0.
z_cur = lut & 1
if z_cur == 1:
# output starts at 1: occupy the first entry with TMIN so that the parity matches
cbuf[z_mem, st_idx] = TMIN
# first pending time of every input: its first entry plus the (randomized, scaled) time
a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
if int(param[1]) == a_idx: a += param[2+z_cur]
b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
if int(param[1]) == b_idx: b += param[2+z_cur]
c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
if int(param[1]) == c_idx: c += param[2+z_cur]
d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
if int(param[1]) == d_idx: d += param[2+z_cur]
# time of the most recent entry kept in the output
previous_t = TMIN
# earliest pending input time
current_t = min(a, b, c, d)
# bitmask of current input values (bit 0 = a, 1 = b, 2 = c, 3 = d)
inputs = int(0)
# process pending input times in ascending order until only values >= TMAX remain
while current_t < TMAX:
z_val = z_cur & 1
# consume the input whose pending time equals current_t (a is checked first, d last):
# advance its cursor, load its next time using index z_val ^ 1, compute the
# threshold using index z_val, flip its bit in inputs and remember the new time
if a == current_t:
a_cur += 1
a = cbuf[a_mem + a_cur, st_idx]
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
if int(param[1]) == a_idx:
a += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 1
next_t = a
elif b == current_t:
b_cur += 1
b = cbuf[b_mem + b_cur, st_idx]
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
if int(param[1]) == b_idx:
b += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 2
next_t = b
elif c == current_t:
c_cur += 1
c = cbuf[c_mem + c_cur, st_idx]
c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
if int(param[1]) == c_idx:
c += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 4
next_t = c
else:
d_cur += 1
d = cbuf[d_mem + d_cur, st_idx]
d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
if int(param[1]) == d_idx:
d += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 8
next_t = d
# the output has to change when its current value differs from the lut entry
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
# following toggle is earlier OR
# pulse is wide enough ) AND enough space in z_mem.
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
# the last reserved row is kept free for the terminating entry written at the end
if z_cur < (z_cap - 1):
cbuf[z_mem + z_cur, st_idx] = current_t
previous_t = current_t
z_cur += 1
else:
# no space left: count the overflow and drop the most recent entry instead
overflows += 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx]
z_cur -= 1
else:
# pulse too narrow: remove the previous entry instead of adding a new one
z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
current_t = min(a, b, c, d)
# generate or propagate overflow flag
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed= @@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=
# generate or propagate overflow flag
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
overflows = 0
@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset): @@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset):
c[c_loc+2, x] = TMAX
@cuda.jit(device=True)
def rand_gauss_gpu(seed, sd):
    """Return a deterministic pseudo-random factor centred on 1.0 (device function).

    Twelve pseudo-random values in [0, 1) are summed and shifted by -6.0,
    which approximates a standard normal deviate. The sum is scaled by ``sd``
    and rejected (and redrawn from the advanced seed) while its magnitude
    exceeds 0.5, so the result always lies in [0.5, 1.5].

    :param seed: Integer start state of the generator; the same seed always
        yields the same result.
    :param sd: Scale applied to the deviate. For ``sd <= 0`` no randomness is
        applied and exactly 1.0 is returned.
    :return: ``1.0 + deviate`` as a float.
    """
    if sd <= 0.0:
        return 1.0
    limit = 0.5
    while True:
        deviate = -6.0
        for _ in range(12):
            # linear congruential update of the generator state
            seed = int(0xDEECE66D) * seed + 0xB
            # bits 8..31 of the state give a 24-bit fraction in [0, 1)
            deviate += float((seed >> 8) & 0xffffff) / float(1 << 24)
        deviate *= sd
        if abs(deviate) <= limit:
            return deviate + 1.0
@cuda.jit()
def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
# CUDA kernel: every thread of the 2-D grid evaluates one op (grid y) for one column
# st_idx of cbuf (grid x). The time entries of the four input lines a..d are merged in
# ascending time order and the resulting entries of output line z are written to cbuf.
#
# ops                -- 2-D array; row op_idx holds (lut, z_idx, a_idx, b_idx, c_idx, d_idx).
#                       Bit i of lut is the output value for input bitmask i.
# op_start, op_stop  -- half-open range of rows in ops handled by this launch.
# cbuf               -- 2-D buffer indexed [row, st_idx]; a line's entries occupy
#                       consecutive rows starting at c_locs[line].
# c_locs, c_caps     -- start row and number of reserved rows in cbuf per line index.
# st_start, st_stop  -- half-open range of st_idx values handled by this launch.
# line_times         -- indexed [line, 0, v] for the time added to an input entry and
#                       [line, 1, v] for the pulse-width threshold, v being an output value.
# param              -- indexed by st_idx; in each row, [0] scales all line_times values and
#                       int([1]) selects one line index that additionally gets [2 + v] added.
# sd, seed           -- passed on to rand_gauss_gpu() to vary each line_times value.
#
# NOTE(review): TMIN, TMAX and TMAX_OVL are defined outside this view; presumably they
# are the "before all events", "no more events" and "overflowed" markers -- confirm.
x, y = cuda.grid(2)
st_idx = st_start + x
op_idx = op_start + y
# threads outside the requested ranges have nothing to do
if st_idx >= st_stop: return
if op_idx >= op_stop: return
lut = ops[op_idx, 0]
z_idx = ops[op_idx, 1]
a_idx = ops[op_idx, 2]
b_idx = ops[op_idx, 3]
c_idx = ops[op_idx, 4]
d_idx = ops[op_idx, 5]
# from here on, param refers to the row belonging to this st_idx
param = param[st_idx]
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
# number of output entries that did not fit into z's reserved rows
overflows = int(0)
# base seed that differs per seed, output line and st_idx
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
# start rows of the involved lines in cbuf and the capacity of the output
a_mem = c_locs[a_idx]
b_mem = c_locs[b_idx]
c_mem = c_locs[c_idx]
d_mem = c_locs[d_idx]
z_mem = c_locs[z_idx]
z_cap = c_caps[z_idx]
# read cursors into the input lines
a_cur = int(0)
b_cur = int(0)
c_cur = int(0)
d_cur = int(0)
# z_cur is the write cursor of the output; its lowest bit is used as the current
# output value. It starts at lut bit 0, i.e. the output value for inputs == 0.
z_cur = lut & 1
if z_cur == 1:
# output starts at 1: occupy the first entry with TMIN so that the parity matches
cbuf[z_mem, st_idx] = TMIN
# first pending time of every input: its first entry plus the (randomized, scaled) time
a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
if int(param[1]) == a_idx: a += param[2+z_cur]
b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
if int(param[1]) == b_idx: b += param[2+z_cur]
c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
if int(param[1]) == c_idx: c += param[2+z_cur]
d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
if int(param[1]) == d_idx: d += param[2+z_cur]
# time of the most recent entry kept in the output
previous_t = TMIN
# earliest pending input time
current_t = min(a, b, c, d)
# bitmask of current input values (bit 0 = a, 1 = b, 2 = c, 3 = d)
inputs = int(0)
# process pending input times in ascending order until only values >= TMAX remain
while current_t < TMAX:
z_val = z_cur & 1
# consume the input whose pending time equals current_t (a is checked first, d last):
# advance its cursor, load its next time using index z_val ^ 1, compute the
# threshold using index z_val, flip its bit in inputs and remember the new time
if a == current_t:
a_cur += 1
a = cbuf[a_mem + a_cur, st_idx]
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
if int(param[1]) == a_idx:
a += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 1
next_t = a
elif b == current_t:
b_cur += 1
b = cbuf[b_mem + b_cur, st_idx]
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
if int(param[1]) == b_idx:
b += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 2
next_t = b
elif c == current_t:
c_cur += 1
c = cbuf[c_mem + c_cur, st_idx]
c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
if int(param[1]) == c_idx:
c += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 4
next_t = c
else:
d_cur += 1
d = cbuf[d_mem + d_cur, st_idx]
d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
if int(param[1]) == d_idx:
d += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 8
next_t = d
# the output has to change when its current value differs from the lut entry
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
# following toggle is earlier OR
# pulse is wide enough ) AND enough space in z_mem.
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
# the last reserved row is kept free for the terminating entry written at the end
if z_cur < (z_cap - 1):
cbuf[z_mem + z_cur, st_idx] = current_t
previous_t = current_t
z_cur += 1
else:
# no space left: count the overflow and drop the most recent entry instead
overflows += 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx]
z_cur -= 1
else:
# pulse too narrow: remove the previous entry instead of adding a new one
z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
current_t = min(a, b, c, d)
# generate or propagate overflow flag
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@cuda.jit()
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
x, y = cuda.grid(2)
@ -759,7 +477,7 @@ def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed): @@ -759,7 +477,7 @@ def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed):
x, y = cuda.grid(2)
if ppo_offset + y >= len(c_locs): return
line = c_locs[ppo_offset + y]
tdim = c_caps[ppo_offset + y]
tdim = c_caps[ppo_offset + y]
if line < 0: return
if x >= c.shape[-1]: return
vector = x

Loading…
Cancel
Save