Browse Source

wsa accumulation in wavesim

devel
Stefan Holst 1 year ago
parent
commit
afb0a64953
  1. 8
      src/kyupy/logic_sim.py
  2. 18
      src/kyupy/sim.py
  3. 64
      src/kyupy/wave_sim.py
  4. 2
      tests/test_wave_sim.py

8
src/kyupy/logic_sim.py

@ -73,7 +73,7 @@ class LogicSim(sim.SimOps):
if inject_cb is None: if inject_cb is None:
_prop_cpu(self.ops, self.c_locs, self.c[...,:nbytes]) _prop_cpu(self.ops, self.c_locs, self.c[...,:nbytes])
else: else:
for op, o0, i0, i1, i2, i3 in self.ops: for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)] o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
if op == sim.BUF1: self.c[o0]=self.c[i0] if op == sim.BUF1: self.c[o0]=self.c[i0]
elif op == sim.INV1: self.c[o0] = ~self.c[i0] elif op == sim.INV1: self.c[o0] = ~self.c[i0]
@ -107,7 +107,7 @@ class LogicSim(sim.SimOps):
else: print(f'unknown op {op}') else: print(f'unknown op {op}')
inject_cb(o0, self.s[o0]) inject_cb(o0, self.s[o0])
elif self.m == 4: elif self.m == 4:
for op, o0, i0, i1, i2, i3 in self.ops: for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)] o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
if op == sim.BUF1: self.c[o0]=self.c[i0] if op == sim.BUF1: self.c[o0]=self.c[i0]
elif op == sim.INV1: logic.bp4v_not(self.c[o0], self.c[i0]) elif op == sim.INV1: logic.bp4v_not(self.c[o0], self.c[i0])
@ -168,7 +168,7 @@ class LogicSim(sim.SimOps):
logic.bp4v_or(self.c[o0], self.c[self.c_locs[self.tmp_idx]], self.c[self.c_locs[self.tmp2_idx]]) logic.bp4v_or(self.c[o0], self.c[self.c_locs[self.tmp_idx]], self.c[self.c_locs[self.tmp2_idx]])
else: print(f'unknown op {op}') else: print(f'unknown op {op}')
else: else:
for op, o0, i0, i1, i2, i3 in self.ops: for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)] o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
if op == sim.BUF1: self.c[o0]=self.c[i0] if op == sim.BUF1: self.c[o0]=self.c[i0]
elif op == sim.INV1: logic.bp8v_not(self.c[o0], self.c[i0]) elif op == sim.INV1: logic.bp8v_not(self.c[o0], self.c[i0])
@ -263,7 +263,7 @@ class LogicSim(sim.SimOps):
@numba.njit @numba.njit
def _prop_cpu(ops, c_locs, c): def _prop_cpu(ops, c_locs, c):
for op, o0, i0, i1, i2, i3 in ops: for op, o0, i0, i1, i2, i3 in ops[:,:6]:
o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)] o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)]
if op == sim.BUF1: c[o0]=c[i0] if op == sim.BUF1: c[o0]=c[i0]
elif op == sim.INV1: c[o0] = ~c[i0] elif op == sim.INV1: c[o0] = ~c[i0]

18
src/kyupy/sim.py

@ -146,13 +146,17 @@ class SimOps:
:param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
""" """
def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False): def __init__(self, circuit, c_caps=1, c_caps_min=1, a_ctrl=None, c_reuse=False, strip_forks=False):
self.circuit = circuit self.circuit = circuit
self.s_len = len(circuit.s_nodes) self.s_len = len(circuit.s_nodes)
if isinstance(c_caps, int): if isinstance(c_caps, int):
c_caps = [c_caps] * len(circuit.lines) c_caps = [c_caps] * len(circuit.lines)
if a_ctrl is None:
a_ctrl = np.zeros((len(circuit.lines), 3), dtype=np.int32)
a_ctrl[:,0] = -1
# special locations and offsets in c_locs/c_caps # special locations and offsets in c_locs/c_caps
self.zero_idx = len(circuit.lines) self.zero_idx = len(circuit.lines)
self.tmp_idx = self.zero_idx + 1 self.tmp_idx = self.zero_idx + 1
@ -168,14 +172,14 @@ class SimOps:
if n in interface_dict: if n in interface_dict:
inp_idx = self.ppi_offset + interface_dict[n] inp_idx = self.ppi_offset + interface_dict[n]
if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI
ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[0]]))
if 'dff' in n.kind.lower(): # second output of DFF is inverted if 'dff' in n.kind.lower(): # second output of DFF is inverted
if len(n.outs) > 1 and n.outs[1] is not None: if len(n.outs) > 1 and n.outs[1] is not None:
ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[1]]))
else: # if not DFF, no output is inverted. else: # if not DFF, no output is inverted.
for o_line in n.outs[1:]: for o_line in n.outs[1:]:
if o_line is not None: if o_line is not None:
ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[o_line]))
continue continue
# regular node, not PI/PPI or PO/PPO # regular node, not PI/PPI or PO/PPO
o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
@ -188,7 +192,7 @@ class SimOps:
if not strip_forks: if not strip_forks:
for o_line in n.outs: for o_line in n.outs:
if o_line is not None: if o_line is not None:
ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx)) ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o_line]))
continue continue
sp = None sp = None
for prefix, prims in kind_prefixes.items(): for prefix, prims in kind_prefixes.items():
@ -202,7 +206,7 @@ class SimOps:
if sp is None: if sp is None:
print('unknown cell type', kind) print('unknown cell type', kind)
else: else:
ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx)) ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o0_idx]))
self.ops = np.asarray(ops, dtype='int32') self.ops = np.asarray(ops, dtype='int32')
@ -299,7 +303,7 @@ class SimOps:
self.c_len = h.max_size self.c_len = h.max_size
d = defaultdict(int) d = defaultdict(int)
for op, _, _, _, _, _ in self.ops: d[names[op]] += 1 for op in self.ops[:,0]: d[names[op]] += 1
self.prim_counts = dict(d) self.prim_counts = dict(d)
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)

64
src/kyupy/wave_sim.py

@ -47,8 +47,8 @@ class WaveSim(sim.SimOps):
:param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces :param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
""" """
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): def __init__(self, circuit, delays, sims=8, c_caps=16, a_ctrl=None, c_reuse=False, strip_forks=False):
super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks) super().__init__(circuit, c_caps=c_caps, c_caps_min=4, a_ctrl=a_ctrl, c_reuse=c_reuse, strip_forks=strip_forks)
self.sims = sims self.sims = sims
if delays.ndim == 3: delays = np.expand_dims(delays, axis=0) if delays.ndim == 3: delays = np.expand_dims(delays, axis=0)
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
@ -78,6 +78,9 @@ class WaveSim(sim.SimOps):
final values in the waveforms are still valid. final values in the waveforms are still valid.
""" """
self.abuf_len = self.ops[:,6].max() + 1
self.abuf = np.zeros((self.abuf_len, sims), dtype=np.int32) if self.abuf_len > 0 else np.zeros((1, 1), dtype=np.int32)
self.simctl_int = np.zeros((2, sims), dtype=np.int32) self.simctl_int = np.zeros((2, sims), dtype=np.int32)
"""Per-simulation delay configuration. """Per-simulation delay configuration.
@ -113,7 +116,7 @@ class WaveSim(sim.SimOps):
""" """
sims = min(sims or self.sims, self.sims) sims = min(sims or self.sims, self.sims)
for op_start, op_stop in zip(self.level_starts, self.level_stops): for op_start, op_stop in zip(self.level_starts, self.level_stops):
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims, self.delays, self.simctl_int, seed) level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
def c_to_s(self, time=TMAX, sd=0.0, seed=1): def c_to_s(self, time=TMAX, sd=0.0, seed=1):
"""Simulates a capture operation at all sequential elements and primary outputs. """Simulates a capture operation at all sequential elements and primary outputs.
@ -141,9 +144,16 @@ class WaveSim(sim.SimOps):
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
overflows = int(0) overflows = int(0)
lut = op[0]
z_idx = op[1]
a_idx = op[2]
b_idx = op[3]
c_idx = op[4]
d_idx = op[5]
if len(delays) > 1: if len(delays) > 1:
if simctl_int[1] == 0: if simctl_int[1] == 0:
delays = delays[seed] delays = delays[seed]
@ -240,22 +250,26 @@ def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim
# generate or propagate overflow flag # generate or propagate overflow flag
cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN))
nfall = z_cur // 2
_wave_eval_cpu = numba.njit(_wave_eval) return nrise, nfall
@numba.njit wave_eval_cpu = numba.njit(_wave_eval)
def wave_eval_cpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
_wave_eval_cpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed)
@numba.njit @numba.njit
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, sim_start, sim_stop, delays, simctl_int, seed): def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed):
for op_idx in range(op_start, op_stop): for op_idx in range(op_start, op_stop):
op = ops[op_idx] op = ops[op_idx]
for sim in range(sim_start, sim_stop): for sim in range(sim_start, sim_stop):
wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed)
a_loc = op[6]
a_wr = op[7]
a_wf = op[8]
if a_loc >= 0:
abuf[a_loc, sim] += nrise*a_wr + nfall*a_wf
@numba.njit @numba.njit
@ -311,8 +325,8 @@ class WaveSimCuda(WaveSim):
All internal memories are mirrored into GPU memory upon construction. All internal memories are mirrored into GPU memory upon construction.
Some operations like access to single waveforms can involve large communication overheads. Some operations like access to single waveforms can involve large communication overheads.
""" """
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): def __init__(self, circuit, delays, sims=8, c_caps=16, a_ctrl=None, c_reuse=False, strip_forks=False):
super().__init__(circuit, delays, sims, c_caps, c_reuse, strip_forks) super().__init__(circuit, delays, sims, c_caps, a_ctrl=a_ctrl, c_reuse=c_reuse, strip_forks=strip_forks)
self.c = cuda.to_device(self.c) self.c = cuda.to_device(self.c)
self.s = cuda.to_device(self.s) self.s = cuda.to_device(self.s)
@ -321,6 +335,7 @@ class WaveSimCuda(WaveSim):
self.c_caps = cuda.to_device(self.c_caps) self.c_caps = cuda.to_device(self.c_caps)
self.delays = cuda.to_device(self.delays) self.delays = cuda.to_device(self.delays)
self.simctl_int = cuda.to_device(self.simctl_int) self.simctl_int = cuda.to_device(self.simctl_int)
self.abuf = cuda.to_device(self.abuf)
self._block_dim = (32, 16) self._block_dim = (32, 16)
@ -333,6 +348,7 @@ class WaveSimCuda(WaveSim):
state['c_caps'] = np.array(self.c_caps) state['c_caps'] = np.array(self.c_caps)
state['delays'] = np.array(self.delays) state['delays'] = np.array(self.delays)
state['simctl_int'] = np.array(self.simctl_int) state['simctl_int'] = np.array(self.simctl_int)
state['abuf'] = np.array(self.abuf)
return state return state
def __setstate__(self, state): def __setstate__(self, state):
@ -344,6 +360,7 @@ class WaveSimCuda(WaveSim):
self.c_caps = cuda.to_device(self.c_caps) self.c_caps = cuda.to_device(self.c_caps)
self.delays = cuda.to_device(self.delays) self.delays = cuda.to_device(self.delays)
self.simctl_int = cuda.to_device(self.simctl_int) self.simctl_int = cuda.to_device(self.simctl_int)
self.abuf = cuda.to_device(self.abuf)
def s_to_c(self): def s_to_c(self):
grid_dim = self._grid_dim(self.sims, self.s_len) grid_dim = self._grid_dim(self.sims, self.s_len)
@ -355,7 +372,7 @@ class WaveSimCuda(WaveSim):
sims = min(sims or self.sims, self.sims) sims = min(sims or self.sims, self.sims)
for op_start, op_stop in zip(self.level_starts, self.level_stops): for op_start, op_stop in zip(self.level_starts, self.level_stops):
grid_dim = self._grid_dim(sims, op_stop - op_start) grid_dim = self._grid_dim(sims, op_stop - op_start)
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0), wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0),
sims, self.delays, self.simctl_int, seed) sims, self.delays, self.simctl_int, seed)
cuda.synchronize() cuda.synchronize()
@ -397,21 +414,24 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
@cuda.jit() @cuda.jit()
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, sim_start, sim_stop, delays, simctl_int, seed): def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed):
x, y = cuda.grid(2) x, y = cuda.grid(2)
sim = sim_start + x sim = sim_start + x
op_idx = op_start + y op_idx = op_start + y
if sim >= sim_stop: return if sim >= sim_stop: return
if op_idx >= op_stop: return if op_idx >= op_stop: return
lut = ops[op_idx, 0] op = ops[op_idx]
z_idx = ops[op_idx, 1] a_loc = op[6]
a_idx = ops[op_idx, 2] a_wr = op[7]
b_idx = ops[op_idx, 3] a_wf = op[8]
c_idx = ops[op_idx, 4]
d_idx = ops[op_idx, 5] nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed)
_wave_eval_gpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) # accumulate WSA into abuf
if a_loc >= 0:
#abuf[a_loc, sim] += nrise*a_wr + nfall*a_wf
cuda.atomic.add(abuf, (a_loc, sim), nrise*a_wr + nfall*a_wf)
@cuda.jit() @cuda.jit()

2
tests/test_wave_sim.py

@ -6,7 +6,7 @@ from kyupy import logic, bench, sim
from kyupy.logic import mvarray from kyupy.logic import mvarray
def test_nand_delays(): def test_nand_delays():
op = (sim.NAND4, 4, 0, 1, 2, 3) op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0)
#op = (0b0111, 4, 0, 1) #op = (0b0111, 4, 0, 1)
c = np.full((5*16, 1), TMAX) # 5 waveforms of capacity 16 c = np.full((5*16, 1), TMAX) # 5 waveforms of capacity 16
c_locs = np.zeros((5,), dtype='int') c_locs = np.zeros((5,), dtype='int')

Loading…
Cancel
Save