Compare commits

...

2 Commits

Author SHA1 Message Date
Stefan Holst aa7536b8b0 line use and diff 11 months ago
Stefan Holst fccf5e0d84 fix log limit 11 months ago
  1. 2
      src/kyupy/__init__.py
  2. 10
      src/kyupy/sim.py
  3. 123
      src/kyupy/wave_sim.py
  4. 10
      tests/test_wave_sim.py

2
src/kyupy/__init__.py

@ -161,6 +161,8 @@ class Log:
def __setstate__(self, state): def __setstate__(self, state):
self.logfile = sys.stdout self.logfile = sys.stdout
self.indent = 0 self.indent = 0
self._limit = -1
self.filtered = 0
self.start = time.perf_counter() - state['elapsed'] self.start = time.perf_counter() - state['elapsed']
def write(self, s, indent=0): def write(self, s, indent=0):

10
src/kyupy/sim.py

@ -187,7 +187,6 @@ class SimOps:
levels = [] levels = []
ppio2idx = dict((n, i) for i, n in enumerate(circuit.s_nodes)) ppio2idx = dict((n, i) for i, n in enumerate(circuit.s_nodes))
pis = set([n for n in circuit.s_nodes if len(n.ins) == 0])
ppos = set([n for n in circuit.s_nodes if len(n.ins) > 0]) ppos = set([n for n in circuit.s_nodes if len(n.ins) > 0])
readers = np.array([1 if l.reader in ppos else len(l.reader.outs) for l in circuit.lines], dtype=np.int32) # for ref-counting forks readers = np.array([1 if l.reader in ppos else len(l.reader.outs) for l in circuit.lines], dtype=np.int32) # for ref-counting forks
@ -311,6 +310,15 @@ class SimOps:
if len(n.ins) > 0: if len(n.ins) > 0:
self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]] self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]]
# line use information
self.line_use_start = np.full(self.c_locs_len, -1, dtype=np.int32)
self.line_use_stop = np.full(self.c_locs_len, len(self.levels), dtype=np.int32)
for i, lv in enumerate(self.levels):
for op in lv:
self.line_use_start[op[1]] = i
for x in [2, 3, 4, 5]:
self.line_use_stop[op[x]] = i
self.c_len = h.max_size self.c_len = h.max_size
d = defaultdict(int) d = defaultdict(int)

123
src/kyupy/wave_sim.py

@ -13,10 +13,11 @@ Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the deriv
""" """
import math import math
from collections import defaultdict
import numpy as np import numpy as np
from . import numba, cuda, sim, cdiv, eng from . import log, numba, cuda, sim, cdiv, eng
TMAX = np.float32(2 ** 127) TMAX = np.float32(2 ** 127)
@ -59,8 +60,8 @@ class WaveSim(sim.SimOps):
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
self.delays[:, :delays.shape[1]] = delays self.delays[:, :delays.shape[1]] = delays
self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX self.c = np.full((self.c_len, self.sims), TMAX, dtype=np.float32)
self.s = np.zeros((11, self.s_len, sims), dtype=np.float32) self.s = np.zeros((11, self.s_len, self.sims), dtype=np.float32)
"""Information about the logic values and transitions around the sequential elements (flip-flops) and ports. """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
The first 3 values are read by :py:func:`s_to_c`. The first 3 values are read by :py:func:`s_to_c`.
@ -98,7 +99,26 @@ class WaveSim(sim.SimOps):
self.simctl_int[0] = range(sims) # unique seed for each sim by default, zero this to pick same delays for all sims. self.simctl_int[0] = range(sims) # unique seed for each sim by default, zero this to pick same delays for all sims.
self.simctl_int[1] = 2 # random picking by default. self.simctl_int[1] = 2 # random picking by default.
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.simctl_int)]) # flat array for line use information
line_use = defaultdict(list)
for lidx in range(len(self.circuit.lines)):
if self.line_use_start[lidx] < 0: continue
if self.line_use_stop[lidx] < 0:
log.warn(f'line {lidx} never read?')
for i in range(self.line_use_start[lidx], self.line_use_stop[lidx]):
line_use[i].append(lidx)
self.line_use_counts = np.array([len(line_use[i]) for i in range(len(self.levels))], dtype=np.int32)
self.line_use_offsets = np.zeros_like(self.line_use_counts)
self.line_use_offsets[1:] = self.line_use_counts.cumsum()[:-1]
self.line_use = np.hstack([line_use[i] for i in range(len(self.levels))])
self.h = np.zeros((self.c_locs_len, sims), dtype=np.float32) # hashes of generated waveforms
self.h_base = np.zeros_like(self.h) # base hashes to compare to
self.error_counts = np.zeros(self.s_len, dtype=np.uint32) # number of capture errors by PPO
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.h, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
def __repr__(self): def __repr__(self):
dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU' dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU'
@ -124,7 +144,7 @@ class WaveSim(sim.SimOps):
""" """
sims = min(sims or self.sims, self.sims) sims = min(sims or self.sims, self.sims)
for op_start, op_stop in zip(self.level_starts, self.level_stops): for op_start, op_stop in zip(self.level_starts, self.level_stops):
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed) level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
def c_to_s(self, time=TMAX, sd=0.0, seed=1): def c_to_s(self, time=TMAX, sd=0.0, seed=1):
"""Simulates a capture operation at all sequential elements and primary outputs. """Simulates a capture operation at all sequential elements and primary outputs.
@ -152,7 +172,7 @@ class WaveSim(sim.SimOps):
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
overflows = int(0) overflows = int(0)
lut = op[0] lut = op[0]
@ -182,6 +202,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
z_mem = c_locs[z_idx] z_mem = c_locs[z_idx]
z_cap = c_caps[z_idx] z_cap = c_caps[z_idx]
h = np.float32(0)
a_cur = int(0) a_cur = int(0)
b_cur = int(0) b_cur = int(0)
c_cur = int(0) c_cur = int(0)
@ -229,6 +251,7 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1] next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1]
if (z_cur & 1) != ((lut >> inputs) & 1): if (z_cur & 1) != ((lut >> inputs) & 1):
h += h*3 + max(current_t, -10) # hash based on generated transitions before filtering
# we generate an edge in z_mem, if ... # we generate an edge in z_mem, if ...
if (z_cur == 0 # it is the first edge in z_mem ... if (z_cur == 0 # it is the first edge in z_mem ...
or next_t < current_t # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ... or next_t < current_t # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
@ -240,8 +263,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
z_cur += 1 z_cur += 1
else: else:
overflows += 1 overflows += 1
previous_t = cbuf[z_mem + z_cur - 1, sim]
z_cur -= 1 z_cur -= 1
previous_t = cbuf[z_mem + z_cur, sim]
else: else:
z_cur -= 1 z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, sim] if z_cur > 0 else TMIN previous_t = cbuf[z_mem + z_cur - 1, sim] if z_cur > 0 else TMIN
@ -258,6 +281,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
# generate or propagate overflow flag # generate or propagate overflow flag
cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
hbuf[z_idx, sim] = h
nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN)) nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN))
nfall = z_cur // 2 nfall = z_cur // 2
@ -268,11 +293,11 @@ wave_eval_cpu = numba.njit(_wave_eval)
@numba.njit @numba.njit
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
for op_idx in range(op_start, op_stop): for op_idx in range(op_start, op_stop):
op = ops[op_idx] op = ops[op_idx]
for sim in range(sim_start, sim_stop): for sim in range(sim_start, sim_stop):
nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed)
a_loc = op[6] a_loc = op[6]
a_wr = op[7] a_wr = op[7]
a_wf = op[8] a_wf = op[8]
@ -345,12 +370,18 @@ class WaveSimCuda(WaveSim):
self.delays = cuda.to_device(self.delays) self.delays = cuda.to_device(self.delays)
self.simctl_int = cuda.to_device(self.simctl_int) self.simctl_int = cuda.to_device(self.simctl_int)
self.abuf = cuda.to_device(self.abuf) self.abuf = cuda.to_device(self.abuf)
self.h = cuda.to_device(self.h)
self.h_base = cuda.to_device(self.h_base)
self.line_use = cuda.to_device(self.line_use)
self.error_counts = cuda.to_device(self.error_counts)
self.retval_int = cuda.to_device(np.array([0], dtype=np.int32))
self._block_dim = (32, 16) self._block_dim = (32, 16)
def __getstate__(self): def __getstate__(self):
state = self.__dict__.copy() state = self.__dict__.copy()
state['c'] = np.array(self.c) del state['c']
state['s'] = np.array(self.s) state['s'] = np.array(self.s)
state['ops'] = np.array(self.ops) state['ops'] = np.array(self.ops)
state['c_locs'] = np.array(self.c_locs) state['c_locs'] = np.array(self.c_locs)
@ -358,11 +389,16 @@ class WaveSimCuda(WaveSim):
state['delays'] = np.array(self.delays) state['delays'] = np.array(self.delays)
state['simctl_int'] = np.array(self.simctl_int) state['simctl_int'] = np.array(self.simctl_int)
state['abuf'] = np.array(self.abuf) state['abuf'] = np.array(self.abuf)
state['h'] = np.array(self.h)
state['h_base'] = np.array(self.h_base)
state['line_use'] = np.array(self.line_use)
state['error_counts'] = np.array(self.error_counts)
state['retval_int'] = np.array(self.retval_int)
return state return state
def __setstate__(self, state): def __setstate__(self, state):
self.__dict__.update(state) self.__dict__.update(state)
self.c = cuda.to_device(self.c) self.c = cuda.to_device(np.full((self.c_len, self.sims), TMAX, dtype=np.float32))
self.s = cuda.to_device(self.s) self.s = cuda.to_device(self.s)
self.ops = cuda.to_device(self.ops) self.ops = cuda.to_device(self.ops)
self.c_locs = cuda.to_device(self.c_locs) self.c_locs = cuda.to_device(self.c_locs)
@ -370,6 +406,11 @@ class WaveSimCuda(WaveSim):
self.delays = cuda.to_device(self.delays) self.delays = cuda.to_device(self.delays)
self.simctl_int = cuda.to_device(self.simctl_int) self.simctl_int = cuda.to_device(self.simctl_int)
self.abuf = cuda.to_device(self.abuf) self.abuf = cuda.to_device(self.abuf)
self.h = cuda.to_device(self.h)
self.h_base = cuda.to_device(self.h_base)
self.line_use = cuda.to_device(self.line_use)
self.error_counts = cuda.to_device(self.error_counts)
self.retval_int = cuda.to_device(self.retval_int)
def s_to_c(self): def s_to_c(self):
grid_dim = self._grid_dim(self.sims, self.s_len) grid_dim = self._grid_dim(self.sims, self.s_len)
@ -383,10 +424,18 @@ class WaveSimCuda(WaveSim):
if op_from > op_start: continue if op_from > op_start: continue
if op_to is not None and op_to <= op_start: break if op_to is not None and op_to <= op_start: break
grid_dim = self._grid_dim(sims, op_stop - op_start) grid_dim = self._grid_dim(sims, op_stop - op_start)
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0), wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0),
sims, self.delays, self.simctl_int, seed) sims, self.delays, self.simctl_int, seed)
cuda.synchronize() cuda.synchronize()
def c_prop_level(self, level, sims=None, seed=1):
sims = min(sims or self.sims, self.sims)
op_start = self.level_starts[level]
op_stop = self.level_stops[level]
grid_dim = self._grid_dim(sims, op_stop - op_start)
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0),
sims, self.delays, self.simctl_int, seed)
def c_to_s(self, time=TMAX, sd=0.0, seed=1): def c_to_s(self, time=TMAX, sd=0.0, seed=1):
grid_dim = self._grid_dim(self.sims, self.s_len) grid_dim = self._grid_dim(self.sims, self.s_len)
wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset, wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset,
@ -396,6 +445,52 @@ class WaveSimCuda(WaveSim):
grid_dim = self._grid_dim(self.sims, self.s_len) grid_dim = self._grid_dim(self.sims, self.s_len)
ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset) ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)
def set_base_hashes(self):
nitems = self.h_base.shape[0] * self.h_base.shape[1]
grid_dim = cdiv(nitems, 256)
memcpy_gpu[grid_dim, 256](self.h, self.h_base, nitems)
def compare_hashes_level(self, lv):
self.retval_int[0] = 0
grid_dim = self._grid_dim(self.sims, self.line_use_counts[lv])
diff_hash_gpu[grid_dim, self._block_dim](self.h, self.h_base, self.line_use, self.line_use_offsets[lv],
self.line_use_counts[lv], self.retval_int)
return self.retval_int[0]
def calc_error_counts(self, sims=None):
sims = min(sims or self.sims, self.sims)
grid_dim = cdiv(self.s_len, 256)
calc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)
return np.array(self.error_counts)
@cuda.jit()
def memcpy_gpu (src, dst, nitems):
tid = cuda.grid(1)
stride = cuda.gridDim.x * cuda.blockDim.x
for i in range(tid, nitems, stride):
dst.flat[i] = src.flat[i]
@cuda.jit()
def diff_hash_gpu(hbuf1, hbuf2, h_locs, h_locs_offset, h_locs_cnt, differs):
x, y = cuda.grid(2)
if x >= hbuf1.shape[1]: return
if y >= h_locs_cnt: return
h_loc = h_locs[h_locs_offset+y]
if hbuf1[h_loc, x] != hbuf2[h_loc, x]:
differs[0] = 1
@cuda.jit()
def calc_error_counts_gpu(s, sims, error_counts):
x = cuda.grid(1)
if x >= s.shape[1]: return
cnt = 0
for i in range(sims):
cnt += (s[6,x,i] != s[8,x,i])
error_counts[x] = cnt
@cuda.jit() @cuda.jit()
def wave_assign_gpu(c, s, c_locs, ppi_offset): def wave_assign_gpu(c, s, c_locs, ppi_offset):
@ -425,7 +520,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
@cuda.jit() @cuda.jit()
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
x, y = cuda.grid(2) x, y = cuda.grid(2)
sim = sim_start + x sim = sim_start + x
op_idx = op_start + y op_idx = op_start + y
@ -437,7 +532,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start,
a_wr = op[7] a_wr = op[7]
a_wf = op[8] a_wf = op[8]
nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed)
# accumulate WSA into abuf # accumulate WSA into abuf
if a_loc >= 0: if a_loc >= 0:

10
tests/test_wave_sim.py

@ -8,9 +8,10 @@ from kyupy.logic import mvarray
def test_xnor2_delays(): def test_xnor2_delays():
op = (sim.XNOR2, 2, 0, 1, 3, 3, -1, 0, 0) op = (sim.XNOR2, 2, 0, 1, 3, 3, -1, 0, 0)
#op = (0b0111, 4, 0, 1) #op = (0b0111, 4, 0, 1)
c = np.full((4*16, 1), TMAX) # 4 waveforms of capacity 16 c = np.full((4*16, 1), TMAX, dtype=np.float32) # 4 waveforms of capacity 16
c_locs = np.zeros((4,), dtype='int') c_locs = np.zeros((4,), dtype='int')
c_caps = np.zeros((4,), dtype='int') c_caps = np.zeros((4,), dtype='int')
h = np.zeros((4, 1), dtype=np.float32)
for i in range(4): c_locs[i], c_caps[i] = i*16, 16 # 1:1 mapping for i in range(4): c_locs[i], c_caps[i] = i*16, 16 # 1:1 mapping
delays = np.zeros((1, 4, 2, 2)) delays = np.zeros((1, 4, 2, 2))
@ -27,7 +28,7 @@ def test_xnor2_delays():
def wave_assert(inputs, output): def wave_assert(inputs, output):
for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, simctl_int) wave_eval_cpu(op, c, c_locs, c_caps, h, 0, delays, simctl_int, 0)
for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[2,i], v) for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[2,i], v)
wave_assert([[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # XNOR(1,1) => 1 wave_assert([[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # XNOR(1,1) => 1
@ -40,9 +41,10 @@ def test_xnor2_delays():
def test_nand_delays(): def test_nand_delays():
op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0) op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0)
#op = (0b0111, 4, 0, 1) #op = (0b0111, 4, 0, 1)
c = np.full((5*16, 1), TMAX) # 5 waveforms of capacity 16 c = np.full((5*16, 1), TMAX, dtype=np.float32) # 5 waveforms of capacity 16
c_locs = np.zeros((5,), dtype='int') c_locs = np.zeros((5,), dtype='int')
c_caps = np.zeros((5,), dtype='int') c_caps = np.zeros((5,), dtype='int')
h = np.zeros((5, 1), dtype=np.float32)
for i in range(5): c_locs[i], c_caps[i] = i*16, 16 # 1:1 mapping for i in range(5): c_locs[i], c_caps[i] = i*16, 16 # 1:1 mapping
@ -64,7 +66,7 @@ def test_nand_delays():
def wave_assert(inputs, output): def wave_assert(inputs, output):
for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, simctl_int) wave_eval_cpu(op, c, c_locs, c_caps, h, 0, delays, simctl_int, 0)
for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v) for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1 wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1

Loading…
Cancel
Save