Browse Source

cleanup and new intro notebook

devel
stefan 2 years ago
parent
commit
6520ee23ef
  1. 1354
      Demo.ipynb
  2. 12
      README.rst
  3. 4
      docs/conf.py
  4. 1873
      examples/Introduction.ipynb
  5. 2
      src/kyupy/bench.py
  6. 23
      src/kyupy/circuit.py
  7. 6
      src/kyupy/logic.py
  8. 135
      src/kyupy/logic_sim.py
  9. 58
      src/kyupy/wave_sim.py
  10. 961
      src/kyupy/wave_sim_old.py

1354
Demo.ipynb

File diff suppressed because it is too large Load Diff

12
README.rst

@ -16,13 +16,17 @@ Getting Started @@ -16,13 +16,17 @@ Getting Started
---------------
KyuPy is available in `PyPI <https://pypi.org/project/kyupy>`_.
It requires Python 3.6 or newer, `lark-parser <https://pypi.org/project/lark-parser>`_, and `numpy`_.
It requires Python 3.8 or newer, `lark-parser <https://pypi.org/project/lark-parser>`_, and `numpy`_.
Although optional, `numba`_ should be installed for best performance.
GPU/CUDA support in numba may `require some additional setup <https://numba.pydata.org/numba-doc/latest/cuda/index.html>`_.
GPU/CUDA support in numba may `require some additional setup <https://numba.readthedocs.io/en/stable/cuda/index.html>`_.
If numba is not available, KyuPy will automatically fall back to slow, pure Python execution.
The Jupyter Notebook `Demo.ipynb <https://github.com/s-holst/kyupy/blob/main/Demo.ipynb>`_ contains some useful examples to get familiar with the API.
The Jupyter Notebook `Introduction.ipynb <https://github.com/s-holst/kyupy/blob/main/examples/Introduction.ipynb>`_ contains some useful examples to get familiar with the API.
Development
-----------
To work with the latest pre-release source code, clone the `KyuPy GitHub repository <https://github.com/s-holst/kyupy>`_.
Run ``pip3 install --user -e .`` within your local checkout to make the package available in your Python environment.
Run ``pip install -e .`` within your local checkout to make the package available in your Python environment.
The source code comes with tests that can be run with ``pytest``.

4
docs/conf.py

@ -20,11 +20,11 @@ sys.path.insert(0, os.path.abspath('../src')) @@ -20,11 +20,11 @@ sys.path.insert(0, os.path.abspath('../src'))
# -- Project information -----------------------------------------------------
project = 'KyuPy'
copyright = '2020-2021, Stefan Holst'
copyright = '2020-2023, Stefan Holst'
author = 'Stefan Holst'
# The full version, including alpha/beta/rc tags
release = '0.0.3'
release = '0.0.4'
# -- General configuration ---------------------------------------------------

1873
examples/Introduction.ipynb

File diff suppressed because it is too large Load Diff

2
src/kyupy/bench.py

@ -21,7 +21,7 @@ class BenchTransformer(Transformer): @@ -21,7 +21,7 @@ class BenchTransformer(Transformer):
def start(self, _): return self.c
def parameters(self, args): return [self.c.get_or_add_fork(name) for name in args]
def parameters(self, args): return [self.c.get_or_add_fork(str(name)) for name in args]
def interface(self, args): self.c.io_nodes.extend(args[0])

23
src/kyupy/circuit.py

@ -286,16 +286,8 @@ class Circuit: @@ -286,16 +286,8 @@ class Circuit:
def __eq__(self, other):
return self.nodes == other.nodes and self.lines == other.lines and self.io_nodes == other.io_nodes
def dump(self):
"""Returns a string representation of the circuit and all its nodes.
"""
header = f'{self.name}({",".join([str(n.index) for n in self.io_nodes])})\n'
return header + '\n'.join([str(n) for n in self.nodes])
def __repr__(self):
name = f' {self.name}' if self.name else ''
return f'<Circuit{name} cells={len(self.cells)} forks={len(self.forks)} ' + \
f'lines={len(self.lines)} ports={len(self.io_nodes)}>'
return f'{{name: "{self.name}", cells: {len(self.cells)}, forks: {len(self.forks)}, lines: {len(self.lines)}, io_nodes: {len(self.io_nodes)}}}'
@property
def stats(self):
@ -303,18 +295,15 @@ class Circuit: @@ -303,18 +295,15 @@ class Circuit:
stats['__node__'] = len(self.nodes)
stats['__cell__'] = len(self.cells)
stats['__fork__'] = len(self.forks)
stats['__port__'] = len(self.io_nodes)
stats['__io__'] = len(self.io_nodes)
stats['__line__'] = len(self.lines)
for n in self.cells.values():
stats[n.kind] += 1
if 'dff' in n.kind.lower():
stats['__dff__'] += 1
elif 'latch' in n.kind.lower():
stats['__latch__'] += 1
elif 'put' not in n.kind.lower(): # no input or output
stats['__comb__'] += 1
if 'dff' in n.kind.lower(): stats['__dff__'] += 1
elif 'latch' in n.kind.lower(): stats['__latch__'] += 1
elif 'put' not in n.kind.lower(): stats['__comb__'] += 1 # no input or output
stats['__seq__'] = stats['__dff__'] + stats['__latch__']
return stats
return dict(stats)
def topological_order(self):
"""Generator function to iterate over all nodes in topological order.

6
src/kyupy/logic.py

@ -121,6 +121,12 @@ def mvarray(*a): @@ -121,6 +121,12 @@ def mvarray(*a):
return mva[..., 0, :]
def mv_str(mva, delim='\n'):
sa = np.choose(mva, np.array([*'0X-1PRFN'], dtype=np.unicode_))
if mva.ndim == 1: return ''.join(sa)
return delim.join([''.join(c) for c in sa.swapaxes(-1,-2)])
def mv_to_bp(mva):
if mva.ndim == 1: mva = mva[..., np.newaxis]
return np.packbits(unpackbits(mva)[...,:3], axis=-2, bitorder='little').swapaxes(-1,-2)

135
src/kyupy/logic_sim.py

@ -51,40 +51,6 @@ class LogicSim(SimOps): @@ -51,40 +51,6 @@ class LogicSim(SimOps):
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
#dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()]
#latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()]
#self.interface = list(circuit.io_nodes) + dffs + latches
#self.width = len(self.interface)
#"""The number of bits in the circuit state (number of ports + number of state-elements)."""
#self.state = np.zeros((len(circuit.lines), mdim, nbytes), dtype='uint8')
#self.state_epoch = np.zeros(len(circuit.nodes), dtype='int8') - 1
#self.tmp = np.zeros((5, mdim, nbytes), dtype='uint8')
#self.zero = np.zeros((mdim, nbytes), dtype='uint8')
#self.epoch = 0
#self.latch_dict = dict((n.index, i) for i, n in enumerate(latches))
#self.latch_state = np.zeros((len(latches), mdim, nbytes), dtype='uint8')
# known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith('_fct')]
# self.node_fct = []
# for n in circuit.nodes:
# t = n.kind.lower().replace('__fork__', 'fork')
# t = t.replace('nbuff', 'fork')
# t = t.replace('input', 'fork')
# t = t.replace('output', 'fork')
# t = t.replace('__const0__', 'const0')
# t = t.replace('__const1__', 'const1')
# t = t.replace('tieh', 'const1')
# t = t.replace('ibuff', 'not')
# t = t.replace('inv', 'not')
# fcts = [f for n, f in known_fct if t.startswith(n)]
# if len(fcts) < 1:
# raise ValueError(f'Unknown node kind {n.kind}')
# self.node_fct.append(fcts[0])
def __repr__(self):
return f'<LogicSim {self.circuit.name} sims={self.sims} m={self.m} state_mem={hr_bytes(self.c.nbytes)}>'
@ -96,22 +62,6 @@ class LogicSim(SimOps): @@ -96,22 +62,6 @@ class LogicSim(SimOps):
:returns: The given stimuli object.
"""
self.c[self.pippi_c_locs] = self.s[0, self.pippi_s_locs, :self.mdim]
# for node, stim in zip(self.interface, stimuli.data if hasattr(stimuli, 'data') else stimuli):
# if len(node.outs) == 0: continue
# if node.index in self.latch_dict:
# self.latch_state[self.latch_dict[node.index]] = stim
# else:
# outputs = [self.state[line] if line else self.tmp[3] for line in node.outs]
# self.node_fct[node]([stim], outputs)
# for line in node.outs:
# if line is not None: self.state_epoch[line.reader] = self.epoch
# for n in self.circuit.nodes:
# if n.kind in ('__const1__', '__const0__'):
# outputs = [self.state[line] if line else self.tmp[3] for line in n.outs]
# self.node_fct[n]([], outputs)
# for line in n.outs:
# if line is not None: self.state_epoch[line.reader] = self.epoch
# return stimuli
def c_to_s(self): #, responses, ff_transitions=False):
"""Capture the current values at the primary outputs and in the state-elements (flip-flops).
@ -129,25 +79,6 @@ class LogicSim(SimOps): @@ -129,25 +79,6 @@ class LogicSim(SimOps):
if self.mdim == 1:
self.s[1, self.poppo_s_locs, 1:2] = self.c[self.poppo_c_locs]
# for node, resp in zip(self.interface, responses.data if hasattr(responses, 'data') else responses):
# if len(node.ins) == 0: continue
# if node.index in self.latch_dict:
# resp[...] = self.state[node.outs[0]]
# else:
# resp[...] = self.state[node.ins[0]]
# if not ff_transitions: continue
# # outs of DFFs contain the previously assigned value (previous state)
# if self.m > 2 and 'dff' in node.kind.lower() and len(node.outs) > 0:
# if node.outs[0] is None:
# resp[1, :] = ~self.state[node.outs[1], 0, :] # assume QN is connected, take inverse of that.
# else:
# resp[1, :] = self.state[node.outs[0], 0, :]
# if self.m > 4:
# resp[..., 2, :] = resp[..., 0, :] ^ resp[..., 1, :]
# # FIXME: We don't handle X or - correctly.
# return responses
def c_prop(self, sims=None, inject_cb=None):
"""Propagate the input values towards the outputs (Perform all logic operations in topological order).
@ -201,19 +132,9 @@ class LogicSim(SimOps): @@ -201,19 +132,9 @@ class LogicSim(SimOps):
elif op == SimPrim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
else: print(f'unknown SimPrim {op}')
if inject_cb is not None: inject_cb(o0, self.s[o0])
# for node in self.circuit.topological_order():
# if self.state_epoch[node] != self.epoch: continue
# inputs = [self.state[line] if line else self.zero for line in node.ins]
# outputs = [self.state[line] if line else self.tmp[3] for line in node.outs]
# if node.index in self.latch_dict:
# inputs.append(self.latch_state[self.latch_dict[node.index]])
# self.node_fct[node](inputs, outputs)
# for line in node.outs:
# if inject_cb is not None: inject_cb(line, self.state[line])
# self.state_epoch[line.reader] = self.epoch
# self.epoch = (self.epoch + 1) % 128
def s_ppo_to_ppi(self):
# TODO: handle latches correctly
if self.m == 2:
self.s[0, self.ppio_s_locs, 0] = self.s[1, self.ppio_s_locs, 0]
else:
@ -236,60 +157,6 @@ class LogicSim(SimOps): @@ -236,60 +157,6 @@ class LogicSim(SimOps):
self.c_to_s()
self.s_ppo_to_ppi()
# def fork_fct(self, inputs, outputs):
# for o in outputs: o[...] = inputs[0]
# def const0_fct(self, _, outputs):
# for o in outputs: o[...] = 0
# def const1_fct(self, _, outputs):
# for o in outputs:
# o[...] = 0
# logic.bp_not(o, o)
# def not_fct(self, inputs, outputs):
# logic.bp_not(outputs[0], inputs[0])
# def and_fct(self, inputs, outputs):
# logic.bp_and(outputs[0], *inputs)
# def or_fct(self, inputs, outputs):
# logic.bp_or(outputs[0], *inputs)
# def xor_fct(self, inputs, outputs):
# logic.bp_xor(outputs[0], *inputs)
# def sdff_fct(self, inputs, outputs):
# logic.bp_buf(outputs[0], inputs[0])
# if len(outputs) > 1:
# logic.bp_not(outputs[1], inputs[0])
# def dff_fct(self, inputs, outputs):
# logic.bp_buf(outputs[0], inputs[0])
# if len(outputs) > 1:
# logic.bp_not(outputs[1], inputs[0])
# def latch_fct(self, inputs, outputs):
# logic.bp_latch(outputs[0], inputs[0], inputs[1], inputs[2])
# if len(outputs) > 1:
# logic.bp_not(outputs[1], inputs[0])
# def nand_fct(self, inputs, outputs):
# logic.bp_and(outputs[0], *inputs)
# logic.bp_not(outputs[0], outputs[0])
# def nor_fct(self, inputs, outputs):
# logic.bp_or(outputs[0], *inputs)
# logic.bp_not(outputs[0], outputs[0])
# def xnor_fct(self, inputs, outputs):
# logic.bp_xor(outputs[0], *inputs)
# logic.bp_not(outputs[0], outputs[0])
# def aoi21_fct(self, inputs, outputs):
# logic.bp_and(self.tmp[0], inputs[0], inputs[1])
# logic.bp_or(outputs[0], self.tmp[0], inputs[2])
# logic.bp_not(outputs[0], outputs[0])
@numba.njit
def _prop_cpu(ops, vat, c):

58
src/kyupy/wave_sim.py

@ -52,7 +52,7 @@ class WaveSim(SimOps): @@ -52,7 +52,7 @@ class WaveSim(SimOps):
assert c_caps > 0 and c_caps % 4 == 0
super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
self.sims = sims
self.c_len *= 4
self.vat[...,0:2] *= 4
@ -81,7 +81,7 @@ class WaveSim(SimOps): @@ -81,7 +81,7 @@ class WaveSim(SimOps):
transitions than specified in ``c_caps``. Some transitions have been discarded, the
final values in the waveforms are still valid.
"""
self.params = np.zeros((sims, 4), dtype=np.float32)
self.params[...,0] = 1.0
@ -203,7 +203,7 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): @@ -203,7 +203,7 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
if int(param[1]) == c_idx: c += param[2+z_cur]
d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
if int(param[1]) == d_idx: d += param[2+z_cur]
previous_t = TMIN
current_t = min(a, b, c, d)
@ -220,8 +220,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): @@ -220,8 +220,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
a += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 1
next_t = a
next_t = a
elif b == current_t:
b_cur += 1
b = cbuf[b_mem + b_cur, st_idx]
@ -232,7 +232,7 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): @@ -232,7 +232,7 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
thresh += param[2+z_val]
inputs ^= 2
next_t = b
elif c == current_t:
c_cur += 1
c = cbuf[c_mem + c_cur, st_idx]
@ -242,8 +242,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): @@ -242,8 +242,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
c += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 4
next_t = c
next_t = c
else:
d_cur += 1
d = cbuf[d_mem + d_cur, st_idx]
@ -253,8 +253,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): @@ -253,8 +253,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
d += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 8
next_t = d
next_t = d
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
@ -272,12 +272,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): @@ -272,12 +272,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
else:
z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
current_t = min(a, b, c, d)
# generate overflow flag or propagate from input
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit
def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
@ -350,7 +350,7 @@ class WaveSimCuda(WaveSim): @@ -350,7 +350,7 @@ class WaveSimCuda(WaveSim):
self.vat = cuda.to_device(self.vat)
self.timing = cuda.to_device(self.timing)
self.params = cuda.to_device(self.params)
self._block_dim = (32, 16)
def s_to_c(self):
@ -361,7 +361,7 @@ class WaveSimCuda(WaveSim): @@ -361,7 +361,7 @@ class WaveSimCuda(WaveSim):
gx = math.ceil(x / self._block_dim[0])
gy = math.ceil(y / self._block_dim[1])
return gx, gy
def c_prop(self, sims=None, sd=0.0, seed=1):
sims = min(sims or self.sims, self.sims)
for op_start, op_stop in zip(self.level_starts, self.level_stops):
@ -369,12 +369,12 @@ class WaveSimCuda(WaveSim): @@ -369,12 +369,12 @@ class WaveSimCuda(WaveSim):
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0),
sims, self.timing, self.params, sd, seed)
cuda.synchronize()
def c_to_s(self, time=TMAX, sd=0.0, seed=1):
grid_dim = self._grid_dim(self.sims, self.s_len)
wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset,
time, sd * math.sqrt(2), seed)
def s_ppo_to_ppi(self, time=0.0):
grid_dim = self._grid_dim(self.sims, self.s_len)
ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset)
@ -436,7 +436,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -436,7 +436,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
d_idx = ops[op_idx, 5]
param = param[st_idx]
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
overflows = int(0)
@ -451,7 +451,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -451,7 +451,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
a_cur = int(0)
b_cur = int(0)
c_cur = int(0)
d_cur = int(0)
d_cur = int(0)
z_cur = lut & 1
if z_cur == 1:
cbuf[z_mem, st_idx] = TMIN
@ -464,7 +464,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -464,7 +464,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
if int(param[1]) == c_idx: c += param[2+z_cur]
d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
if int(param[1]) == d_idx: d += param[2+z_cur]
previous_t = TMIN
current_t = min(a, b, c, d)
@ -481,8 +481,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -481,8 +481,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
a += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 1
next_t = a
next_t = a
elif b == current_t:
b_cur += 1
b = cbuf[b_mem + b_cur, st_idx]
@ -493,7 +493,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -493,7 +493,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
thresh += param[2+z_val]
inputs ^= 2
next_t = b
elif c == current_t:
c_cur += 1
c = cbuf[c_mem + c_cur, st_idx]
@ -503,8 +503,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -503,8 +503,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
c += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 4
next_t = c
next_t = c
else:
d_cur += 1
d = cbuf[d_mem + d_cur, st_idx]
@ -514,8 +514,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -514,8 +514,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
d += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 8
next_t = d
next_t = d
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
@ -533,7 +533,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim @@ -533,7 +533,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
else:
z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
current_t = min(a, b, c, d)
# generate overflow flag or propagate from input
@ -603,10 +603,10 @@ def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset): @@ -603,10 +603,10 @@ def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset):
x, y = cuda.grid(2)
if y >= s.shape[0]: return
if x >= s.shape[1]: return
if vat[ppi_offset + y, 0] < 0: return
if vat[ppo_offset + y, 0] < 0: return
s[y, x, 0] = s[y, x, 2]
s[y, x, 1] = time
s[y, x, 2] = s[y, x, 8]

961
src/kyupy/wave_sim_old.py

@ -1,961 +0,0 @@ @@ -1,961 +0,0 @@
"""High-throughput combinational logic timing simulators.
These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`.
They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
Instead of propagating logic values, these simulators propagate signal histories (waveforms).
They are designed to run many simulations in parallel and while their latencies are quite high, they can achieve
high throughput.
The simulators are not event-based and are not capable of simulating sequential circuits directly.
Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the derived class
:py:class:`WaveSimCuda` runs on the GPU.
"""
import math
from bisect import bisect, insort_left
import numpy as np
from . import numba, cuda, hr_bytes
TMAX = np.float32(2 ** 127)
"""A large 32-bit floating point value used to mark the end of a waveform."""
TMAX_OVL = np.float32(1.1 * 2 ** 127)
"""A large 32-bit floating point value used to mark the end of a waveform that
may be incomplete due to an overflow."""
TMIN = np.float32(-2 ** 127)
"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
class Heap:
def __init__(self):
self.chunks = dict() # map start location to chunk size
self.released = list() # chunks that were released
self.current_size = 0
self.max_size = 0
def alloc(self, size):
for idx, loc in enumerate(self.released):
if self.chunks[loc] == size:
del self.released[idx]
return loc
if self.chunks[loc] > size: # split chunk
chunksize = self.chunks[loc]
self.chunks[loc] = size
self.chunks[loc + size] = chunksize - size
self.released[idx] = loc + size # move released pointer: loc -> loc+size
return loc
# no previously released chunk; make new one
loc = self.current_size
self.chunks[loc] = size
self.current_size += size
self.max_size = max(self.max_size, self.current_size)
return loc
def free(self, loc):
size = self.chunks[loc]
if loc + size == self.current_size: # end of managed area, remove chunk
del self.chunks[loc]
self.current_size -= size
# check and remove prev chunk if free
if len(self.released) > 0:
prev = self.released[-1]
if prev + self.chunks[prev] == self.current_size:
chunksize = self.chunks[prev]
del self.chunks[prev]
del self.released[-1]
self.current_size -= chunksize
return
released_idx = bisect(self.released, loc)
if released_idx < len(self.released) and loc + size == self.released[released_idx]: # next chunk is free, merge
chunksize = size + self.chunks[loc + size]
del self.chunks[loc + size]
self.chunks[loc] = chunksize
size = self.chunks[loc]
self.released[released_idx] = loc
else:
insort_left(self.released, loc) # put in a new release
if released_idx > 0: # check if previous chunk is free
prev = self.released[released_idx - 1]
if prev + self.chunks[prev] == loc: # previous chunk is adjacent to freed one, merge
chunksize = size + self.chunks[prev]
del self.chunks[loc]
self.chunks[prev] = chunksize
del self.released[released_idx]
def __repr__(self):
r = []
for loc in sorted(self.chunks.keys()):
size = self.chunks[loc]
released_idx = bisect(self.released, loc)
is_released = released_idx > 0 and len(self.released) > 0 and self.released[released_idx - 1] == loc
r.append(f'{loc:5d}: {"free" if is_released else "used"} {size}')
return "\n".join(r)
class WaveSim:
"""A waveform-based combinational logic timing simulator running on CPU.
:param circuit: The circuit to simulate.
:param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details)
:param sims: The number of parallel simulations.
:param wavecaps: The number of floats available in each waveform. Waveforms are encoding the signal switching
history by storing transition times. The waveform capacity roughly corresponds to the number of transitions
that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are
generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow
flag is set. If an integer is given, all waveforms are set to that same capacity. With an array of length
``len(circuit.lines)`` the capacity can be controlled for each intermediate waveform individually.
:param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time
by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes
are ignored.
:param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
"""
def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
self.circuit = circuit
self.sims = sims
self.overflows = 0
self.interface = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
self.lst_eat_valid = False
self.cdata = np.zeros((len(self.interface), sims, 7), dtype='float32')
self.sdata = np.zeros((sims, 4), dtype='float32')
self.sdata[...,0] = 1.0
if isinstance(wavecaps, int):
wavecaps = [wavecaps] * len(circuit.lines)
intf_wavecap = 4 # sufficient for storing only 1 transition.
# indices for state allocation table (sat)
self.zero_idx = len(circuit.lines)
self.tmp_idx = self.zero_idx + 1
self.ppi_offset = self.tmp_idx + 1
self.ppo_offset = self.ppi_offset + len(self.interface)
self.sat_length = self.ppo_offset + len(self.interface)
# translate circuit structure into self.ops
ops = []
interface_dict = dict((n, i) for i, n in enumerate(self.interface))
for n in circuit.topological_order():
if n in interface_dict:
inp_idx = self.ppi_offset + interface_dict[n]
if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI
ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx))
if 'dff' in n.kind.lower(): # second output of DFF is inverted
if len(n.outs) > 1 and n.outs[1] is not None:
ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx))
else: # if not DFF, no output is inverted.
for o_line in n.outs[1:]:
if o_line is not None:
ops.append((0b1010, o_line.index, inp_idx, self.zero_idx))
else: # regular node, not PI/PPI or PO/PPO
o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx
i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx
kind = n.kind.lower()
if kind == '__fork__':
if not strip_forks:
for o_line in n.outs:
if o_line is not None:
ops.append((0b1010, o_line.index, i0_idx, i1_idx))
elif kind.startswith('nand'):
ops.append((0b0111, o0_idx, i0_idx, i1_idx))
elif kind.startswith('nor'):
ops.append((0b0001, o0_idx, i0_idx, i1_idx))
elif kind.startswith('and'):
ops.append((0b1000, o0_idx, i0_idx, i1_idx))
elif kind.startswith('or'):
ops.append((0b1110, o0_idx, i0_idx, i1_idx))
elif kind.startswith('xor'):
ops.append((0b0110, o0_idx, i0_idx, i1_idx))
elif kind.startswith('xnor'):
ops.append((0b1001, o0_idx, i0_idx, i1_idx))
elif kind.startswith('not') or kind.startswith('inv') or kind.startswith('ibuf'):
ops.append((0b0101, o0_idx, i0_idx, i1_idx))
elif kind.startswith('buf') or kind.startswith('nbuf'):
ops.append((0b1010, o0_idx, i0_idx, i1_idx))
elif kind.startswith('__const1__') or kind.startswith('tieh'):
ops.append((0b0101, o0_idx, i0_idx, i1_idx))
elif kind.startswith('__const0__') or kind.startswith('tiel'):
ops.append((0b1010, o0_idx, i0_idx, i1_idx))
else:
print('unknown gate type', kind)
self.ops = np.asarray(ops, dtype='int32')
# create a map from fanout lines to stem lines for fork stripping
stems = np.zeros(self.sat_length, dtype='int32') - 1 # default to -1: 'no fanout line'
if strip_forks:
for f in circuit.forks.values():
prev_line = f.ins[0]
while prev_line.driver.kind == '__fork__':
prev_line = prev_line.driver.ins[0]
stem_idx = prev_line.index
for ol in f.outs:
stems[ol] = stem_idx
# calculate level (distance from PI/PPI) and reference count for each line
levels = np.zeros(self.sat_length, dtype='int32')
ref_count = np.zeros(self.sat_length, dtype='int32')
level_starts = [0]
current_level = 1
for i, op in enumerate(self.ops):
# if we fork-strip, always take the stems for determining fan-in level
i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level:
current_level += 1
level_starts.append(i)
levels[op[1]] = current_level # set level of the output line
ref_count[i0_idx] += 1
ref_count[i1_idx] += 1
self.level_starts = np.asarray(level_starts, dtype='int32')
self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
# state allocation table. maps line and interface indices to self.state memory locations
self.sat = np.zeros((self.sat_length, 3), dtype='int')
self.sat[:, 0] = -1
h = Heap()
# allocate and keep memory for special fields
self.sat[self.zero_idx] = h.alloc(intf_wavecap), intf_wavecap, 0
self.sat[self.tmp_idx] = h.alloc(intf_wavecap), intf_wavecap, 0
ref_count[self.zero_idx] += 1
ref_count[self.tmp_idx] += 1
# allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
for i, n in enumerate(self.interface):
if len(n.outs) > 0:
self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0
ref_count[self.ppi_offset + i] += 1
if len(n.ins) > 0:
i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
ref_count[i0_idx] += 1
# allocate memory for the rest of the circuit
for op_start, op_stop in zip(self.level_starts, self.level_stops):
free_list = []
for op in self.ops[op_start:op_stop]:
# if we fork-strip, always take the stems
i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
ref_count[i0_idx] -= 1
ref_count[i1_idx] -= 1
if ref_count[i0_idx] <= 0: free_list.append(self.sat[i0_idx, 0])
if ref_count[i1_idx] <= 0: free_list.append(self.sat[i1_idx, 0])
o_idx = op[1]
cap = wavecaps[o_idx]
self.sat[o_idx] = h.alloc(cap), cap, 0
if not keep_waveforms:
for loc in free_list:
h.free(loc)
# copy memory location and capacity from stems to fanout lines
for lidx, stem in enumerate(stems):
if stem >= 0: # if at a fanout line
self.sat[lidx] = self.sat[stem]
# copy memory location to PO/PPO area
for i, n in enumerate(self.interface):
if len(n.ins) > 0:
self.sat[self.ppo_offset + i] = self.sat[n.ins[0]]
# pad timing
self.timing = np.zeros((self.sat_length, 2, 2))
self.timing[:len(timing)] = timing
# allocate self.state
self.state = np.zeros((h.max_size, sims), dtype='float32') + TMAX
m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
m0 = ~m1
self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
def __repr__(self):
total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.cdata.nbytes
return f'<WaveSim {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
def get_line_delay(self, line, polarity):
"""Returns the current delay of the given ``line`` and ``polarity`` in the simulation model."""
return self.timing[line, 0, polarity]
def set_line_delay(self, line, polarity, delay):
"""Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model."""
self.timing[line, 0, polarity] = delay
def assign(self, vectors, time=0.0, offset=0):
"""Assigns new values to the primary inputs and state-elements.
:param vectors: The values to assign preferably in 8-valued logic. The values are converted to
appropriate waveforms with or one transition (``RISE``, ``FALL``) no transitions
(``ZERO``, ``ONE``, and others).
:type vectors: :py:class:`~kyupy.logic.BPArray`
:param time: The transition time of the generated waveforms.
:param offset: The offset into the vector set. The vector assigned to the first simulator is
``vectors[offset]``.
"""
nvectors = min(len(vectors) - offset, self.sims)
for i in range(len(self.interface)):
ppi_loc = self.sat[self.ppi_offset + i, 0]
if ppi_loc < 0: continue
for p in range(nvectors):
vector = p + offset
a = vectors.data[i, :, vector // 8]
m = self.mask[vector % 8]
toggle = 0
if len(a) <= 2:
if a[0] & m[1]:
self.state[ppi_loc, p] = TMIN
toggle += 1
else:
if a[1] & m[1]:
self.state[ppi_loc, p] = TMIN
toggle += 1
if (a[2] & m[1]) and ((a[0] & m[1]) != (a[1] & m[1])):
self.state[ppi_loc + toggle, p] = time
toggle += 1
self.state[ppi_loc + toggle, p] = TMAX
def propagate(self, sims=None, sd=0.0, seed=1):
"""Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
:param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
:param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
:param seed: Random seed for delay variations.
"""
sims = min(sims or self.sims, self.sims)
for op_start, op_stop in zip(self.level_starts, self.level_stops):
self.overflows += level_eval(self.ops, op_start, op_stop, self.state, self.sat, 0, sims,
self.timing, self.sdata, sd, seed)
self.lst_eat_valid = False
def wave(self, line, vector):
# """Returns the desired waveform from the simulation state. Only valid, if simulator was
# instantiated with ``keep_waveforms=True``."""
if line < 0:
return [TMAX]
mem, wcap, _ = self.sat[line]
if mem < 0:
return [TMAX]
return self.state[mem:mem + wcap, vector]
def wave_ppi(self, i, vector):
return self.wave(self.ppi_offset + i, vector)
def wave_ppo(self, o, vector):
return self.wave(self.ppo_offset + o, vector)
def capture(self, time=TMAX, sd=0.0, seed=1, cdata=None, offset=0):
"""Simulates a capture operation at all state-elements and primary outputs.
The capture analyzes the propagated waveforms at and around the given capture time and returns
various results for each capture operation.
:param time: The desired capture time. By default, a capture of the settled value is performed.
:param sd: A standard deviation for uncertainty in the actual capture time.
:param seed: The random seed for a capture with uncertainty.
:param cdata: An array to copy capture data into (optional). See the return value for details.
:param offset: An offset into the supplied capture data array.
:return: The capture data as numpy array.
The 3-dimensional capture data array contains for each interface node (axis 0),
and each test (axis 1), seven values:
0. Probability of capturing a 1 at the given capture time (same as next value, if no
standard deviation given).
1. A capture value decided by random sampling according to above probability and given seed.
2. The final value (assume a very late capture time).
3. True, if there was a premature capture (capture error), i.e. final value is different
from captured value.
4. Earliest arrival time. The time at which the output transitioned from its initial value.
5. Latest stabilization time. The time at which the output transitioned to its final value.
6. Overflow indicator. If non-zero, some signals in the input cone of this output had more
transitions than specified in ``wavecaps``. Some transitions have been discarded, the
final values in the waveforms are still valid.
"""
for i, node in enumerate(self.interface):
if len(node.ins) == 0: continue
for p in range(self.sims):
self.cdata[i, p] = self.capture_wave(self.ppo_offset + i, p, time, sd, seed)
if cdata is not None:
assert offset < cdata.shape[1]
cap_dim = min(cdata.shape[1] - offset, self.sims)
cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
self.lst_eat_valid = True
return self.cdata
def reassign(self, time=0.0):
"""Re-assigns the last capture to the appropriate pseudo-primary inputs. Generates a new set of
waveforms at the PPIs that start with the previous final value of that PPI, and transitions at the
given time to the value captured in a previous simulation. :py:func:`~WaveSim.capture` must be called
prior to this function. The final value of each PPI is taken from the randomly sampled concrete logic
values in the capture data.
:param time: The transition time at the inputs (usually 0.0).
"""
for i in range(len(self.interface)):
ppi_loc = self.sat[self.ppi_offset + i, 0]
ppo_loc = self.sat[self.ppo_offset + i, 0]
if ppi_loc < 0 or ppo_loc < 0: continue
for sidx in range(self.sims):
ival = self.val(self.ppi_offset + i, sidx, TMAX) > 0.5
oval = self.cdata[i, sidx, 1] > 0.5
toggle = 0
if ival:
self.state[ppi_loc, sidx] = TMIN
toggle += 1
if ival != oval:
self.state[ppi_loc + toggle, sidx] = time
toggle += 1
self.state[ppi_loc + toggle, sidx] = TMAX
def eat(self, line, vector):
eat = TMAX
for t in self.wave(line, vector):
if t >= TMAX: break
if t <= TMIN: continue
eat = min(eat, t)
return eat
def lst(self, line, vector):
lst = TMIN
for t in self.wave(line, vector):
if t >= TMAX: break
if t <= TMIN: continue
lst = max(lst, t)
return lst
def lst_ppo(self, o, vector):
if not self.lst_eat_valid:
self.capture()
return self.cdata[o, vector, 5]
def toggles(self, line, vector):
tog = 0
for t in self.wave(line, vector):
if t >= TMAX: break
if t <= TMIN: continue
tog += 1
return tog
def _vals(self, idx, vector, times, sd=0.0):
s_sqrt2 = sd * math.sqrt(2)
m = 0.5
accs = [0.0] * len(times)
values = [0] * len(times)
for t in self.wave(idx, vector):
if t >= TMAX: break
for idx, time in enumerate(times):
if t < time:
values[idx] = values[idx] ^ 1
m = -m
if t <= TMIN: continue
if s_sqrt2 > 0:
for idx, time in enumerate(times):
accs[idx] += m * (1 + math.erf((t - time) / s_sqrt2))
if (m < 0) and (s_sqrt2 > 0):
for idx, time in enumerate(times):
accs[idx] += 1
if s_sqrt2 == 0:
return values
return accs
def vals(self, line, vector, times, sd=0):
return self._vals(line, vector, times, sd)
def val(self, line, vector, time=TMAX, sd=0):
return self.capture_wave(line, vector, time, sd)[0]
def vals_ppo(self, o, vector, times, sd=0):
return self._vals(self.ppo_offset + o, vector, times, sd)
def val_ppo(self, o, vector, time=TMAX, sd=0):
if not self.lst_eat_valid:
self.capture(time, sd)
return self.cdata[o, vector, 0]
def capture_wave(self, line, vector, time=TMAX, sd=0.0, seed=1):
s_sqrt2 = sd * math.sqrt(2)
m = 0.5
acc = 0.0
eat = TMAX
lst = TMIN
tog = 0
ovl = 0
val = int(0)
final = int(0)
for t in self.wave(line, vector):
if t >= TMAX:
if t == TMAX_OVL:
ovl = 1
break
m = -m
final ^= 1
if t < time:
val ^= 1
if t <= TMIN: continue
if s_sqrt2 > 0:
acc += m * (1 + math.erf((t - time) / s_sqrt2))
eat = min(eat, t)
lst = max(lst, t)
tog += 1
if s_sqrt2 > 0:
if m < 0:
acc += 1
if acc >= 0.99:
val = 1
elif acc > 0.01:
seed = (seed << 4) + (vector << 20) + (line-self.ppo_offset << 1)
seed = int(0xDEECE66D) * seed + 0xB
seed = int(0xDEECE66D) * seed + 0xB
rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
val = rnd < acc
else:
val = 0
else:
acc = val
return acc, val, final, (val != final), eat, lst, ovl
@numba.njit
def level_eval(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sdata, sd, seed):
overflows = 0
for op_idx in range(op_start, op_stop):
op = ops[op_idx]
for st_idx in range(st_start, st_stop):
overflows += wave_eval(op, state, sat, st_idx, line_times, sdata[st_idx], sd, seed)
return overflows
@numba.njit
def rand_gauss(seed, sd):
clamp = 0.5
if sd <= 0.0:
return 1.0
while True:
x = -6.0
for _ in range(12):
seed = int(0xDEECE66D) * seed + 0xB
x += float((seed >> 8) & 0xffffff) / float(1 << 24)
x *= sd
if abs(x) <= clamp:
break
return x + 1.0
@numba.njit
def wave_eval(op, state, sat, st_idx, line_times, sdata, sd=0.0, seed=0):
lut, z_idx, a_idx, b_idx = op
overflows = int(0)
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
a_mem = sat[a_idx, 0]
b_mem = sat[b_idx, 0]
z_mem, z_cap, _ = sat[z_idx]
a_cur = int(0)
b_cur = int(0)
z_cur = lut & 1
if z_cur == 1:
state[z_mem, st_idx] = TMIN
a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) * sdata[0]
if int(sdata[1]) == a_idx: a += sdata[2+z_cur]
b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) * sdata[0]
if int(sdata[1]) == b_idx: b += sdata[2+z_cur]
previous_t = TMIN
current_t = min(a, b)
inputs = int(0)
while current_t < TMAX:
z_val = z_cur & 1
if b < a:
b_cur += 1
b = state[b_mem + b_cur, st_idx]
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) * sdata[0]
thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) * sdata[0]
if int(sdata[1]) == b_idx:
b += sdata[2+(z_val^1)]
thresh += sdata[2+z_val]
inputs ^= 2
next_t = b
else:
a_cur += 1
a = state[a_mem + a_cur, st_idx]
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) * sdata[0]
thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) * sdata[0]
if int(sdata[1]) == a_idx:
a += sdata[2+(z_val^1)]
thresh += sdata[2+z_val]
inputs ^= 1
next_t = a
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
# following toggle is earlier OR
# pulse is wide enough ) AND enough space in z_mem.
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
if z_cur < (z_cap - 1):
state[z_mem + z_cur, st_idx] = current_t
previous_t = current_t
z_cur += 1
else:
overflows += 1
previous_t = state[z_mem + z_cur - 1, st_idx]
z_cur -= 1
else:
z_cur -= 1
if z_cur > 0:
previous_t = state[z_mem + z_cur - 1, st_idx]
else:
previous_t = TMIN
current_t = min(a, b)
if overflows > 0:
state[z_mem + z_cur, st_idx] = TMAX_OVL
else:
state[z_mem + z_cur, st_idx] = a if a > b else b # propagate overflow flags by storing biggest TMAX from input
return overflows
class WaveSimCuda(WaveSim):
"""A GPU-accelerated waveform-based combinational logic timing simulator.
The API is the same as for :py:class:`WaveSim`.
All internal memories are mirrored into GPU memory upon construction.
Some operations like access to single waveforms can involve large communication overheads.
"""
def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
self.d_state = cuda.to_device(self.state)
self.d_sat = cuda.to_device(self.sat)
self.d_ops = cuda.to_device(self.ops)
self.d_timing = cuda.to_device(self.timing)
self.d_tdata = cuda.to_device(self.tdata)
self.d_cdata = cuda.to_device(self.cdata)
self.d_sdata = cuda.to_device(self.sdata)
self._block_dim = (32, 16)
def __repr__(self):
total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.timing.nbytes + \
self.tdata.nbytes + self.cdata.nbytes
return f'<WaveSimCuda {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
def get_line_delay(self, line, polarity):
return self.d_timing[line, 0, polarity]
def set_line_delay(self, line, polarity, delay):
self.d_timing[line, 0, polarity] = delay
def sdata_to_device(self):
cuda.to_device(self.sdata, to=self.d_sdata)
def assign(self, vectors, time=0.0, offset=0):
assert (offset % 8) == 0
byte_offset = offset // 8
assert byte_offset < vectors.data.shape[-1]
pdim = min(vectors.data.shape[-1] - byte_offset, self.tdata.shape[-1])
self.tdata[..., 0:pdim] = vectors.data[..., byte_offset:pdim + byte_offset]
if vectors.m == 2:
self.tdata[:, 2, 0:pdim] = 0
cuda.to_device(self.tdata, to=self.d_tdata)
grid_dim = self._grid_dim(self.sims, len(self.interface))
assign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset,
len(self.interface), self.d_tdata, time)
def _grid_dim(self, x, y):
gx = math.ceil(x / self._block_dim[0])
gy = math.ceil(y / self._block_dim[1])
return gx, gy
def propagate(self, sims=None, sd=0.0, seed=1):
sims = min(sims or self.sims, self.sims)
for op_start, op_stop in zip(self.level_starts, self.level_stops):
grid_dim = self._grid_dim(sims, op_stop - op_start)
wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.sat, int(0),
sims, self.d_timing, self.d_sdata, sd, seed)
cuda.synchronize()
self.lst_eat_valid = False
def wave(self, line, vector):
if line < 0:
return [TMAX]
mem, wcap, _ = self.sat[line]
if mem < 0:
return [TMAX]
return self.d_state[mem:mem + wcap, vector]
def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
grid_dim = self._grid_dim(self.sims, len(self.interface))
capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
self.d_cdata, time, sd * math.sqrt(2), seed)
self.cdata[...] = self.d_cdata
if cdata is not None:
assert offset < cdata.shape[1]
cap_dim = min(cdata.shape[1] - offset, self.sims)
cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
self.lst_eat_valid = True
return self.cdata
def reassign(self, time=0.0):
grid_dim = self._grid_dim(self.sims, len(self.interface))
reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset,
self.d_cdata, time)
cuda.synchronize()
def wavecaps(self):
gx = math.ceil(len(self.circuit.lines) / 512)
wavecaps_kernel[gx, 512](self.d_state, self.d_sat, self.sims)
self.sat[...] = self.d_sat
return self.sat[..., 2]
@cuda.jit()
def wavecaps_kernel(state, sat, sims):
idx = cuda.grid(1)
if idx >= len(sat): return
lidx, lcap, _ = sat[idx]
if lidx < 0: return
wcap = 0
for sidx in range(sims):
for tidx in range(lcap):
t = state[lidx + tidx, sidx]
if tidx > wcap:
wcap = tidx
if t >= TMAX: break
sat[idx, 2] = wcap + 1
@cuda.jit()
def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
vector, y = cuda.grid(2)
if vector >= state.shape[-1]: return
if ppo_offset + y >= len(sat): return
ppo, _, _ = sat[ppo_offset + y]
ppi, ppi_cap, _ = sat[ppi_offset + y]
if ppo < 0: return
if ppi < 0: return
ppo_val = int(cdata[y, vector, 1])
ppi_val = int(0)
for tidx in range(ppi_cap):
t = state[ppi + tidx, vector]
if t >= TMAX: break
ppi_val ^= 1
# make new waveform at PPI
toggle = 0
if ppi_val:
state[ppi + toggle, vector] = TMIN
toggle += 1
if ppi_val != ppo_val:
state[ppi + toggle, vector] = ppi_time
toggle += 1
state[ppi + toggle, vector] = TMAX
@cuda.jit()
def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed):
x, y = cuda.grid(2)
if ppo_offset + y >= len(sat): return
line, tdim, _ = sat[ppo_offset + y]
if line < 0: return
if x >= state.shape[-1]: return
vector = x
m = 0.5
acc = 0.0
eat = TMAX
lst = TMIN
tog = 0
ovl = 0
val = int(0)
final = int(0)
for tidx in range(tdim):
t = state[line + tidx, vector]
if t >= TMAX:
if t == TMAX_OVL:
ovl = 1
break
m = -m
final ^= 1
if t < time:
val ^= 1
if t <= TMIN: continue
if s_sqrt2 > 0:
acc += m * (1 + math.erf((t - time) / s_sqrt2))
eat = min(eat, t)
lst = max(lst, t)
tog += 1
if s_sqrt2 > 0:
if m < 0:
acc += 1
if acc >= 0.99:
val = 1
elif acc > 0.01:
seed = (seed << 4) + (vector << 20) + (y << 1)
seed = int(0xDEECE66D) * seed + 0xB
seed = int(0xDEECE66D) * seed + 0xB
rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
val = rnd < acc
else:
val = 0
else:
acc = val
cdata[y, vector, 0] = acc
cdata[y, vector, 1] = val
cdata[y, vector, 2] = final
cdata[y, vector, 3] = (val != final)
cdata[y, vector, 4] = eat
cdata[y, vector, 5] = lst
cdata[y, vector, 6] = ovl
@cuda.jit()
def assign_kernel(state, sat, ppi_offset, intf_len, tdata, time):
x, y = cuda.grid(2)
if y >= intf_len: return
line = sat[ppi_offset + y, 0]
if line < 0: return
sdim = state.shape[-1]
if x >= sdim: return
vector = x
a0 = tdata[y, 0, vector // 8]
a1 = tdata[y, 1, vector // 8]
a2 = tdata[y, 2, vector // 8]
m = np.uint8(1 << (7 - (vector % 8)))
toggle = 0
if a1 & m:
state[line + toggle, x] = TMIN
toggle += 1
if (a2 & m) and ((a0 & m) != (a1 & m)):
state[line + toggle, x] = time
toggle += 1
state[line + toggle, x] = TMAX
@cuda.jit(device=True)
def rand_gauss_dev(seed, sd):
clamp = 0.5
if sd <= 0.0:
return 1.0
while True:
x = -6.0
for _ in range(12):
seed = int(0xDEECE66D) * seed + 0xB
x += float((seed >> 8) & 0xffffff) / float(1 << 24)
x *= sd
if abs(x) <= clamp:
break
return x + 1.0
@cuda.jit()
def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sdata, sd, seed):
x, y = cuda.grid(2)
st_idx = st_start + x
op_idx = op_start + y
if st_idx >= st_stop: return
if op_idx >= op_stop: return
lut = ops[op_idx, 0]
z_idx = ops[op_idx, 1]
a_idx = ops[op_idx, 2]
b_idx = ops[op_idx, 3]
overflows = int(0)
sdata = sdata[st_idx]
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
a_mem = sat[a_idx, 0]
b_mem = sat[b_idx, 0]
z_mem, z_cap, _ = sat[z_idx]
a_cur = int(0)
b_cur = int(0)
z_cur = lut & 1
if z_cur == 1:
state[z_mem, st_idx] = TMIN
a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_dev(_seed ^ a_mem ^ z_cur, sd) * sdata[0]
if int(sdata[1]) == a_idx: a += sdata[2+z_cur]
b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_dev(_seed ^ b_mem ^ z_cur, sd) * sdata[0]
if int(sdata[1]) == b_idx: b += sdata[2+z_cur]
previous_t = TMIN
current_t = min(a, b)
inputs = int(0)
while current_t < TMAX:
z_val = z_cur & 1
if b < a:
b_cur += 1
b = state[b_mem + b_cur, st_idx]
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ b_mem ^ z_val ^ 1, sd) * sdata[0]
thresh = line_times[b_idx, 1, z_val] * rand_gauss_dev(_seed ^ b_mem ^ z_val, sd) * sdata[0]
if int(sdata[1]) == b_idx:
b += sdata[2+(z_val^1)]
thresh += sdata[2+z_val]
inputs ^= 2
next_t = b
else:
a_cur += 1
a = state[a_mem + a_cur, st_idx]
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ a_mem ^ z_val ^ 1, sd) * sdata[0]
thresh = line_times[a_idx, 1, z_val] * rand_gauss_dev(_seed ^ a_mem ^ z_val, sd) * sdata[0]
if int(sdata[1]) == a_idx:
a += sdata[2+(z_val^1)]
thresh += sdata[2+z_val]
inputs ^= 1
next_t = a
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
# following toggle is earlier OR
# pulse is wide enough ) AND enough space in z_mem.
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
if z_cur < (z_cap - 1):
state[z_mem + z_cur, st_idx] = current_t
previous_t = current_t
z_cur += 1
else:
overflows += 1
previous_t = state[z_mem + z_cur - 1, st_idx]
z_cur -= 1
else:
z_cur -= 1
if z_cur > 0:
previous_t = state[z_mem + z_cur - 1, st_idx]
else:
previous_t = TMIN
current_t = min(a, b)
if overflows > 0:
state[z_mem + z_cur, st_idx] = TMAX_OVL
else:
state[z_mem + z_cur, st_idx] = a if a > b else b # propagate overflow flags by storing biggest TMAX from input
Loading…
Cancel
Save