switch to new wave_sim, silence occupancy warnings

3 years ago · 8da4a62bce
7 changed files with 1535 additions and 1523 deletions
--- a/src/kyupy/init.py
+++ b/src/kyupy/init.py
@@ -211,6 +211,8 @@ if importlib.util.find_spec('numba') is not None:
    except CudaSupportError:
        log.warn('Cuda unavailable. Falling back to pure Python.')
        cuda = MockCuda()
+    from numba.core import config
+    config.CUDA_LOW_OCCUPANCY_WARNINGS = False
 else:
    numba = MockNumba()
    """If Numba is available on the system, it is the actual ``numba`` package.
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
--- a/src/kyupy/wave_sim4.py
+++ b/src/kyupy/wave_sim4.py
@@ -1,509 +0,0 @@
-"""High-throughput combinational logic timing simulators.
-
-These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`.
-They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
-Instead of propagating logic values, these simulators propagate signal histories (waveforms).
-They are designed to run many simulations in parallel and while their latencies are quite high, they can achieve
-high throughput.
-
-The simulators are not event-based and are not capable of simulating sequential circuits directly.
-"""
-
-import math
-
-import numpy as np
-
-from . import numba, cuda, hr_bytes
-from .sim import SimOps
-
-
-TMAX = np.float32(2 ** 127)
-"""A large 32-bit floating point value used to mark the end of a waveform."""
-TMAX_OVL = np.float32(1.1 * 2 ** 127)
-"""A large 32-bit floating point value used to mark the end of a waveform that
-may be incomplete due to an overflow."""
-TMIN = np.float32(-2 ** 127)
-"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
-
-
-class WaveSim(SimOps):
-    """A waveform-based combinational logic timing simulator running on CPU.
-
-    :param circuit: The circuit to simulate.
-    :param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details)
-    :param sims: The number of parallel simulations.
-    :param c_caps: The number of floats available in each waveform. Values must be positive and a multiple of 4.
-        Waveforms encode the signal switching history by storing transition times.
-        The waveform capacity roughly corresponds to the number of transitions
-        that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are
-        generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow
-        flag is set. If an integer is given, all waveforms are set to that same capacity. With an array of length
-        ``len(circuit.lines)`` the capacity is set for each intermediate waveform individually.
-    :param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time
-        by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes
-        are ignored.
-    :param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
-        memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
-    """
-    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
-        assert c_caps > 0 and c_caps % 4 == 0
-        super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
-        self.sims = sims
-        
-        self.c_len *= 4
-        self.vat[...,0:2] *= 4
-
-        self.timing = np.zeros((self.c_len, 2, 2))
-        self.timing[:len(timing)] = timing
-
-        self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
-        self.s = np.zeros((len(self.s_nodes), sims, 11), dtype=np.float32)
-        """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
-
-        The first 3 values are read by ``s_to_c()``.
-        The remaining values are written by ``c_to_s()``.
-
-        The elements are as follows:
-        * ``s[..., 0]`` (P)PI initial value
-        * ``s[..., 1]`` (P)PI transition time
-        * ``s[..., 2]`` (P)PI final value
-        * ``s[..., 3]`` (P)PO initial value
-        * ``s[..., 4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value.
-        * ``s[..., 5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value.
-        * ``s[..., 6]`` (P)PO final value
-        * ``s[..., 7]`` (P)PO capture value: probability of capturing a 1 at a given capture time
-        * ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
-        * ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
-        * ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
-          transitions than specified in ``c_caps``. Some transitions have been discarded, the
-          final values in the waveforms are still valid.
-        """
-                     
-        self.params = np.zeros((sims, 4), dtype=np.float32)
-        self.params[...,0] = 1.0
-
-        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.vat, self.ops, self.params)])
-
-        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
-        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
-        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
-
-        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
-        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
-
-        self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0]
-        self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0]
-        self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0]
-        self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0]
-
-        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
-        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
-
-    def __repr__(self):
-        return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
-               f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>'
-
-    def s_to_c(self):
-        """Transfers values of sequential elements and primary inputs to the combinational portion.
-
-        Based on the data in ``self.s``, waveforms are generated on the input lines of the circuit.
-        It modifies ``self.c``.
-        """
-        sins = np.moveaxis(self.s[self.pippi_s_locs], -1, 0)
-        cond = (sins[2] != 0) + 2*(sins[0] != 0)  # choices order: 0 R F 1
-        self.c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN])
-        self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
-        self.c[self.pippi_c_locs+2] = TMAX
-
-    def c_prop(self, sims=None, sd=0.0, seed=1):
-        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
-
-        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
-        :param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
-        :param seed: Random seed for delay variations.
-        """
-        sims = min(sims or self.sims, self.sims)
-        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
-                                         self.timing, self.params, sd, seed)
-
-    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
-        """Simulates a capture operation at all sequential elements and primary outputs.
-
-        Propagated waveforms in ``self.c`` at and around the given capture time are analyzed and
-        the results are stored in ``self.s``.
-
-        :param time: The desired capture time. By default, a capture of the settled value is performed.
-        :param sd: A standard deviation for uncertainty in the actual capture time.
-        :param seed: The random seed for a capture with uncertainty.
-        """
-        for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]):
-            for vector in range(self.sims):
-                self.s[s_loc, vector, 3:] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
-
-    def s_ppo_to_ppi(self, time=0.0):
-        """Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). 
-        Each PPI transition is constructed from its previous final value, the
-        given time, and the sampled captured value of its PPO. Reads and modifies ``self.s``.
-
-        :param time: The transition time at the inputs (usually 0.0).
-        """
-        self.s[self.ppio_s_locs, :, 0] = self.s[self.ppio_s_locs, :, 2]
-        self.s[self.ppio_s_locs, :, 1] = time
-        self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8]
-
-
-@numba.njit
-def rand_gauss_cpu(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for _ in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@numba.njit
-def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
-    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
-
-    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-call) >>>
-    overflows = int(0)
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = vat[a_idx, 0]
-    b_mem = vat[b_idx, 0]
-    c_mem = vat[c_idx, 0]
-    d_mem = vat[d_idx, 0]
-    z_mem, z_cap, _ = vat[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    c_cur = int(0)
-    d_cur = int(0)                                          
-    z_cur = lut & 1
-    if z_cur == 1:
-        cbuf[z_mem, st_idx] = TMIN
-
-    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == a_idx: a += param[2+z_cur]
-    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == b_idx: b += param[2+z_cur]
-    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == c_idx: c += param[2+z_cur]
-    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == d_idx: d += param[2+z_cur]
-    
-    previous_t = TMIN
-
-    current_t = min(a, b, c, d)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if a == current_t:
-            a_cur += 1
-            a = cbuf[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == a_idx:
-                a += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 1
-            next_t = a   
-        
-        elif b == current_t:
-            b_cur += 1
-            b = cbuf[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == b_idx:
-                b += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 2
-            next_t = b
-                
-        elif c == current_t:
-            c_cur += 1
-            c = cbuf[c_mem + c_cur, st_idx]
-            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == c_idx:
-                c += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 4
-            next_t = c 
-                     
-        else:
-            d_cur += 1
-            d = cbuf[d_mem + d_cur, st_idx]
-            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == d_idx:
-                d += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 8
-            next_t = d 
-       
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #   following toggle is earlier OR
-            #   pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    cbuf[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
-                
-        current_t = min(a, b, c, d)
-
-    # generate overflow flag or propagate from input
-    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
-    
-
-@numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
-    overflows = 0
-    for op_idx in range(op_start, op_stop):
-        op = ops[op_idx]
-        for st_idx in range(st_start, st_stop):
-            wave_eval_cpu(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
-
-
-@numba.njit
-def wave_capture_cpu(c, c_loc, c_len, vector, time=TMAX, sd=0.0, seed=1):
-    s_sqrt2 = sd * math.sqrt(2)
-    m = 0.5
-    acc = 0.0
-    eat = TMAX
-    lst = TMIN
-    tog = 0
-    ovl = 0
-    val = int(0)
-    final = int(0)
-    w = c[c_loc:c_loc+c_len, vector]
-    for t in w:
-        if t >= TMAX:
-            if t == TMAX_OVL:
-                ovl = 1
-            break
-        m = -m
-        final ^= 1
-        if t < time:
-            val ^= 1
-        if t <= TMIN: continue
-        if s_sqrt2 > 0:
-            acc += m * (1 + math.erf((t - time) / s_sqrt2))
-        eat = min(eat, t)
-        lst = max(lst, t)
-        tog += 1
-    if s_sqrt2 > 0:
-        if m < 0:
-            acc += 1
-        if acc >= 0.99:
-            val = 1
-        elif acc > 0.01:
-            seed = (seed << 4) + (vector << 20) + c_loc
-            seed = int(0xDEECE66D) * seed + 0xB
-            seed = int(0xDEECE66D) * seed + 0xB
-            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
-            val = rnd < acc
-        else:
-            val = 0
-    else:
-        acc = val
-
-    return (w[0] <= TMIN), eat, lst, final, acc, val, 0, ovl
-
-
-class WaveSimCuda(WaveSim):
-    """A GPU-accelerated waveform-based combinational logic timing simulator.
-
-    The API is the same as for :py:class:`WaveSim`.
-    All internal memories are mirrored into GPU memory upon construction.
-    Some operations like access to single waveforms can involve large communication overheads.
-    """
-    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
-        super().__init__(circuit, timing, sims, c_caps, c_reuse, strip_forks)
-
-        self.c = cuda.to_device(self.c)
-        self.s = cuda.to_device(self.s)
-        self.ops = cuda.to_device(self.ops)
-        self.vat = cuda.to_device(self.vat)
-        self.timing = cuda.to_device(self.timing)
-        self.params = cuda.to_device(self.params)
-        
-        self._block_dim = (32, 16)
-
-    # TODO implement on GPU
-    #def s_to_c(self):
-
-    def _grid_dim(self, x, y):
-        gx = math.ceil(x / self._block_dim[0])
-        gy = math.ceil(y / self._block_dim[1])
-        return gx, gy
-    
-    def c_prop(self, sims=None, sd=0.0, seed=1):
-        sims = min(sims or self.sims, self.sims)
-        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0),
-                sims, self.timing, self.params, sd, seed)
-        cuda.synchronize()
-    
-    # TODO implement on GPU
-    #def c_to_s(self):
-    
-    # TODO implement on GPU
-    #def s_ppo_to_ppi(self, time=0.0):
-    
-
-@cuda.jit(device=True)
-def rand_gauss_gpu(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for _ in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_times, param, sd, seed):
-    x, y = cuda.grid(2)
-    st_idx = st_start + x
-    op_idx = op_start + y
-    if st_idx >= st_stop: return
-    if op_idx >= op_stop: return
-
-    lut = ops[op_idx, 0]
-    z_idx = ops[op_idx, 1]
-    a_idx = ops[op_idx, 2]
-    b_idx = ops[op_idx, 3]
-    c_idx = ops[op_idx, 4]
-    d_idx = ops[op_idx, 5]
-
-    param = param[st_idx]
-    
-    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-call) >>>
-    overflows = int(0)
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = vat[a_idx, 0]
-    b_mem = vat[b_idx, 0]
-    c_mem = vat[c_idx, 0]
-    d_mem = vat[d_idx, 0]
-    z_mem, z_cap, _ = vat[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    c_cur = int(0)
-    d_cur = int(0)                                          
-    z_cur = lut & 1
-    if z_cur == 1:
-        cbuf[z_mem, st_idx] = TMIN
-
-    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == a_idx: a += param[2+z_cur]
-    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == b_idx: b += param[2+z_cur]
-    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == c_idx: c += param[2+z_cur]
-    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == d_idx: d += param[2+z_cur]
-    
-    previous_t = TMIN
-
-    current_t = min(a, b, c, d)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if a == current_t:
-            a_cur += 1
-            a = cbuf[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == a_idx:
-                a += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 1
-            next_t = a   
-        
-        elif b == current_t:
-            b_cur += 1
-            b = cbuf[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == b_idx:
-                b += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 2
-            next_t = b
-                
-        elif c == current_t:
-            c_cur += 1
-            c = cbuf[c_mem + c_cur, st_idx]
-            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == c_idx:
-                c += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 4
-            next_t = c 
-                     
-        else:
-            d_cur += 1
-            d = cbuf[d_mem + d_cur, st_idx]
-            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == d_idx:
-                d += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 8
-            next_t = d 
-       
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #   following toggle is earlier OR
-            #   pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    cbuf[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
-                
-        current_t = min(a, b, c, d)
-
-    # generate overflow flag or propagate from input
-    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
--- a/src/kyupy/wave_sim_old.py
+++ b/src/kyupy/wave_sim_old.py
@@ -0,0 +1,961 @@
+"""High-throughput combinational logic timing simulators.
+
+These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`.
+They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
+Instead of propagating logic values, these simulators propagate signal histories (waveforms).
+They are designed to run many simulations in parallel and while their latencies are quite high, they can achieve
+high throughput.
+
+The simulators are not event-based and are not capable of simulating sequential circuits directly.
+
+Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the derived class
+:py:class:`WaveSimCuda` runs on the GPU.
+"""
+
+import math
+from bisect import bisect, insort_left
+
+import numpy as np
+
+from . import numba, cuda, hr_bytes
+
+
+TMAX = np.float32(2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform."""
+TMAX_OVL = np.float32(1.1 * 2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform that
+may be incomplete due to an overflow."""
+TMIN = np.float32(-2 ** 127)
+"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
+
+
+class Heap:  # bookkeeping-only allocator: hands out integer start locations for chunks of a requested size
+    def __init__(self):  # start with an empty managed area
+        self.chunks = dict()  # map start location to chunk size (used and released chunks alike)
+        self.released = list()  # start locations of released chunks; searched with bisect, so expected in ascending order
+        self.current_size = 0  # end of the managed area; the next fresh chunk starts here
+        self.max_size = 0  # largest value current_size has reached so far
+
+    def alloc(self, size):  # return the start location of a chunk of exactly `size`, reusing released space if possible
+        for idx, loc in enumerate(self.released):  # first fit: take the first released chunk that is large enough
+            if self.chunks[loc] == size:  # exact fit, hand out the whole released chunk
+                del self.released[idx]
+                return loc
+            if self.chunks[loc] > size:  # split chunk
+                chunksize = self.chunks[loc]
+                self.chunks[loc] = size  # front part becomes the allocated chunk
+                self.chunks[loc + size] = chunksize - size  # remainder becomes its own (still released) chunk
+                self.released[idx] = loc + size  # move released pointer: loc -> loc+size
+                return loc
+        # no released chunk is large enough; make a new one at the end of the managed area
+        loc = self.current_size
+        self.chunks[loc] = size
+        self.current_size += size
+        self.max_size = max(self.max_size, self.current_size)  # track the high-water mark
+        return loc
+
+    def free(self, loc):  # release the chunk starting at `loc`, merging it with adjacent released chunks
+        size = self.chunks[loc]  # raises KeyError if `loc` is not a known chunk start
+        if loc + size == self.current_size:  # end of managed area, remove chunk
+            del self.chunks[loc]
+            self.current_size -= size  # shrink the managed area instead of recording a released chunk
+            # check and remove prev chunk if free
+            if len(self.released) > 0:
+                prev = self.released[-1]  # last entry; the highest released location if the list is in ascending order
+                if prev + self.chunks[prev] == self.current_size:  # released chunk now ends the managed area
+                    chunksize = self.chunks[prev]
+                    del self.chunks[prev]
+                    del self.released[-1]
+                    self.current_size -= chunksize  # shrink once more; only this one trailing chunk is removed
+            return
+        released_idx = bisect(self.released, loc)  # index of the first released chunk located after `loc`
+        if released_idx < len(self.released) and loc + size == self.released[released_idx]:  # next chunk is free, merge
+            chunksize = size + self.chunks[loc + size]
+            del self.chunks[loc + size]
+            self.chunks[loc] = chunksize
+            size = self.chunks[loc]  # size now covers the merged chunk
+            self.released[released_idx] = loc  # merged chunk takes over the entry of the absorbed one
+        else:
+            insort_left(self.released, loc)  # put in a new release
+        if released_idx > 0:  # check if previous chunk is free
+            prev = self.released[released_idx - 1]
+            if prev + self.chunks[prev] == loc:  # previous chunk is adjacent to freed one, merge
+                chunksize = size + self.chunks[prev]
+                del self.chunks[loc]  # `loc` is absorbed into `prev`
+                self.chunks[prev] = chunksize
+                del self.released[released_idx]  # drop the entry for `loc`; the entry for `prev` now covers it
+
+    def __repr__(self):  # one line per chunk in ascending location order: '<loc>: free|used <size>'
+        r = []
+        for loc in sorted(self.chunks.keys()):
+            size = self.chunks[loc]
+            released_idx = bisect(self.released, loc)  # position just past `loc` if it is in the released list
+            is_released = released_idx > 0 and len(self.released) > 0 and self.released[released_idx - 1] == loc
+            r.append(f'{loc:5d}: {"free" if is_released else "used"} {size}')
+        return "\n".join(r)
+
+
+class WaveSim:
+    """A waveform-based combinational logic timing simulator running on CPU.
+
+    :param circuit: The circuit to simulate.
+    :param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details)
+    :param sims: The number of parallel simulations.
+    :param wavecaps: The number of floats available in each waveform. Waveforms are encoding the signal switching
+        history by storing transition times. The waveform capacity roughly corresponds to the number of transitions
+        that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are
+        generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow
+        flag is set. If an integer is given, all waveforms are set to that same capacity. With an array of length
+        ``len(circuit.lines)`` the capacity can be controlled for each intermediate waveform individually.
+    :param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time
+        by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes
+        are ignored.
+    :param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
+        memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
+    """
+    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
+        self.circuit = circuit
+        self.sims = sims
+        self.overflows = 0
+        self.interface = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+
+        self.lst_eat_valid = False
+
+        self.cdata = np.zeros((len(self.interface), sims, 7), dtype='float32')
+                     
+        self.sdata = np.zeros((sims, 4), dtype='float32')
+        self.sdata[...,0] = 1.0
+
+        if isinstance(wavecaps, int):
+            wavecaps = [wavecaps] * len(circuit.lines)
+
+        intf_wavecap = 4  # sufficient for storing only 1 transition.
+
+        # indices for state allocation table (sat)
+        self.zero_idx = len(circuit.lines)
+        self.tmp_idx = self.zero_idx + 1
+        self.ppi_offset = self.tmp_idx + 1
+        self.ppo_offset = self.ppi_offset + len(self.interface)
+        self.sat_length = self.ppo_offset + len(self.interface)
+
+        # translate circuit structure into self.ops
+        ops = []
+        interface_dict = dict((n, i) for i, n in enumerate(self.interface))
+        for n in circuit.topological_order():
+            if n in interface_dict:
+                inp_idx = self.ppi_offset + interface_dict[n]
+                if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
+                    ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx))
+                if 'dff' in n.kind.lower():  # second output of DFF is inverted
+                    if len(n.outs) > 1 and n.outs[1] is not None:
+                        ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx))
+                else:  # if not DFF, no output is inverted.
+                    for o_line in n.outs[1:]:
+                        if o_line is not None:
+                            ops.append((0b1010, o_line.index, inp_idx, self.zero_idx))
+            else:  # regular node, not PI/PPI or PO/PPO
+                o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
+                i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx
+                i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx
+                kind = n.kind.lower()
+                if kind == '__fork__':
+                    if not strip_forks:
+                        for o_line in n.outs:
+                            if o_line is not None:
+                                ops.append((0b1010, o_line.index, i0_idx, i1_idx))
+                elif kind.startswith('nand'):
+                    ops.append((0b0111, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('nor'):
+                    ops.append((0b0001, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('and'):
+                    ops.append((0b1000, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('or'):
+                    ops.append((0b1110, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('xor'):
+                    ops.append((0b0110, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('xnor'):
+                    ops.append((0b1001, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('not') or kind.startswith('inv') or kind.startswith('ibuf'):
+                    ops.append((0b0101, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('buf') or kind.startswith('nbuf'):
+                    ops.append((0b1010, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('__const1__') or kind.startswith('tieh'):
+                    ops.append((0b0101, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('__const0__') or kind.startswith('tiel'):
+                    ops.append((0b1010, o0_idx, i0_idx, i1_idx))
+                else:
+                    print('unknown gate type', kind)
+        self.ops = np.asarray(ops, dtype='int32')
+
+        # create a map from fanout lines to stem lines for fork stripping
+        stems = np.zeros(self.sat_length, dtype='int32') - 1  # default to -1: 'no fanout line'
+        if strip_forks:
+            for f in circuit.forks.values():
+                prev_line = f.ins[0]
+                while prev_line.driver.kind == '__fork__':
+                    prev_line = prev_line.driver.ins[0]
+                stem_idx = prev_line.index
+                for ol in f.outs:
+                    stems[ol] = stem_idx
+
+        # calculate level (distance from PI/PPI) and reference count for each line
+        levels = np.zeros(self.sat_length, dtype='int32')
+        ref_count = np.zeros(self.sat_length, dtype='int32')
+        level_starts = [0]
+        current_level = 1
+        for i, op in enumerate(self.ops):
+            # if we fork-strip, always take the stems for determining fan-in level
+            i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
+            i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
+            if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level:
+                current_level += 1
+                level_starts.append(i)
+            levels[op[1]] = current_level  # set level of the output line
+            ref_count[i0_idx] += 1
+            ref_count[i1_idx] += 1
+        self.level_starts = np.asarray(level_starts, dtype='int32')
+        self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
+
+        # state allocation table. maps line and interface indices to self.state memory locations
+        self.sat = np.zeros((self.sat_length, 3), dtype='int')
+        self.sat[:, 0] = -1
+
+        h = Heap()
+
+        # allocate and keep memory for special fields
+        self.sat[self.zero_idx] = h.alloc(intf_wavecap), intf_wavecap, 0
+        self.sat[self.tmp_idx] = h.alloc(intf_wavecap), intf_wavecap, 0
+        ref_count[self.zero_idx] += 1
+        ref_count[self.tmp_idx] += 1
+
+        # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
+        for i, n in enumerate(self.interface):
+            if len(n.outs) > 0:
+                self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0
+                ref_count[self.ppi_offset + i] += 1
+            if len(n.ins) > 0:
+                i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
+                ref_count[i0_idx] += 1
+
+        # allocate memory for the rest of the circuit
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            free_list = []
+            for op in self.ops[op_start:op_stop]:
+                # if we fork-strip, always take the stems
+                i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
+                i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
+                ref_count[i0_idx] -= 1
+                ref_count[i1_idx] -= 1
+                if ref_count[i0_idx] <= 0: free_list.append(self.sat[i0_idx, 0])
+                if ref_count[i1_idx] <= 0: free_list.append(self.sat[i1_idx, 0])
+                o_idx = op[1]
+                cap = wavecaps[o_idx]
+                self.sat[o_idx] = h.alloc(cap), cap, 0
+            if not keep_waveforms:
+                for loc in free_list:
+                    h.free(loc)
+
+        # copy memory location and capacity from stems to fanout lines
+        for lidx, stem in enumerate(stems):
+            if stem >= 0:  # if at a fanout line
+                self.sat[lidx] = self.sat[stem]
+
+        # copy memory location to PO/PPO area
+        for i, n in enumerate(self.interface):
+            if len(n.ins) > 0:
+                self.sat[self.ppo_offset + i] = self.sat[n.ins[0]]
+
+        # pad timing
+        self.timing = np.zeros((self.sat_length, 2, 2))
+        self.timing[:len(timing)] = timing
+
+        # allocate self.state
+        self.state = np.zeros((h.max_size, sims), dtype='float32') + TMAX
+
+        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
+        m0 = ~m1
+        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
+
+    def __repr__(self):
+        """Returns a one-line summary with circuit name, number of simulations, ops, levels and memory use."""
+        # The timing table is allocated per simulator as well, so it is counted here
+        # just like WaveSimCuda.__repr__ counts it.
+        total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.timing.nbytes + \
+                    self.cdata.nbytes
+        return f'<WaveSim {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
+
+    def get_line_delay(self, line, polarity):
+        """Looks up the delay the simulation model currently uses for ``line`` and ``polarity``.
+
+        :param line: Index of the line in the timing table.
+        :param polarity: Index into the last axis of the timing table.
+        :return: The entry ``self.timing[line, 0, polarity]``.
+        """
+        delay_table = self.timing
+        return delay_table[line, 0, polarity]
+
+    def set_line_delay(self, line, polarity, delay):
+        """Stores a new ``delay`` for ``line`` and ``polarity`` in the simulation model.
+
+        :param line: Index of the line in the timing table.
+        :param polarity: Index into the last axis of the timing table.
+        :param delay: The value written to ``self.timing[line, 0, polarity]``.
+        """
+        delay_table = self.timing
+        delay_table[line, 0, polarity] = delay
+
+    def assign(self, vectors, time=0.0, offset=0):
+        """Assigns new values to the primary inputs and state-elements.
+
+        :param vectors: The values to assign, preferably in 8-valued logic. Each value is converted to a
+            waveform with either one transition (``RISE``, ``FALL``) or no transition
+            (``ZERO``, ``ONE``, and others).
+        :type vectors: :py:class:`~kyupy.logic.BPArray`
+        :param time: The transition time of the generated waveforms.
+        :param offset: The offset into the vector set. The vector assigned to the first simulator is
+            ``vectors[offset]``.
+        """
+        count = min(len(vectors) - offset, self.sims)
+        for intf_pos in range(len(self.interface)):
+            base = self.sat[self.ppi_offset + intf_pos, 0]
+            if base < 0:
+                continue  # no waveform memory allocated for this interface position
+            for sim in range(count):
+                vec = sim + offset
+                planes = vectors.data[intf_pos, :, vec // 8]  # the packed byte of every bit plane
+                bit = self.mask[vec % 8][1]  # mask selecting this vector's bit within the byte
+                if len(planes) <= 2:
+                    # at most two bit planes: only the initial value is taken, no transition is generated
+                    starts_high = planes[0] & bit
+                    has_transition = False
+                else:
+                    starts_high = planes[1] & bit
+                    has_transition = (planes[2] & bit) and ((planes[0] & bit) != (planes[1] & bit))
+                slot = 0
+                if starts_high:
+                    self.state[base, sim] = TMIN
+                    slot += 1
+                if has_transition:
+                    self.state[base + slot, sim] = time
+                    slot += 1
+                self.state[base + slot, sim] = TMAX  # terminate the waveform
+
+    def propagate(self, sims=None, sd=0.0, seed=1):
+        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
+
+        The ops are evaluated level by level; the number of overflows reported by each level is
+        added to ``self.overflows``. Afterwards the cached capture data is marked as outdated.
+
+        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
+        :param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
+        :param seed: Random seed for delay variations.
+        """
+        n_active = min(sims or self.sims, self.sims)
+        level_bounds = zip(self.level_starts, self.level_stops)
+        for first_op, end_op in level_bounds:
+            level_overflows = level_eval(self.ops, first_op, end_op, self.state, self.sat, 0, n_active,
+                                         self.timing, self.sdata, sd, seed)
+            self.overflows += level_overflows
+        self.lst_eat_valid = False
+
+    def wave(self, line, vector):
+        """Returns the desired waveform from the simulation state. Only valid, if simulator was
+        instantiated with ``keep_waveforms=True``.
+
+        :param line: Index into the state allocation table.
+        :param vector: Index of the simulation (column of ``self.state``).
+        :return: The slice of ``self.state`` allocated to ``line``, or ``[TMAX]`` if ``line`` is negative
+            or has no memory allocated.
+        """
+        if line >= 0:
+            start, capacity, _unused = self.sat[line]
+            if start >= 0:
+                return self.state[start:start + capacity, vector]
+        # negative line index or unallocated line: an empty waveform
+        return [TMAX]
+
+    def wave_ppi(self, i, vector):
+        """Returns the waveform at the ``i``-th (pseudo) primary input for simulation ``vector``."""
+        ppi_line = self.ppi_offset + i
+        return self.wave(ppi_line, vector)
+
+    def wave_ppo(self, o, vector):
+        """Returns the waveform at the ``o``-th (pseudo) primary output for simulation ``vector``."""
+        ppo_line = self.ppo_offset + o
+        return self.wave(ppo_line, vector)
+
+    def capture(self, time=TMAX, sd=0.0, seed=1, cdata=None, offset=0):
+        """Simulates a capture operation at all state-elements and primary outputs.
+
+        Every waveform at an interface node with at least one input is analyzed at and around the
+        capture time; the results are stored in ``self.cdata`` and optionally copied to ``cdata``.
+
+        :param time: The desired capture time. By default, a capture of the settled value is performed.
+        :param sd: A standard deviation for uncertainty in the actual capture time.
+        :param seed: The random seed for a capture with uncertainty.
+        :param cdata: An array to copy capture data into (optional). See the return value for details.
+        :param offset: An offset into the supplied capture data array.
+        :return: The capture data as numpy array.
+
+            The 3-dimensional capture data array contains for each interface node (axis 0),
+            and each test (axis 1), seven values:
+
+            0. Probability of capturing a 1 at the given capture time (same as next value, if no
+               standard deviation given).
+            1. A capture value decided by random sampling according to above probability and given seed.
+            2. The final value (assume a very late capture time).
+            3. True, if there was a premature capture (capture error), i.e. final value is different
+               from captured value.
+            4. Earliest arrival time. The time at which the output transitioned from its initial value.
+            5. Latest stabilization time. The time at which the output transitioned to its final value.
+            6. Overflow indicator. If non-zero, some signals in the input cone of this output had more
+               transitions than specified in ``wavecaps``. Some transitions have been discarded, the
+               final values in the waveforms are still valid.
+        """
+        for intf_pos, node in enumerate(self.interface):
+            if len(node.ins) > 0:  # only nodes with an input capture anything
+                ppo_line = self.ppo_offset + intf_pos
+                for sim in range(self.sims):
+                    self.cdata[intf_pos, sim] = self.capture_wave(ppo_line, sim, time, sd, seed)
+        if cdata is not None:
+            assert offset < cdata.shape[1]
+            n_copy = min(cdata.shape[1] - offset, self.sims)
+            cdata[:, offset:offset + n_copy] = self.cdata[:, :n_copy]
+        self.lst_eat_valid = True
+        return self.cdata
+
+    def reassign(self, time=0.0):
+        """Re-assigns the last capture to the appropriate pseudo-primary inputs.
+
+        For every interface position that has both a PPI and a PPO waveform, a new PPI waveform is
+        generated. It starts with the previous final value of that PPI and transitions at the given
+        time to the value captured in a previous simulation. :py:func:`~WaveSim.capture` must be called
+        prior to this function. The final value of each PPI is taken from the randomly sampled concrete
+        logic values in the capture data.
+
+        :param time: The transition time at the inputs (usually 0.0).
+        """
+        for intf_pos in range(len(self.interface)):
+            ppi_mem = self.sat[self.ppi_offset + intf_pos, 0]
+            ppo_mem = self.sat[self.ppo_offset + intf_pos, 0]
+            if ppi_mem >= 0 and ppo_mem >= 0:
+                for sim in range(self.sims):
+                    # read the old final value before the waveform is overwritten below
+                    old_value = self.val(self.ppi_offset + intf_pos, sim, TMAX) > 0.5
+                    new_value = self.cdata[intf_pos, sim, 1] > 0.5
+                    slot = 0
+                    if old_value:
+                        self.state[ppi_mem, sim] = TMIN
+                        slot += 1
+                    if old_value != new_value:
+                        self.state[ppi_mem + slot, sim] = time
+                        slot += 1
+                    self.state[ppi_mem + slot, sim] = TMAX
+
+    def eat(self, line, vector):
+        """Returns the smallest time in the waveform of ``line`` for simulation ``vector``.
+
+        Entries at or below ``TMIN`` are ignored and the scan stops at the first entry at or above
+        ``TMAX``. If no time qualifies, ``TMAX`` is returned.
+        """
+        earliest = TMAX
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX:
+                break
+            if not (stamp <= TMIN):
+                earliest = min(earliest, stamp)
+        return earliest
+
+    def lst(self, line, vector):
+        """Returns the largest time in the waveform of ``line`` for simulation ``vector``.
+
+        Entries at or below ``TMIN`` are ignored and the scan stops at the first entry at or above
+        ``TMAX``. If no time qualifies, ``TMIN`` is returned.
+        """
+        latest = TMIN
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX:
+                break
+            if not (stamp <= TMIN):
+                latest = max(latest, stamp)
+        return latest
+
+    def lst_ppo(self, o, vector):
+        """Returns column 5 of the capture data for output ``o`` and simulation ``vector``.
+
+        If the capture data is marked as outdated, :py:func:`~WaveSim.capture` is called with its
+        default arguments first.
+        """
+        capture_outdated = not self.lst_eat_valid
+        if capture_outdated:
+            self.capture()
+        lst_column = 5
+        return self.cdata[o, vector, lst_column]
+
+    def toggles(self, line, vector):
+        """Counts the entries in the waveform of ``line`` for simulation ``vector``.
+
+        Entries at or below ``TMIN`` are not counted and the scan stops at the first entry at or
+        above ``TMAX``.
+        """
+        count = 0
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX:
+                break
+            if not (stamp <= TMIN):
+                count += 1
+        return count
+
+    def _vals(self, idx, vector, times, sd=0.0):
+        """Evaluates the waveform of line ``idx`` for simulation ``vector`` at several capture ``times``.
+
+        :param idx: Index into the state allocation table.
+        :param vector: Index of the simulation.
+        :param times: Sequence of capture times.
+        :param sd: Standard deviation of the capture time.
+        :return: If ``sd * sqrt(2)`` equals zero, a list with the logic value (0 or 1) at each time.
+            Otherwise a list of accumulated erf-based values, one per time.
+        """
+        s_sqrt2 = sd * math.sqrt(2)
+        with_variation = s_sqrt2 > 0
+        sign = 0.5
+        n_times = len(times)
+        accs = [0.0] * n_times
+        values = [0] * n_times
+        for t in self.wave(idx, vector):
+            if t >= TMAX:
+                break
+            # every entry earlier than a capture time flips the value seen at that time
+            for k, time in enumerate(times):
+                if t < time:
+                    values[k] ^= 1
+            sign = -sign
+            if t <= TMIN:
+                continue
+            if with_variation:
+                for k, time in enumerate(times):
+                    accs[k] += sign * (1 + math.erf((t - time) / s_sqrt2))
+        if with_variation and sign < 0:
+            for k in range(n_times):
+                accs[k] += 1
+        if s_sqrt2 == 0:
+            return values
+        return accs
+
+    def vals(self, line, vector, times, sd=0):
+        """Public wrapper of ``_vals``: evaluates the waveform of ``line`` at each of ``times``."""
+        results = self._vals(line, vector, times, sd)
+        return results
+
+    def val(self, line, vector, time=TMAX, sd=0):
+        """Returns the first element of ``capture_wave`` for ``line`` and ``vector`` at the given ``time``."""
+        captured = self.capture_wave(line, vector, time, sd)
+        return captured[0]
+
+    def vals_ppo(self, o, vector, times, sd=0):
+        """Evaluates the waveform at the ``o``-th (pseudo) primary output at each of ``times``."""
+        ppo_line = self.ppo_offset + o
+        return self._vals(ppo_line, vector, times, sd)
+
+    def val_ppo(self, o, vector, time=TMAX, sd=0):
+        """Returns column 0 of the capture data for output ``o`` and simulation ``vector``.
+
+        ``time`` and ``sd`` are only used if the capture data is marked as outdated; in that case
+        :py:func:`~WaveSim.capture` is called with them first.
+        """
+        capture_outdated = not self.lst_eat_valid
+        if capture_outdated:
+            self.capture(time, sd)
+        return self.cdata[o, vector, 0]
+
+    def capture_wave(self, line, vector, time=TMAX, sd=0.0, seed=1):
+        """Analyzes a single waveform at and around the capture ``time``.
+
+        This is the CPU counterpart of ``capture_kernel``.
+
+        :param line: Index into the state allocation table of the waveform to analyze.
+        :param vector: Index of the simulation.
+        :param time: The capture time.
+        :param sd: Standard deviation of the capture time. Random sampling is only active, if positive.
+        :param seed: Seed used when the captured value is decided by random sampling.
+        :return: The tuple ``(acc, val, final, val != final, eat, lst, ovl)``. The meaning of the seven
+            values is described in :py:func:`~WaveSim.capture`.
+        """
+        s_sqrt2 = sd * math.sqrt(2)
+        m = 0.5  # weight of the erf terms; its sign alternates with every waveform entry
+        acc = 0.0
+        eat = TMAX
+        lst = TMIN
+        tog = 0  # number of entries above TMIN; counted but not part of the result
+        ovl = 0
+        val = int(0)
+        final = int(0)
+        for t in self.wave(line, vector):
+            if t >= TMAX:
+                # end of waveform; the TMAX_OVL terminator additionally flags an overflow
+                if t == TMAX_OVL:
+                    ovl = 1
+                break
+            m = -m
+            final ^= 1  # every entry flips the final value
+            if t < time:
+                val ^= 1  # only entries before the capture time flip the captured value
+            if t <= TMIN: continue  # a leading TMIN entry affects the values only, not acc/eat/lst/tog
+            if s_sqrt2 > 0:
+                acc += m * (1 + math.erf((t - time) / s_sqrt2))
+            eat = min(eat, t)
+            lst = max(lst, t)
+            tog += 1
+        if s_sqrt2 > 0:
+            if m < 0:  # odd number of entries in the waveform
+                acc += 1
+            if acc >= 0.99:
+                val = 1
+            elif acc > 0.01:
+                # undecided: sample the value from a pseudo-random number derived from seed, vector and line
+                seed = (seed << 4) + (vector << 20) + (line-self.ppo_offset << 1)
+                seed = int(0xDEECE66D) * seed + 0xB
+                seed = int(0xDEECE66D) * seed + 0xB
+                rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
+                val = rnd < acc
+            else:
+                val = 0
+        else:
+            # no capture time variation: acc is the captured value itself
+            acc = val
+
+        return acc, val, final, (val != final), eat, lst, ovl
+
+
+@numba.njit
+def level_eval(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sdata, sd, seed):
+    """Evaluates the ops ``op_start`` (inclusive) to ``op_stop`` (exclusive) for a range of simulations.
+
+    Calls ``wave_eval`` for every op in the range and every simulation index from ``st_start``
+    (inclusive) to ``st_stop`` (exclusive), passing row ``sdata[st_idx]`` for the respective simulation.
+
+    :return: The sum of the overflow counts returned by all ``wave_eval`` calls.
+    """
+    overflows = 0
+    for op_idx in range(op_start, op_stop):
+        op = ops[op_idx]
+        for st_idx in range(st_start, st_stop):
+            overflows += wave_eval(op, state, sat, st_idx, line_times, sdata[st_idx], sd, seed)
+    return overflows
+
+
+@numba.njit
+def rand_gauss(seed, sd):
+    """Returns a pseudo-random factor around 1.0 derived from ``seed``.
+
+    If ``sd`` is not positive, exactly 1.0 is returned. Otherwise, twelve pseudo-random numbers
+    from [0, 1) are summed, shifted by -6 and scaled by ``sd``. This is repeated (continuing the
+    same random sequence) until the magnitude of the result is at most 0.5. The return value
+    therefore lies in [0.5, 1.5].
+    """
+    clamp = 0.5  # maximum accepted deviation from 1.0
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for _ in range(12):
+            # advance the multiply-add generator and use 24 of its bits as a fraction in [0, 1)
+            seed = int(0xDEECE66D) * seed + 0xB
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@numba.njit
+def wave_eval(op, state, sat, st_idx, line_times, sdata, sd=0.0, seed=0):
+    """Computes the output waveform of one op for one simulation.
+
+    :param op: Four values ``(lut, z_idx, a_idx, b_idx)``: a look-up table whose bit number
+        ``a + 2*b`` is the output value for the input values ``a`` and ``b``, the index of the output
+        line and the indices of the two input lines.
+    :param state: Waveform memory, indexed by ``[memory location, simulation]``.
+    :param sat: State allocation table. Column 0 is the memory location of a line's waveform,
+        column 1 its capacity.
+    :param st_idx: Index of the simulation.
+    :param line_times: ``line_times[line, 0, polarity]`` is used as delay and
+        ``line_times[line, 1, polarity]`` as pulse width threshold.
+    :param sdata: Parameters of this simulation. ``sdata[0]`` scales all delays and thresholds. If
+        ``sdata[1]`` equals an input line index, ``sdata[2]`` or ``sdata[3]`` (selected by polarity) is
+        added to the delays and thresholds of that input.
+        NOTE(review): the intended meaning of these parameters is defined outside this function.
+    :param sd: Standard deviation passed to ``rand_gauss`` for random delay variation.
+    :param seed: Random seed; combined with output line and simulation index.
+    :return: The number of toggles that did not fit into the output waveform.
+    """
+    lut, z_idx, a_idx, b_idx = op
+    overflows = int(0)
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_mem = sat[a_idx, 0]
+    b_mem = sat[b_idx, 0]
+    z_mem, z_cap, _ = sat[z_idx]
+
+    # read cursors into the input waveforms and write cursor into the output waveform;
+    # the lowest bit of z_cur is also used as the current output value
+    a_cur = int(0)
+    b_cur = int(0)
+    z_cur = lut & 1
+    if z_cur == 1:
+        # output is 1 while both inputs are 0: the output waveform starts with a TMIN entry
+        state[z_mem, st_idx] = TMIN
+
+    # time of the first entry of each input, shifted by the delay of the respective line
+    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) * sdata[0]
+    if int(sdata[1]) == a_idx: a += sdata[2+z_cur]
+    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) * sdata[0]
+    if int(sdata[1]) == b_idx: b += sdata[2+z_cur]
+    
+    previous_t = TMIN
+
+    current_t = min(a, b)
+    inputs = int(0)  # current input values: bit 0 is input a, bit 1 is input b
+
+    while current_t < TMAX:
+        z_val = z_cur & 1
+        # consume the earlier of the two pending input entries: flip that input and fetch its
+        # next entry with the delay for the opposite output polarity
+        if b < a:
+            b_cur += 1
+            b = state[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) * sdata[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) * sdata[0]
+            if int(sdata[1]) == b_idx:
+                b += sdata[2+(z_val^1)]
+                thresh += sdata[2+z_val]
+            inputs ^= 2
+            next_t = b
+        else:
+            a_cur += 1
+            a = state[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) * sdata[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) * sdata[0]
+            if int(sdata[1]) == a_idx:
+                a += sdata[2+(z_val^1)]
+                thresh += sdata[2+z_val]
+            inputs ^= 1
+            next_t = a
+
+        # only act if the look-up table yields an output value different from the current one
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):
+                    state[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    # no space left: count the overflow and step the write cursor back by one
+                    # (previous_t is read before z_cur is decremented)
+                    overflows += 1
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                # pulse too short: step the write cursor back by one instead of adding a toggle
+                z_cur -= 1
+                if z_cur > 0:
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                else:
+                    previous_t = TMIN
+        current_t = min(a, b)
+
+    # terminate the output waveform
+    if overflows > 0:
+        state[z_mem + z_cur, st_idx] = TMAX_OVL
+    else:
+        state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
+
+    return overflows
+
+
+
+class WaveSimCuda(WaveSim):
+    """A GPU-accelerated waveform-based combinational logic timing simulator.
+
+    The API is the same as for :py:class:`WaveSim`.
+    All internal memories are mirrored into GPU memory upon construction.
+    Some operations like access to single waveforms can involve large communication overheads.
+    """
+    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
+        """Builds the simulation model like :py:class:`WaveSim` and copies its arrays to the device."""
+        super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
+
+        # bit-packed input data: three bytes-planes per interface node, one bit per simulation
+        self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
+
+        # device copies of the host arrays
+        self.d_state = cuda.to_device(self.state)
+        self.d_sat = cuda.to_device(self.sat)
+        self.d_ops = cuda.to_device(self.ops)
+        self.d_timing = cuda.to_device(self.timing)
+        self.d_tdata = cuda.to_device(self.tdata)
+        self.d_cdata = cuda.to_device(self.cdata)
+        self.d_sdata = cuda.to_device(self.sdata)
+
+        # threads per block; x runs over simulations, y over interface nodes or ops
+        self._block_dim = (32, 16)
+
+    def __repr__(self):
+        """Returns a one-line summary with circuit name, number of simulations, ops, levels and memory use."""
+        total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.timing.nbytes + \
+                    self.tdata.nbytes + self.cdata.nbytes
+        return f'<WaveSimCuda {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
+
+    def get_line_delay(self, line, polarity):
+        """Returns the delay of the given ``line`` and ``polarity`` from the device copy of the timing table."""
+        return self.d_timing[line, 0, polarity]
+
+    def set_line_delay(self, line, polarity, delay):
+        """Sets a new ``delay`` for ``line`` and ``polarity`` in the device copy of the timing table.
+
+        The host array ``self.timing`` is not modified.
+        """
+        self.d_timing[line, 0, polarity] = delay
+
+    def sdata_to_device(self):
+        """Copies the host array ``self.sdata`` into its existing device copy."""
+        cuda.to_device(self.sdata, to=self.d_sdata)
+
+    def assign(self, vectors, time=0.0, offset=0):
+        """Assigns new values to the primary inputs and state-elements.
+
+        The packed bytes of ``vectors`` are copied to the device and converted to waveforms by
+        ``assign_kernel``.
+
+        :param vectors: The values to assign (bit-packed, see :py:func:`WaveSim.assign`).
+        :param time: The transition time of the generated waveforms.
+        :param offset: The offset into the vector set. Must be a multiple of 8.
+        """
+        assert (offset % 8) == 0
+        byte_offset = offset // 8
+        assert byte_offset < vectors.data.shape[-1]
+        pdim = min(vectors.data.shape[-1] - byte_offset, self.tdata.shape[-1])
+
+        self.tdata[..., 0:pdim] = vectors.data[..., byte_offset:pdim + byte_offset]
+        if vectors.m == 2:
+            # two-valued logic: clear the third plane so that the kernel generates no transitions
+            self.tdata[:, 2, 0:pdim] = 0
+        cuda.to_device(self.tdata, to=self.d_tdata)
+
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        assign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset,
+                                                 len(self.interface), self.d_tdata, time)
+
+    def _grid_dim(self, x, y):
+        """Returns the number of blocks needed to cover ``x`` by ``y`` threads with ``self._block_dim``."""
+        gx = math.ceil(x / self._block_dim[0])
+        gy = math.ceil(y / self._block_dim[1])
+        return gx, gy
+
+    def propagate(self, sims=None, sd=0.0, seed=1):
+        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
+
+        One ``wave_kernel`` launch is issued per level.
+
+        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
+        :param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
+        :param seed: Random seed for delay variations.
+        """
+        sims = min(sims or self.sims, self.sims)
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            grid_dim = self._grid_dim(sims, op_stop - op_start)
+            # Pass the device copy of the allocation table. A host array would be transferred
+            # to the device and back again on every single kernel launch.
+            wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.d_sat, int(0),
+                                                   sims, self.d_timing, self.d_sdata, sd, seed)
+        cuda.synchronize()
+        self.lst_eat_valid = False
+
+    def wave(self, line, vector):
+        """Returns the waveform of ``line`` for simulation ``vector`` as a slice of the device state.
+
+        Returns ``[TMAX]`` if ``line`` is negative or has no memory allocated.
+        """
+        if line < 0:
+            return [TMAX]
+        mem, wcap, _ = self.sat[line]
+        if mem < 0:
+            return [TMAX]
+        return self.d_state[mem:mem + wcap, vector]
+
+    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
+        """Simulates a capture operation at all state-elements and primary outputs.
+
+        Runs ``capture_kernel`` and copies the result from the device into ``self.cdata``.
+        Parameters and return value are the same as for :py:func:`WaveSim.capture`.
+        """
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
+                                                  self.d_cdata, time, sd * math.sqrt(2), seed)
+        self.cdata[...] = self.d_cdata
+        if cdata is not None:
+            assert offset < cdata.shape[1]
+            cap_dim = min(cdata.shape[1] - offset, self.sims)
+            cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
+        self.lst_eat_valid = True
+        return self.cdata
+
+    def reassign(self, time=0.0):
+        """Re-assigns the last capture to the appropriate pseudo-primary inputs on the device.
+
+        :param time: The transition time at the inputs (usually 0.0).
+        """
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset,
+                                                   self.d_cdata, time)
+        cuda.synchronize()
+
+    def wavecaps(self):
+        """Determines how many waveform entries are in use for each allocated line.
+
+        Runs ``wavecaps_kernel``, copies the state allocation table back from the device and
+        returns its column 2.
+        """
+        gx = math.ceil(len(self.circuit.lines) / 512)
+        wavecaps_kernel[gx, 512](self.d_state, self.d_sat, self.sims)
+        self.sat[...] = self.d_sat
+        return self.sat[..., 2]
+
+
+@cuda.jit()
+def wavecaps_kernel(state, sat, sims):
+    """Stores in ``sat[idx, 2]`` the number of waveform entries in use for each allocated row of ``sat``.
+
+    One thread handles one row of ``sat``. For every simulation, the waveform is scanned up to
+    and including its first entry at or above ``TMAX`` (or up to its capacity). The largest
+    index reached over all simulations, plus one, is stored.
+    """
+    idx = cuda.grid(1)
+    if idx >= len(sat): return
+
+    lidx, lcap, _ = sat[idx]
+    if lidx < 0: return  # no memory allocated for this row
+
+    wcap = 0
+    for sidx in range(sims):
+        for tidx in range(lcap):
+            t = state[lidx + tidx, sidx]
+            if tidx > wcap:
+                wcap = tidx
+            if t >= TMAX: break
+
+    sat[idx, 2] = wcap + 1
+
+
+@cuda.jit()
+def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
+    """Generates new PPI waveforms from the previous PPI values and the captured values.
+
+    One thread handles one simulation (x) at one interface position (y). Positions without
+    PPO or PPI memory are skipped. The new waveform starts with the previous final value of the
+    PPI and toggles at ``ppi_time``, if that value differs from ``cdata[y, vector, 1]``.
+    """
+    vector, y = cuda.grid(2)
+    if vector >= state.shape[-1]: return
+    if ppo_offset + y >= len(sat): return
+
+    ppo, _, _ = sat[ppo_offset + y]
+    ppi, ppi_cap, _ = sat[ppi_offset + y]
+    if ppo < 0: return
+    if ppi < 0: return
+
+    ppo_val = int(cdata[y, vector, 1])
+    # the previous final value of the PPI is the parity of the number of entries in its waveform
+    ppi_val = int(0)
+    for tidx in range(ppi_cap):
+        t = state[ppi + tidx, vector]
+        if t >= TMAX: break
+        ppi_val ^= 1
+
+    # make new waveform at PPI
+    toggle = 0
+    if ppi_val:
+        state[ppi + toggle, vector] = TMIN
+        toggle += 1
+    if ppi_val != ppo_val:
+        state[ppi + toggle, vector] = ppi_time
+        toggle += 1
+    state[ppi + toggle, vector] = TMAX
+
+
+@cuda.jit()
+def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed):
+    """Analyzes the PPO waveforms at and around the capture ``time`` and fills ``cdata``.
+
+    One thread handles one simulation (x) at one interface position (y). Positions without
+    allocated PPO memory are skipped. ``s_sqrt2`` is the standard deviation of the capture time
+    multiplied by sqrt(2); random sampling is only active, if it is positive.
+
+    The seven values written to ``cdata[y, vector]`` are, in this order: ``acc``, ``val``,
+    ``final``, ``val != final``, ``eat``, ``lst`` and ``ovl``.
+    """
+    x, y = cuda.grid(2)
+    if ppo_offset + y >= len(sat): return
+    line, tdim, _ = sat[ppo_offset + y]
+    if line < 0: return
+    if x >= state.shape[-1]: return
+    vector = x
+    m = 0.5  # weight of the erf terms; its sign alternates with every waveform entry
+    acc = 0.0
+    eat = TMAX
+    lst = TMIN
+    tog = 0  # number of entries above TMIN; counted but not written to cdata
+    ovl = 0
+    val = int(0)
+    final = int(0)
+    for tidx in range(tdim):
+        t = state[line + tidx, vector]
+        if t >= TMAX:
+            # end of waveform; the TMAX_OVL terminator additionally flags an overflow
+            if t == TMAX_OVL:
+                ovl = 1
+            break
+        m = -m
+        final ^= 1  # every entry flips the final value
+        if t < time:
+            val ^= 1  # only entries before the capture time flip the captured value
+        if t <= TMIN: continue  # a leading TMIN entry affects the values only, not acc/eat/lst/tog
+        if s_sqrt2 > 0:
+            acc += m * (1 + math.erf((t - time) / s_sqrt2))
+        eat = min(eat, t)
+        lst = max(lst, t)
+        tog += 1
+    if s_sqrt2 > 0:
+        if m < 0:  # odd number of entries in the waveform
+            acc += 1
+        if acc >= 0.99:
+            val = 1
+        elif acc > 0.01:
+            # undecided: sample the value from a pseudo-random number derived from seed, vector and y
+            seed = (seed << 4) + (vector << 20) + (y << 1)
+            seed = int(0xDEECE66D) * seed + 0xB
+            seed = int(0xDEECE66D) * seed + 0xB
+            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
+            val = rnd < acc
+        else:
+            val = 0
+    else:
+        # no capture time variation: acc is the captured value itself
+        acc = val
+
+    cdata[y, vector, 0] = acc
+    cdata[y, vector, 1] = val
+    cdata[y, vector, 2] = final
+    cdata[y, vector, 3] = (val != final)
+    cdata[y, vector, 4] = eat
+    cdata[y, vector, 5] = lst
+    cdata[y, vector, 6] = ovl
+
+
+@cuda.jit()
+def assign_kernel(state, sat, ppi_offset, intf_len, tdata, time):
+    """Converts the bit-packed input data in ``tdata`` to PPI waveforms.
+
+    One thread handles one simulation (x) at one interface position (y). Positions without
+    allocated PPI memory are skipped. The generated waveform starts with ``TMIN``, if the bit in
+    plane 1 is set, and contains a toggle at ``time``, if the bit in plane 2 is set and the
+    bits in planes 0 and 1 differ.
+    """
+    x, y = cuda.grid(2)
+    if y >= intf_len: return
+    line = sat[ppi_offset + y, 0]
+    if line < 0: return
+    sdim = state.shape[-1]
+    if x >= sdim: return
+    vector = x
+    # each byte holds the bits of eight consecutive vectors
+    a0 = tdata[y, 0, vector // 8]
+    a1 = tdata[y, 1, vector // 8]
+    a2 = tdata[y, 2, vector // 8]
+    m = np.uint8(1 << (7 - (vector % 8)))  # bit mask of this vector; the first vector is the highest bit
+    toggle = 0
+    if a1 & m:
+        state[line + toggle, x] = TMIN
+        toggle += 1
+    if (a2 & m) and ((a0 & m) != (a1 & m)):
+        state[line + toggle, x] = time
+        toggle += 1
+    state[line + toggle, x] = TMAX
+
+
+@cuda.jit(device=True)
+def rand_gauss_dev(seed, sd):
+    """Device version of ``rand_gauss``: returns a pseudo-random factor around 1.0 derived from ``seed``.
+
+    If ``sd`` is not positive, exactly 1.0 is returned. Otherwise, twelve pseudo-random numbers
+    from [0, 1) are summed, shifted by -6 and scaled by ``sd``. This is repeated (continuing the
+    same random sequence) until the magnitude of the result is at most 0.5. The return value
+    therefore lies in [0.5, 1.5].
+    """
+    clamp = 0.5  # maximum accepted deviation from 1.0
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for _ in range(12):
+            # advance the multiply-add generator and use 24 of its bits as a fraction in [0, 1)
+            seed = int(0xDEECE66D) * seed + 0xB
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@cuda.jit()
+def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sdata, sd, seed):
+    """Computes the output waveforms of the ops ``op_start`` (inclusive) to ``op_stop`` (exclusive).
+
+    One thread handles one simulation (x, offset by ``st_start``) and one op (y, offset by
+    ``op_start``). The computation is the same as in ``wave_eval``. Overflows are counted per
+    thread, but only used to select the ``TMAX_OVL`` terminator; no count is returned.
+
+    :param ops: Array of ops; the columns are look-up table, output line index and the two input line indices.
+    :param state: Waveform memory, indexed by ``[memory location, simulation]``.
+    :param sat: State allocation table. Column 0 is the memory location of a line's waveform,
+        column 1 its capacity.
+    :param line_times: ``line_times[line, 0, polarity]`` is used as delay and
+        ``line_times[line, 1, polarity]`` as pulse width threshold.
+    :param sdata: Per-simulation parameters; row ``st_idx`` is used by this thread.
+        NOTE(review): the intended meaning of these parameters is defined outside this function.
+    :param sd: Standard deviation passed to ``rand_gauss_dev`` for random delay variation.
+    :param seed: Random seed; combined with output line and simulation index.
+    """
+    x, y = cuda.grid(2)
+    st_idx = st_start + x
+    op_idx = op_start + y
+    if st_idx >= st_stop: return
+    if op_idx >= op_stop: return
+    lut = ops[op_idx, 0]
+    z_idx = ops[op_idx, 1]
+    a_idx = ops[op_idx, 2]
+    b_idx = ops[op_idx, 3]
+    overflows = int(0)
+    sdata = sdata[st_idx]
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_mem = sat[a_idx, 0]
+    b_mem = sat[b_idx, 0]
+    z_mem, z_cap, _ = sat[z_idx]
+
+    # read cursors into the input waveforms and write cursor into the output waveform;
+    # the lowest bit of z_cur is also used as the current output value
+    a_cur = int(0)
+    b_cur = int(0)
+    z_cur = lut & 1
+    if z_cur == 1:
+        # output is 1 while both inputs are 0: the output waveform starts with a TMIN entry
+        state[z_mem, st_idx] = TMIN
+
+    # time of the first entry of each input, shifted by the delay of the respective line
+    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_dev(_seed ^ a_mem ^ z_cur, sd) * sdata[0]
+    if int(sdata[1]) == a_idx: a += sdata[2+z_cur]
+    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_dev(_seed ^ b_mem ^ z_cur, sd) * sdata[0]
+    if int(sdata[1]) == b_idx: b += sdata[2+z_cur]
+    
+    previous_t = TMIN
+
+    current_t = min(a, b)
+    inputs = int(0)  # current input values: bit 0 is input a, bit 1 is input b
+
+    while current_t < TMAX:
+        z_val = z_cur & 1
+        # consume the earlier of the two pending input entries: flip that input and fetch its
+        # next entry with the delay for the opposite output polarity
+        if b < a:
+            b_cur += 1
+            b = state[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ b_mem ^ z_val ^ 1, sd) * sdata[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss_dev(_seed ^ b_mem ^ z_val, sd) * sdata[0]
+            if int(sdata[1]) == b_idx:
+                b += sdata[2+(z_val^1)]
+                thresh += sdata[2+z_val]
+            inputs ^= 2
+            next_t = b
+        else:
+            a_cur += 1
+            a = state[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ a_mem ^ z_val ^ 1, sd) * sdata[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss_dev(_seed ^ a_mem ^ z_val, sd) * sdata[0]
+            if int(sdata[1]) == a_idx:
+                a += sdata[2+(z_val^1)]
+                thresh += sdata[2+z_val]
+            inputs ^= 1
+            next_t = a
+
+        # only act if the look-up table yields an output value different from the current one
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):
+                    state[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    # no space left: count the overflow and step the write cursor back by one
+                    # (previous_t is read before z_cur is decremented)
+                    overflows += 1
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                # pulse too short: step the write cursor back by one instead of adding a toggle
+                z_cur -= 1
+                if z_cur > 0:
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                else:
+                    previous_t = TMIN
+        current_t = min(a, b)
+
+    # terminate the output waveform
+    if overflows > 0:
+        state[z_mem + z_cur, st_idx] = TMAX_OVL
+    else:
+        state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -1,118 +1,148 @@
 import numpy as np

-from kyupy.wave_sim import WaveSim, WaveSimCuda, wave_eval, TMIN, TMAX
+from kyupy.wave_sim import WaveSim, WaveSimCuda, wave_eval_cpu, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
-from kyupy import verilog, sdf, logic
+from kyupy import verilog, sdf, logic, bench
 from kyupy.logic import MVArray, BPArray
+from kyupy.sim import SimPrim


-def test_wave_eval():
+def test_nand_delays():
+    op = (SimPrim.NAND4, 4, 0, 1, 2, 3)
+    #op = (0b0111, 4, 0, 1)
+    c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
+    vat = np.zeros((5, 3), dtype='int')
+    for i in range(5): vat[i] = i*16, 16, 0  # 1:1 mapping
+
    # SDF specifies IOPATH delays with respect to output polarity
    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
-    line_times = np.zeros((3, 2, 2))
+    line_times = np.zeros((5, 2, 2))
    line_times[0, 0, 0] = 0.1  # A -> Z rise delay
    line_times[0, 0, 1] = 0.2  # A -> Z fall delay
    line_times[0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
    line_times[0, 1, 1] = 0.2  # A -> Z positive pulse limit
-    line_times[1, 0, 0] = 0.3  # as above for B -> Z
-    line_times[1, 0, 1] = 0.4
-    line_times[1, 1, 0] = 0.3
-    line_times[1, 1, 1] = 0.4
-
-    state = np.zeros((3*16, 1)) + TMAX  # 3 waveforms of capacity 16
-    state[::16, 0] = 16  # first entry is capacity
-    a = state[0:16, 0]
-    b = state[16:32, 0]
-    z = state[32:, 0]
-    sat = np.zeros((3, 3), dtype='int')
-    sat[0] = 0, 16, 0
-    sat[1] = 16, 16, 0
-    sat[2] = 32, 16, 0
+    line_times[1, :, 0] = 0.3  # as above for B -> Z
+    line_times[1, :, 1] = 0.4
+    line_times[2, :, 0] = 0.5  # as above for C -> Z
+    line_times[2, :, 1] = 0.6
+    line_times[3, :, 0] = 0.7  # as above for D -> Z
+    line_times[3, :, 1] = 0.8
    
    sdata = np.asarray([1, -1, 0, 0], dtype='float32')

-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMIN
-
-    a[0] = TMIN
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMIN
-
-    b[0] = TMIN
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMAX
-
-    a[0] = 1  # A _/^^^
-    b[0] = 2  # B __/^^
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMIN  # ^^^\___ B -> Z fall delay
-    assert z[1] == 2.4
-    assert z[2] == TMAX
-
-    a[0] = TMIN  # A ^^^^^^
-    b[0] = TMIN  # B ^^^\__
-    b[1] = 2
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == 2.3  # ___/^^^ B -> Z rise delay
-    assert z[1] == TMAX
-
-    # pos pulse of 0.35 at B -> 0.45 after delays
-    a[0] = TMIN  # A ^^^^^^^^
-    b[0] = TMIN
-    b[1] = 2     # B ^^\__/^^
-    b[2] = 2.35
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == 2.3  # __/^^\__
-    assert z[1] == 2.75
-    assert z[2] == TMAX
-
-    # neg pulse of 0.45 at B -> 0.35 after delays
-    a[0] = TMIN  # A ^^^^^^^^
-    b[0] = 2  # B __/^^\__
-    b[1] = 2.45
-    b[2] = TMAX
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMIN  # ^^\__/^^
-    assert z[1] == 2.4
-    assert z[2] == 2.75
-    assert z[3] == TMAX
-
-    # neg pulse of 0.35 at B -> 0.25 after delays (filtered)
-    a[0] = TMIN  # A ^^^^^^^^
-    b[0] = 2  # B __/^^\__
-    b[1] = 2.35
-    b[2] = TMAX
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMIN  # ^^^^^^
-    assert z[1] == TMAX
-
-    # pos pulse of 0.25 at B -> 0.35 after delays (filtered)
-    a[0] = TMIN  # A ^^^^^^^^
-    b[0] = TMIN
-    b[1] = 2  # B ^^\__/^^
-    b[2] = 2.25
-    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
-    assert z[0] == TMAX  # ______
-
-
-def compare_to_logic_sim(wsim):
-    tests = MVArray((len(wsim.interface), wsim.sims))
+    def wave_assert(inputs, output):
+        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
+        wave_eval_cpu(op, c, vat, 0, line_times, sdata)
+        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
+
+    wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
+    wave_assert([[TMIN,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(1,0,1,1) => 1
+    wave_assert([[TMIN,TMAX],[TMIN,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMAX])      # NAND(1,1,1,1) => 0
+
+    # Keep inputs C=1 and D=1.
+    wave_assert([[1,TMAX],[2,TMAX]], [TMIN,2.4,TMAX])              # _/⎺⎺⎺ NAND __/⎺⎺ => ⎺⎺⎺\___ (B->Z fall delay)
+    wave_assert([[TMIN,TMAX],[TMIN,2,TMAX]],  [2.3,TMAX])          # ⎺⎺⎺⎺⎺ NAND ⎺⎺\__ => ___/⎺⎺⎺ (B->Z rise delay)
+    wave_assert([[TMIN,TMAX],[TMIN,2,2.35,TMAX]], [2.3,2.75,TMAX]) # ⎺⎺⎺⎺⎺ NAND ⎺\_/⎺ => __/⎺⎺\_ (pos pulse, .35@B -> .45@Z)
+    wave_assert([[TMIN,TMAX],[TMIN,2,2.25,TMAX]], [TMAX])          # ⎺⎺⎺⎺⎺ NAND ⎺\_/⎺ => _______ (pos pulse, .25@B -> .35@Z, filtered)
+    wave_assert([[TMIN,TMAX],[2,2.45,TMAX]], [TMIN,2.4,2.75,TMAX]) # ⎺⎺⎺⎺⎺ NAND _/⎺\_ => ⎺⎺\_/⎺⎺ (neg pulse, .45@B -> .35@Z)
+    wave_assert([[TMIN,TMAX],[2,2.35,TMAX]], [TMIN,TMAX])          # ⎺⎺⎺⎺⎺ NAND _/⎺\_ => ⎺⎺⎺⎺⎺⎺⎺ (neg pulse, .35@B -> .25@Z, filtered)
+
+
+def test_tiny_circuit():
+    c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
+    lt = np.zeros((len(c.lines), 2, 2))
+    lt[:,0,:] = 1.0  # unit delay for all lines
+    wsim = WaveSim(c, lt)
+    assert len(wsim.s) == 5
+    
+    # values for x
+    wsim.s[0,0,:3] = 0, 0.1, 0
+    wsim.s[0,1,:3] = 0, 0.2, 1
+    wsim.s[0,2,:3] = 1, 0.3, 0
+    wsim.s[0,3,:3] = 1, 0.4, 1
+
+    # values for y
+    wsim.s[1,0,:3] = 1, 0.5, 0
+    wsim.s[1,1,:3] = 1, 0.6, 0
+    wsim.s[1,2,:3] = 1, 0.7, 0
+    wsim.s[1,3,:3] = 0, 0.8, 1
+    
+    wsim.s_to_c()
+
+    x_c_loc = wsim.vat[wsim.ppi_offset+0, 0] # check x waveforms
+    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 0], [TMAX, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 1], [0.2, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 2], [TMIN, 0.3, TMAX])
+    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 3], [TMIN, TMAX, TMAX])
+
+    y_c_loc = wsim.vat[wsim.ppi_offset+1, 0] # check y waveforms
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 0], [TMIN, 0.5, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 1], [TMIN, 0.6, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 2], [TMIN, 0.7, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 3], [0.8, TMAX, TMAX])
+
+    wsim.c_prop()
+
+    a_c_loc = wsim.vat[wsim.ppo_offset+2, 0] # check a waveforms
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 0], [TMAX, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 1], [1.2, 1.6, TMAX])
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 2], [TMIN, 1.3, TMAX])
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 3], [1.8, TMAX, TMAX])
+
+    o_c_loc = wsim.vat[wsim.ppo_offset+3, 0] # check o waveforms
+    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 0], [TMIN, 1.5, TMAX])
+    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 1], [TMIN, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 2], [TMIN, 1.7, TMAX])
+    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 3], [TMIN, TMAX, TMAX])
+    
+    n_c_loc = wsim.vat[wsim.ppo_offset+4, 0] # check n waveforms
+    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 0], [TMIN, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 1], [TMIN, 1.2, TMAX])
+    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 2], [1.3, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 3], [TMAX, TMAX, TMAX])
+
+    wsim.c_to_s()
+
+    # check a captures
+    np.testing.assert_allclose(wsim.s[2, 0, 3:7], [0, TMAX, TMIN, 0])
+    np.testing.assert_allclose(wsim.s[2, 1, 3:7], [0, 1.2, 1.6, 0])
+    np.testing.assert_allclose(wsim.s[2, 2, 3:7], [1, 1.3, 1.3, 0])
+    np.testing.assert_allclose(wsim.s[2, 3, 3:7], [0, 1.8, 1.8, 1])
+
+    # check o captures
+    np.testing.assert_allclose(wsim.s[3, 0, 3:7], [1, 1.5, 1.5, 0])
+    np.testing.assert_allclose(wsim.s[3, 1, 3:7], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[3, 2, 3:7], [1, 1.7, 1.7, 0])
+    np.testing.assert_allclose(wsim.s[3, 3, 3:7], [1, TMAX, TMIN, 1])
+
+    # check n captures
+    np.testing.assert_allclose(wsim.s[4, 0, 3:7], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[4, 1, 3:7], [1, 1.2, 1.2, 0])
+    np.testing.assert_allclose(wsim.s[4, 2, 3:7], [0, 1.3, 1.3, 1])
+    np.testing.assert_allclose(wsim.s[4, 3, 3:7], [0, TMAX, TMIN, 0])
+
+
+def compare_to_logic_sim(wsim: WaveSim):
+    tests = MVArray((len(wsim.s_nodes), wsim.sims))
    choices = np.asarray([logic.ZERO, logic.ONE, logic.RISE, logic.FALL], dtype=np.uint8)
    rng = np.random.default_rng(10)
    tests.data[...] = rng.choice(choices, tests.data.shape)
-    tests_bp = BPArray(tests)
-    wsim.assign(tests_bp)
-    wsim.propagate()
-    cdata = wsim.capture()

-    resp = MVArray(tests)
+    wsim.s[:, :, 0] = (tests.data & 2) >> 1
+    wsim.s[:, :, 3] = (tests.data & 2) >> 1
+    wsim.s[:, :, 1] = 0.0
+    wsim.s[:, :, 2] = tests.data & 1
+    wsim.s[:, :, 6] = tests.data & 1
    
-    for iidx, inode in enumerate(wsim.interface):
-        if len(inode.ins) > 0:
-            for vidx in range(wsim.sims):
-                resp.data[iidx, vidx] = logic.ZERO if cdata[iidx, vidx, 0] < 0.5 else logic.ONE
-                # resp.set_value(vidx, iidx, 0 if cdata[iidx, vidx, 0] < 0.5 else 1)
+    wsim.s_to_c()
+    wsim.c_prop()
+    wsim.c_to_s()

+    resp = MVArray(tests)
+    resp.data[...] = np.array(wsim.s[:, :, 6], dtype=np.uint8) | (np.array(wsim.s[:, :, 3], dtype=np.uint8)<<1)
+    resp.data |= ((resp.data ^ (resp.data >> 1)) & 1) << 2  # transitions
+
+    tests_bp = BPArray(tests)    
    lsim = LogicSim(wsim.circuit, len(tests_bp))
    lsim.assign(tests_bp)
    lsim.propagate()
@@ -121,30 +151,18 @@ def compare_to_logic_sim(wsim):
    exp = MVArray(exp_bp)

    for i in range(8):
-        exp_str = exp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
-        res_str = resp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
+        exp_str = exp[i].replace('P', '0').replace('N', '1')
+        res_str = resp[i].replace('P', '0').replace('N', '1')
        assert res_str == exp_str


-def test_b14(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSim(c, lt, 8)
-    compare_to_logic_sim(wsim)
+def test_b14(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8))


-def test_b14_strip_forks(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSim(c, lt, 8, strip_forks=True)
-    compare_to_logic_sim(wsim)
+def test_b14_strip_forks(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8, strip_forks=True))


-def test_b14_cuda(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSimCuda(c, lt, 8)
-    compare_to_logic_sim(wsim)
+def test_b14_cuda(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_timing, 8, strip_forks=True))
--- a/tests/test_wave_sim4.py
+++ b/tests/test_wave_sim4.py
@@ -1,166 +0,0 @@
-import numpy as np
-
-from kyupy.wave_sim4 import WaveSim, WaveSimCuda, wave_eval_cpu, TMIN, TMAX
-from kyupy.logic_sim import LogicSim
-from kyupy import verilog, sdf, logic, bench
-from kyupy.logic import MVArray, BPArray
-from kyupy.sim import SimPrim
-
-
-def test_nand_delays():
-    op = (SimPrim.NAND4, 4, 0, 1, 2, 3)
-    #op = (0b0111, 4, 0, 1)
-    c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
-    vat = np.zeros((5, 3), dtype='int')
-    for i in range(5): vat[i] = i*16, 16, 0  # 1:1 mapping
-
-    # SDF specifies IOPATH delays with respect to output polarity
-    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
-    line_times = np.zeros((5, 2, 2))
-    line_times[0, 0, 0] = 0.1  # A -> Z rise delay
-    line_times[0, 0, 1] = 0.2  # A -> Z fall delay
-    line_times[0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
-    line_times[0, 1, 1] = 0.2  # A -> Z positive pulse limit
-    line_times[1, :, 0] = 0.3  # as above for B -> Z
-    line_times[1, :, 1] = 0.4
-    line_times[2, :, 0] = 0.5  # as above for C -> Z
-    line_times[2, :, 1] = 0.6
-    line_times[3, :, 0] = 0.7  # as above for D -> Z
-    line_times[3, :, 1] = 0.8
-    
-    sdata = np.asarray([1, -1, 0, 0], dtype='float32')
-
-    def wave_assert(inputs, output):
-        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval_cpu(op, c, vat, 0, line_times, sdata)
-        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
-
-    wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
-    wave_assert([[TMIN,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(1,0,1,1) => 1
-    wave_assert([[TMIN,TMAX],[TMIN,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMAX])      # NAND(1,1,1,1) => 0
-
-    # Keep inputs C=1 and D=1.
-    wave_assert([[1,TMAX],[2,TMAX]], [TMIN,2.4,TMAX])              # _/⎺⎺⎺ NAND __/⎺⎺ => ⎺⎺⎺\___ (B->Z fall delay)
-    wave_assert([[TMIN,TMAX],[TMIN,2,TMAX]],  [2.3,TMAX])          # ⎺⎺⎺⎺⎺ NAND ⎺⎺\__ => ___/⎺⎺⎺ (B->Z rise delay)
-    wave_assert([[TMIN,TMAX],[TMIN,2,2.35,TMAX]], [2.3,2.75,TMAX]) # ⎺⎺⎺⎺⎺ NAND ⎺\_/⎺ => __/⎺⎺\_ (pos pulse, .35@B -> .45@Z)
-    wave_assert([[TMIN,TMAX],[TMIN,2,2.25,TMAX]], [TMAX])          # ⎺⎺⎺⎺⎺ NAND ⎺\_/⎺ => _______ (pos pulse, .25@B -> .35@Z, filtered)
-    wave_assert([[TMIN,TMAX],[2,2.45,TMAX]], [TMIN,2.4,2.75,TMAX]) # ⎺⎺⎺⎺⎺ NAND _/⎺\_ => ⎺⎺\_/⎺⎺ (neg pulse, .45@B -> .35@Z)
-    wave_assert([[TMIN,TMAX],[2,2.35,TMAX]], [TMIN,TMAX])          # ⎺⎺⎺⎺⎺ NAND _/⎺\_ => ⎺⎺⎺⎺⎺⎺⎺ (neg pulse, .35@B -> .25@Z, filtered)
-
-
-def test_tiny_circuit():
-    c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
-    lt = np.zeros((len(c.lines), 2, 2))
-    lt[:,0,:] = 1.0  # unit delay for all lines
-    wsim = WaveSim(c, lt)
-    assert len(wsim.s) == 5
-    
-    # values for x
-    wsim.s[0,0,:3] = 0, 0.1, 0
-    wsim.s[0,1,:3] = 0, 0.2, 1
-    wsim.s[0,2,:3] = 1, 0.3, 0
-    wsim.s[0,3,:3] = 1, 0.4, 1
-
-    # values for y
-    wsim.s[1,0,:3] = 1, 0.5, 0
-    wsim.s[1,1,:3] = 1, 0.6, 0
-    wsim.s[1,2,:3] = 1, 0.7, 0
-    wsim.s[1,3,:3] = 0, 0.8, 1
-    
-    wsim.s_to_c()
-
-    x_c_loc = wsim.vat[wsim.ppi_offset+0, 0] # check x waveforms
-    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 0], [TMAX, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 1], [0.2, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 2], [TMIN, 0.3, TMAX])
-    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 3], [TMIN, TMAX, TMAX])
-
-    y_c_loc = wsim.vat[wsim.ppi_offset+1, 0] # check y waveforms
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 0], [TMIN, 0.5, TMAX])
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 1], [TMIN, 0.6, TMAX])
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 2], [TMIN, 0.7, TMAX])
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 3], [0.8, TMAX, TMAX])
-
-    wsim.c_prop()
-
-    a_c_loc = wsim.vat[wsim.ppo_offset+2, 0] # check a waveforms
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 0], [TMAX, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 1], [1.2, 1.6, TMAX])
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 2], [TMIN, 1.3, TMAX])
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 3], [1.8, TMAX, TMAX])
-
-    o_c_loc = wsim.vat[wsim.ppo_offset+3, 0] # check o waveforms
-    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 0], [TMIN, 1.5, TMAX])
-    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 1], [TMIN, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 2], [TMIN, 1.7, TMAX])
-    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 3], [TMIN, TMAX, TMAX])
-    
-    n_c_loc = wsim.vat[wsim.ppo_offset+4, 0] # check n waveforms
-    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 0], [TMIN, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 1], [TMIN, 1.2, TMAX])
-    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 2], [1.3, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 3], [TMAX, TMAX, TMAX])
-
-    wsim.c_to_s()
-
-    # check a captures
-    np.testing.assert_allclose(wsim.s[2, 0, 3:7], [0, TMAX, TMIN, 0])
-    np.testing.assert_allclose(wsim.s[2, 1, 3:7], [0, 1.2, 1.6, 0])
-    np.testing.assert_allclose(wsim.s[2, 2, 3:7], [1, 1.3, 1.3, 0])
-    np.testing.assert_allclose(wsim.s[2, 3, 3:7], [0, 1.8, 1.8, 1])
-
-    # check o captures
-    np.testing.assert_allclose(wsim.s[3, 0, 3:7], [1, 1.5, 1.5, 0])
-    np.testing.assert_allclose(wsim.s[3, 1, 3:7], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[3, 2, 3:7], [1, 1.7, 1.7, 0])
-    np.testing.assert_allclose(wsim.s[3, 3, 3:7], [1, TMAX, TMIN, 1])
-
-    # check o captures
-    np.testing.assert_allclose(wsim.s[4, 0, 3:7], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[4, 1, 3:7], [1, 1.2, 1.2, 0])
-    np.testing.assert_allclose(wsim.s[4, 2, 3:7], [0, 1.3, 1.3, 1])
-    np.testing.assert_allclose(wsim.s[4, 3, 3:7], [0, TMAX, TMIN, 0])
-
-
-def compare_to_logic_sim(wsim: WaveSim):
-    tests = MVArray((len(wsim.s_nodes), wsim.sims))
-    choices = np.asarray([logic.ZERO, logic.ONE, logic.RISE, logic.FALL], dtype=np.uint8)
-    rng = np.random.default_rng(10)
-    tests.data[...] = rng.choice(choices, tests.data.shape)
-
-    wsim.s[..., 0] = (tests.data & 2) >> 1
-    wsim.s[..., 3] = (tests.data & 2) >> 1
-    wsim.s[..., 1] = 0.0
-    wsim.s[..., 2] = tests.data & 1
-    wsim.s[..., 6] = tests.data & 1
-    
-    wsim.s_to_c()
-    wsim.c_prop()
-    wsim.c_to_s()
-
-    resp = MVArray(tests)
-    resp.data[...] = wsim.s[..., 6].astype(np.uint8) | (wsim.s[..., 3].astype(np.uint8)<<1)
-    resp.data |= ((resp.data ^ (resp.data >> 1)) & 1) << 2  # transitions
-
-    tests_bp = BPArray(tests)    
-    lsim = LogicSim(wsim.circuit, len(tests_bp))
-    lsim.assign(tests_bp)
-    lsim.propagate()
-    exp_bp = BPArray(tests_bp)
-    lsim.capture(exp_bp)
-    exp = MVArray(exp_bp)
-
-    for i in range(8):
-        exp_str = exp[i].replace('P', '0').replace('N', '1')
-        res_str = resp[i].replace('P', '0').replace('N', '1')
-        assert res_str == exp_str
-
-
-def test_b14(b14_circuit, b14_timing):
-    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8))
-
-def test_b14_strip_forks(b14_circuit, b14_timing):
-    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8, strip_forks=True))
-
-def test_b14_cuda(b14_circuit, b14_timing):
-    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_timing, 8, strip_forks=True))
--- a/tests/test_wave_sim_old.py
+++ b/tests/test_wave_sim_old.py
@@ -0,0 +1,138 @@
+import numpy as np
+
+from kyupy.wave_sim_old import WaveSim, WaveSimCuda, wave_eval, TMIN, TMAX
+from kyupy.logic_sim import LogicSim
+from kyupy import verilog, sdf, logic
+from kyupy.logic import MVArray, BPArray
+
+
+def test_wave_eval():
+    # SDF specifies IOPATH delays with respect to output polarity
+    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
+    line_times = np.zeros((3, 2, 2))
+    line_times[0, 0, 0] = 0.1  # A -> Z rise delay
+    line_times[0, 0, 1] = 0.2  # A -> Z fall delay
+    line_times[0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
+    line_times[0, 1, 1] = 0.2  # A -> Z positive pulse limit
+    line_times[1, 0, 0] = 0.3  # as above for B -> Z
+    line_times[1, 0, 1] = 0.4
+    line_times[1, 1, 0] = 0.3
+    line_times[1, 1, 1] = 0.4
+
+    state = np.zeros((3*16, 1)) + TMAX  # 3 waveforms of capacity 16
+    state[::16, 0] = 16  # first entry is capacity
+    a = state[0:16, 0]
+    b = state[16:32, 0]
+    z = state[32:, 0]
+    sat = np.zeros((3, 3), dtype='int')
+    sat[0] = 0, 16, 0
+    sat[1] = 16, 16, 0
+    sat[2] = 32, 16, 0
+
+    sdata = np.asarray([1, -1, 0, 0], dtype='float32')
+
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMIN
+
+    a[0] = TMIN
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMIN
+
+    b[0] = TMIN
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMAX
+
+    a[0] = 1  # A _/^^^
+    b[0] = 2  # B __/^^
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMIN  # ^^^\___ B -> Z fall delay
+    assert z[1] == 2.4
+    assert z[2] == TMAX
+
+    a[0] = TMIN  # A ^^^^^^
+    b[0] = TMIN  # B ^^^\__
+    b[1] = 2
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == 2.3  # ___/^^^ B -> Z rise delay
+    assert z[1] == TMAX
+
+    # pos pulse of 0.35 at B -> 0.45 after delays
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = TMIN
+    b[1] = 2     # B ^^\__/^^
+    b[2] = 2.35
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == 2.3  # __/^^\__
+    assert z[1] == 2.75
+    assert z[2] == TMAX
+
+    # neg pulse of 0.45 at B -> 0.35 after delays
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = 2  # B __/^^\__
+    b[1] = 2.45
+    b[2] = TMAX
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMIN  # ^^\__/^^
+    assert z[1] == 2.4
+    assert z[2] == 2.75
+    assert z[3] == TMAX
+
+    # neg pulse of 0.35 at B -> 0.25 after delays (filtered)
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = 2  # B __/^^\__
+    b[1] = 2.35
+    b[2] = TMAX
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMIN  # ^^^^^^
+    assert z[1] == TMAX
+
+    # pos pulse of 0.25 at B -> 0.35 after delays (filtered)
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = TMIN
+    b[1] = 2  # B ^^\__/^^
+    b[2] = 2.25
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times, sdata)
+    assert z[0] == TMAX  # ______
+
+
+def compare_to_logic_sim(wsim):
+    tests = MVArray((len(wsim.interface), wsim.sims))
+    choices = np.asarray([logic.ZERO, logic.ONE, logic.RISE, logic.FALL], dtype=np.uint8)
+    rng = np.random.default_rng(10)
+    tests.data[...] = rng.choice(choices, tests.data.shape)
+    tests_bp = BPArray(tests)
+    wsim.assign(tests_bp)
+    wsim.propagate()
+    cdata = wsim.capture()
+
+    resp = MVArray(tests)
+
+    for iidx, inode in enumerate(wsim.interface):
+        if len(inode.ins) > 0:
+            for vidx in range(wsim.sims):
+                resp.data[iidx, vidx] = logic.ZERO if cdata[iidx, vidx, 0] < 0.5 else logic.ONE
+                # resp.set_value(vidx, iidx, 0 if cdata[iidx, vidx, 0] < 0.5 else 1)
+
+    lsim = LogicSim(wsim.circuit, len(tests_bp))
+    lsim.assign(tests_bp)
+    lsim.propagate()
+    exp_bp = BPArray(tests_bp)
+    lsim.capture(exp_bp)
+    exp = MVArray(exp_bp)
+
+    for i in range(8):
+        exp_str = exp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
+        res_str = resp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
+        assert res_str == exp_str
+
+
+def test_b14(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8))
+
+
+def test_b14_strip_forks(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8, strip_forks=True))
+
+
+def test_b14_cuda(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_timing, 8, strip_forks=True))