new wave sim

3 years ago · f1ebe1487c
4 changed files with 552 additions and 1125 deletions
--- a/src/kyupy/schedule.py
+++ b/src/kyupy/schedule.py
@ -145,7 +145,7 @@ class Heap:
				@@ -145,7 +145,7 @@ class Heap:
        return "\n".join(r)


-class Schedule:
+class SimOps:
    """A static scheduler that translates a Circuit into a topologically sorted list of basic logic operations (self.ops) and
    a value allocation table (self.vat) for use in simulators.

@ -155,23 +155,24 @@ class Schedule:
				@@ -155,23 +155,24 @@ class Schedule:
    :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
        memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
    """
-    def __init__(self, circuit, strip_forks=False, keep_signals=True, signal_caps=1):
+    def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False):
        self.circuit = circuit
-        self.interface = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+        self.s_nodes = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+        keep_signals = not c_reuse
        
-        if isinstance(signal_caps, int):
-            signal_caps = [signal_caps] * len(circuit.lines)
+        if isinstance(c_caps, int):
+            c_caps = [c_caps] * len(circuit.lines)

        # indices for state allocation table (sat)
        self.zero_idx = len(circuit.lines)
        self.tmp_idx = self.zero_idx + 1
        self.ppi_offset = self.tmp_idx + 1
-        self.ppo_offset = self.ppi_offset + len(self.interface)
-        self.vat_length = self.ppo_offset + len(self.interface)
+        self.ppo_offset = self.ppi_offset + len(self.s_nodes)
+        self.vat_length = self.ppo_offset + len(self.s_nodes)

        # translate circuit structure into self.ops
        ops = []
-        interface_dict = dict((n, i) for i, n in enumerate(self.interface))
+        interface_dict = dict((n, i) for i, n in enumerate(self.s_nodes))
        for n in circuit.topological_order():
            if n in interface_dict:
                inp_idx = self.ppi_offset + interface_dict[n]
@ -260,7 +261,7 @@ class Schedule:
				@@ -260,7 +261,7 @@ class Schedule:
        ref_count[self.tmp_idx] += 1

        # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
-        for i, n in enumerate(self.interface):
+        for i, n in enumerate(self.s_nodes):
            if len(n.outs) > 0:
                self.vat[self.ppi_offset + i] = h.alloc(1), 1, 0
                ref_count[self.ppi_offset + i] += 1
@ -286,7 +287,7 @@ class Schedule:
				@@ -286,7 +287,7 @@ class Schedule:
                if ref_count[i2_idx] <= 0: free_list.append(self.vat[i2_idx, 0])
                if ref_count[i3_idx] <= 0: free_list.append(self.vat[i3_idx, 0])
                o_idx = op[1]
-                cap = signal_caps[o_idx]
+                cap = c_caps[o_idx]
                self.vat[o_idx] = h.alloc(cap), cap, 0
            if not keep_signals:
                for loc in free_list:
@ -298,11 +299,11 @@ class Schedule:
				@@ -298,11 +299,11 @@ class Schedule:
                self.vat[lidx] = self.vat[stem]

        # copy memory location to PO/PPO area
-        for i, n in enumerate(self.interface):
+        for i, n in enumerate(self.s_nodes):
            if len(n.ins) > 0:
                self.vat[self.ppo_offset + i] = self.vat[n.ins[0]]

-        self.state_length = h.max_size
+        self.c_len = h.max_size

        from collections import defaultdict
        self.prim_counts = defaultdict(int)
--- a/src/kyupy/wave_sim4.py
+++ b/src/kyupy/wave_sim4.py
@ -0,0 +1,365 @@
				@@ -0,0 +1,365 @@
+"""High-throughput combinational logic timing simulators.
+
+These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`.
+They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
+Instead of propagating logic values, these simulators propagate signal histories (waveforms).
+They are designed to run many simulations in parallel and while their latencies are quite high, they can achieve
+high throughput.
+
+The simulators are not event-based and are not capable of simulating sequential circuits directly.
+"""
+
+import math
+from bisect import bisect, insort_left
+
+import numpy as np
+
+from . import numba, cuda, hr_bytes
+from .sim import SimOps
+
+
+# Sentinel values stored inside waveform arrays. ``wave_capture`` stops reading a waveform at the first
+# entry ``>= TMAX`` (which also matches ``TMAX_OVL``) and does not count entries ``<= TMIN`` as transitions.
+TMAX = np.float32(2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform."""
+# Slightly larger than TMAX so that an overflowed terminator still satisfies ``t >= TMAX``.
+TMAX_OVL = np.float32(1.1 * 2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform that
+may be incomplete due to an overflow."""
+TMIN = np.float32(-2 ** 127)
+"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
+
+
+class WaveSim(SimOps):
+    """A waveform-based combinational logic timing simulator running on CPU.
+
+    :param circuit: The circuit to simulate.
+    :param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details)
+    :param sims: The number of parallel simulations.
+    :param c_caps: The number of floats available in each waveform. Values must be positive and a multiple of 4.
+        Waveforms encode the signal switching history by storing transition times.
+        The waveform capacity roughly corresponds to the number of transitions
+        that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are
+        generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow
+        flag is set. If an integer is given, all waveforms are set to that same capacity. With an array of length
+        ``len(circuit.lines)`` the capacity is set for each intermediate waveform individually.
+    :param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
+        memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
+    :param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time
+        by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes
+        are ignored.
+    :raises ValueError: If any capacity in ``c_caps`` is not a positive multiple of 4.
+    """
+    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
+        # ``c_caps`` may be a single integer or a per-line sequence/array; validate both forms uniformly.
+        caps = np.asarray(c_caps)
+        if not (np.all(caps > 0) and np.all(caps % 4 == 0)):
+            raise ValueError('c_caps must be positive and a multiple of 4')
+        # The base class allocates in units of 4 floats. A scalar is handed over as a plain ``int`` because
+        # the base class uses ``isinstance(c_caps, int)`` to decide whether to broadcast it to all lines.
+        caps = int(caps) // 4 if caps.ndim == 0 else caps // 4
+        super().__init__(circuit, c_caps=caps, c_reuse=c_reuse, strip_forks=strip_forks)
+        self.sims = sims
+
+        # Convert allocation units (blocks of 4) back to float counts: total length, locations and capacities.
+        self.c_len *= 4
+        self.vat[...,0:2] *= 4
+
+        self.timing = np.zeros((self.c_len, 2, 2))
+        self.timing[:len(timing)] = timing
+
+        # All waveform memory starts out as TMAX, i.e. every waveform is empty (constant logic-0).
+        self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
+        self.s = np.zeros((len(self.s_nodes), sims, 11), dtype=np.float32)
+        """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
+
+        The first 3 values are read by ``s_to_c()``.
+        The remaining values are written by ``c_to_s()``.
+
+        The elements are as follows:
+        * ``s[..., 0]`` (P)PI initial value
+        * ``s[..., 1]`` (P)PI transition time
+        * ``s[..., 2]`` (P)PI final value
+        * ``s[..., 3]`` (P)PO initial value
+        * ``s[..., 4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value.
+        * ``s[..., 5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value.
+        * ``s[..., 6]`` (P)PO final value
+        * ``s[..., 7]`` (P)PO capture value: probability of capturing a 1 at a given capture time
+        * ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
+        * ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
+        * ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
+          transitions than specified in ``c_caps``. Some transitions have been discarded, the
+          final values in the waveforms are still valid.
+        """
+
+        # Per-simulation parameters read by ``wave_eval``: [0] delay scale factor, [1] a line index,
+        # [2] and [3] extra delays added on that line (selected by output polarity).
+        self.params = np.zeros((sims, 4), dtype=np.float32)
+        self.params[...,0] = 1.0
+
+        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype=np.uint8)
+        m0 = ~m1
+        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
+
+        self.overflows = 0
+        self.lst_eat_valid = False
+
+        # Indices into ``self.s`` (``*_s_locs``) of io nodes that have an allocated PI / PO waveform,
+        # and of the remaining (sequential) nodes, which act as both PPI and PPO.
+        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
+
+        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
+        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
+
+        # Matching start locations of the corresponding waveforms in ``self.c`` (``*_c_locs``).
+        self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0]
+        self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0]
+        self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0]
+        self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0]
+
+        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
+        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
+
+        # Shadow the static method with a compiled version on this instance.
+        self.wave_capture = numba.njit(WaveSim.wave_capture)
+
+    def __repr__(self):
+        total_mem = self.c.nbytes + self.vat.nbytes + self.ops.nbytes + self.s.nbytes
+        return f'<WaveSim {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
+
+    def get_line_delay(self, line, polarity):
+        """Returns the current delay of the given ``line`` and ``polarity`` in the simulation model."""
+        return self.timing[line, 0, polarity]
+
+    def set_line_delay(self, line, polarity, delay):
+        """Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model."""
+        self.timing[line, 0, polarity] = delay
+
+    def s_to_c(self):
+        """Transfers values of sequential elements and primary inputs to the combinational portion.
+
+        Based on the data in ``self.s``, waveforms are generated on the input lines of the circuit.
+        It modifies ``self.c``.
+        """
+        sins = np.moveaxis(self.s[self.pippi_s_locs], -1, 0)
+        # cond = final + 2*initial: 0 -> constant 0, 1 -> rising, 2 -> falling, 3 -> constant 1.
+        cond = (sins[2] != 0) + 2*(sins[0] != 0)  # choices order: 0 R F 1
+        self.c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN])
+        self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
+        self.c[self.pippi_c_locs+2] = TMAX
+
+    def c_prop(self, sims=None, sd=0.0, seed=1):
+        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
+
+        The number of overflows reported by the evaluation is added to ``self.overflows``.
+
+        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
+        :param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
+        :param seed: Random seed for delay variations.
+        """
+        sims = min(sims or self.sims, self.sims)
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            self.overflows += level_eval(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
+                                         self.timing, self.params, sd, seed)
+        self.lst_eat_valid = False
+
+    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
+        """Simulates a capture operation at all sequential elements and primary outputs.
+
+        Propagated waveforms in ``self.c`` at and around the given capture time are analyzed and
+        the results are stored in ``self.s``.
+
+        :param time: The desired capture time. By default, a capture of the settled value is performed.
+        :param sd: A standard deviation for uncertainty in the actual capture time.
+        :param seed: The random seed for a capture with uncertainty.
+        """
+        for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]):
+            for vector in range(self.sims):
+                self.s[s_loc, vector, 3:] = self.wave_capture(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
+
+    def s_ppo_to_ppi(self, time=0.0):
+        """Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI).
+        Each PPI transition is constructed from its previous final value, the
+        given time, and the sampled captured value of its PPO. Reads and modifies ``self.s``.
+
+        :param time: The transition time at the inputs (usually 0.0).
+        """
+        self.s[self.ppio_s_locs, :, 0] = self.s[self.ppio_s_locs, :, 2]
+        self.s[self.ppio_s_locs, :, 1] = time
+        self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8]
+
+    @staticmethod
+    def wave_capture(c, c_loc, c_len, vector, time=TMAX, sd=0.0, seed=1):
+        """Analyzes one waveform and returns the eight values that ``c_to_s()`` stores in ``s[..., 3:]``.
+
+        :param c: The waveform memory (see ``self.c``).
+        :param c_loc: Start location of the waveform in ``c``.
+        :param c_len: Capacity of the waveform.
+        :param vector: Index of the simulation (second axis of ``c``).
+        :param time: The capture time.
+        :param sd: Standard deviation of the capture time; a positive value enables probabilistic capture.
+        :param seed: Random seed used for sampling the captured value.
+        :return: ``(initial value, EAT, LST, final value, capture probability, sampled capture value, 0, overflow)``.
+            The seventh element is always 0 in this implementation.
+        """
+        s_sqrt2 = sd * math.sqrt(2)
+        m = 0.5
+        acc = 0.0
+        eat = TMAX
+        lst = TMIN
+        tog = 0
+        ovl = 0
+        val = int(0)
+        final = int(0)
+        w = c[c_loc:c_loc+c_len, vector]
+        for t in w:
+            if t >= TMAX:
+                # End of waveform; the special terminator marks a waveform that lost transitions.
+                if t == TMAX_OVL:
+                    ovl = 1
+                break
+            m = -m
+            final ^= 1
+            if t < time:
+                val ^= 1
+            # A leading TMIN only encodes an initial logic-1; it is not a real transition.
+            if t <= TMIN: continue
+            if s_sqrt2 > 0:
+                acc += m * (1 + math.erf((t - time) / s_sqrt2))
+            eat = min(eat, t)
+            lst = max(lst, t)
+            tog += 1
+        if s_sqrt2 > 0:
+            if m < 0:
+                acc += 1
+            if acc >= 0.99:
+                val = 1
+            elif acc > 0.01:
+                # Undecided capture: draw a pseudo-random number derived from seed, vector and location.
+                seed = (seed << 4) + (vector << 20) + c_loc
+                seed = int(0xDEECE66D) * seed + 0xB
+                seed = int(0xDEECE66D) * seed + 0xB
+                rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
+                val = rnd < acc
+            else:
+                val = 0
+        else:
+            acc = val
+
+        return (w[0] <= TMIN), eat, lst, final, acc, val, 0, ovl
+
+
+@numba.njit
+def level_eval(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
+    """Evaluates the operations ``ops[op_start:op_stop]`` for the simulations ``st_start`` to ``st_stop - 1``.
+
+    Each operation is passed to ``wave_eval`` together with the parameter row of the respective simulation.
+
+    :return: The sum of all overflow counts reported by ``wave_eval``.
+    """
+    total = 0
+    for pos in range(op_start, op_stop):
+        current_op = ops[pos]
+        for sim in range(st_start, st_stop):
+            total += wave_eval(current_op, c, vat, sim, line_times, params[sim], sd, seed)
+    return total
+
+
+@numba.njit
+def rand_gauss(seed, sd):
+    """Returns a pseudo-random scale factor around 1.0, derived deterministically from ``seed``.
+
+    Twelve pseudo-random numbers from [0, 1) are summed up, centered by subtracting 6 and scaled by ``sd``.
+    Draws whose magnitude exceeds 0.5 are rejected and repeated, so the result lies within [0.5, 1.5].
+    For a non-positive ``sd``, the result is exactly 1.0.
+    """
+    if sd <= 0.0:
+        return 1.0
+    limit = 0.5
+    deviation = limit + 1.0  # out of range on purpose, forces the first draw
+    while not (abs(deviation) <= limit):
+        total = -6.0
+        for _ in range(12):
+            seed = int(0xDEECE66D) * seed + 0xB
+            total += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        deviation = total * sd
+    return deviation + 1.0
+
+
+@numba.njit
+def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
+    """Evaluates a single operation with up to four inputs for one simulation and writes the output waveform.
+
+    :param op: A tuple ``(lut, z_idx, a_idx, b_idx, c_idx, d_idx)``: the truth table as a bit field, the index
+        of the output line and the indices of the four input lines.
+    :param cbuf: The waveform memory. It is read at the input locations and written at the output location.
+    :param vat: The allocation table. ``vat[idx, 0]`` is the start location of a waveform in ``cbuf`` and
+        ``vat[idx, 1]`` its capacity.
+    :param st_idx: Index of the simulation (second axis of ``cbuf``).
+    :param line_times: Per-line timing. ``[idx, 0, polarity]`` is added to input transition times and
+        ``[idx, 1, polarity]`` is used as pulse-width threshold.
+    :param param: Parameters of this simulation: ``param[0]`` scales all delays, ``param[1]`` selects a line
+        index whose delays are increased by ``param[2]`` or ``param[3]`` (depending on polarity).
+    :param sd: Standard deviation handed to ``rand_gauss`` for random delay variation.
+    :param seed: Random seed; it is combined with ``z_idx`` and ``st_idx``.
+    :return: The number of overflows that occurred while writing the output waveform.
+    """
+    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+    overflows = int(0)
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_mem = vat[a_idx, 0]
+    b_mem = vat[b_idx, 0]
+    c_mem = vat[c_idx, 0]
+    d_mem = vat[d_idx, 0]
+    z_mem, z_cap, _ = vat[z_idx]
+
+    # Read cursors into the four input waveforms.
+    a_cur = int(0)
+    b_cur = int(0)
+    c_cur = int(0)
+    d_cur = int(0)                                          
+    # z_cur is the write cursor into the output waveform; its lowest bit is also used as the current output
+    # value. Bit 0 of the truth table is the output for all inputs at 0; if it is 1, the waveform starts with TMIN.
+    z_cur = lut & 1
+    if z_cur == 1:
+        cbuf[z_mem, st_idx] = TMIN
+
+    # First pending transition time of each input, shifted by the (randomized, scaled) line delay.
+    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == a_idx: a += param[2+z_cur]
+    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == b_idx: b += param[2+z_cur]
+    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss(_seed ^ c_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == c_idx: c += param[2+z_cur]
+    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss(_seed ^ d_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == d_idx: d += param[2+z_cur]
+    
+    previous_t = TMIN
+
+    current_t = min(a, b, c, d)
+    # Bit field of the current input values (bit 0: a, bit 1: b, bit 2: c, bit 3: d); used to index the truth table.
+    inputs = int(0)
+
+    # Process pending input transitions in order of time until all inputs have reached their end marker.
+    while current_t < TMAX:
+        z_val = z_cur & 1
+        # Consume the transition from the input it belongs to (a is checked first, then b, c, d), fetch that
+        # input's next transition with the delay for the opposite output polarity, and toggle its input bit.
+        if a == current_t:
+            a_cur += 1
+            a = cbuf[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == a_idx:
+                a += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 1
+            next_t = a   
+        
+        elif b == current_t:
+            b_cur += 1
+            b = cbuf[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == b_idx:
+                b += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 2
+            next_t = b
+                
+        elif c == current_t:
+            c_cur += 1
+            c = cbuf[c_mem + c_cur, st_idx]
+            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[c_idx, 1, z_val] * rand_gauss(_seed ^ c_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == c_idx:
+                c += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 4
+            next_t = c 
+                     
+        else:
+            d_cur += 1
+            d = cbuf[d_mem + d_cur, st_idx]
+            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[d_idx, 1, z_val] * rand_gauss(_seed ^ d_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == d_idx:
+                d += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 8
+            next_t = d 
+       
+        # The output only changes if the truth table yields a value different from the current output value.
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):
+                    cbuf[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    # No space left: count an overflow and drop the previously written toggle instead.
+                    overflows += 1
+                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                # Pulse too short: cancel the previously written toggle by moving the write cursor back.
+                z_cur -= 1
+                if z_cur > 0:
+                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
+                else:
+                    previous_t = TMIN
+        
+        current_t = min(a, b, c, d)
+                     
+    # Terminate the output waveform. After the loop, a to d all hold values that are not below TMAX.
+    if overflows > 0:
+        cbuf[z_mem + z_cur, st_idx] = TMAX_OVL
+    else:
+        cbuf[z_mem + z_cur, st_idx] = a if a == max(a, b, c, d) else b if b == max(a, b, c, d) else c if c == max(a, b, c, d) else d   # propagate overflow flags by storing biggest TMAX from input
+
+    return overflows
--- a/src/kyupy/wave_sim_4ig.py
+++ b/src/kyupy/wave_sim_4ig.py
--- a/tests/test_wave_sim4.py
+++ b/tests/test_wave_sim4.py
@ -0,0 +1,174 @@
				@@ -0,0 +1,174 @@
+import numpy as np
+
+from kyupy.wave_sim4 import WaveSim, wave_eval, TMIN, TMAX
+from kyupy.logic_sim import LogicSim
+from kyupy import verilog, sdf, logic, bench
+from kyupy.logic import MVArray, BPArray
+from kyupy.sim import SimPrim
+
+
+def test_nand_delays():
+    """Checks delays and pulse filtering of ``wave_eval`` on a single 4-input NAND."""
+    cap = 16  # capacity of each of the 5 waveforms (A, B, C, D, Z)
+    nand4 = (SimPrim.NAND4, 4, 0, 1, 2, 3)
+    cbuf = np.full((5 * cap, 1), TMAX)
+    waves = cbuf.reshape(-1, cap)  # one row per waveform, shares memory with cbuf
+    vat = np.zeros((5, 3), dtype='int')
+    for line in range(5):
+        vat[line] = line * cap, cap, 0  # 1:1 mapping
+
+    # SDF specifies IOPATH delays with respect to output polarity
+    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
+    # Per input (A, B, C, D): the first value is used for a rising Z (delay and negative pulse limit),
+    # the second value for a falling Z (delay and positive pulse limit).
+    line_times = np.zeros((5, 2, 2))
+    for line, (for_rise, for_fall) in enumerate([(0.1, 0.2), (0.3, 0.4), (0.5, 0.6), (0.7, 0.8)]):
+        line_times[line, :, 0] = for_rise
+        line_times[line, :, 1] = for_fall
+
+    sdata = np.asarray([1, -1, 0, 0], dtype='float32')
+
+    def wave_assert(inputs, output):
+        # Overwrite the beginning of the given input waveforms, evaluate, and compare the beginning of Z.
+        for wave, target in zip(inputs, waves):
+            target[:len(wave)] = wave
+        wave_eval(nand4, cbuf, vat, 0, line_times, sdata)
+        for pos, expected in enumerate(output):
+            np.testing.assert_allclose(waves[4, pos], expected)
+
+    wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
+    wave_assert([[TMIN,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(1,0,1,1) => 1
+    wave_assert([[TMIN,TMAX],[TMIN,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMAX])      # NAND(1,1,1,1) => 0
+
+    # Keep inputs C=1 and D=1.
+    wave_assert([[1,TMAX],[2,TMAX]], [TMIN,2.4,TMAX])              # _/⎺⎺⎺ NAND __/⎺⎺ => ⎺⎺⎺\___ (B->Z fall delay)
+    wave_assert([[TMIN,TMAX],[TMIN,2,TMAX]],  [2.3,TMAX])          # ⎺⎺⎺⎺⎺ NAND ⎺⎺\__ => ___/⎺⎺⎺ (B->Z rise delay)
+    wave_assert([[TMIN,TMAX],[TMIN,2,2.35,TMAX]], [2.3,2.75,TMAX]) # ⎺⎺⎺⎺⎺ NAND ⎺\_/⎺ => __/⎺⎺\_ (pos pulse, .35@B -> .45@Z)
+    wave_assert([[TMIN,TMAX],[TMIN,2,2.25,TMAX]], [TMAX])          # ⎺⎺⎺⎺⎺ NAND ⎺\_/⎺ => _______ (pos pulse, .25@B -> .35@Z, filtered)
+    wave_assert([[TMIN,TMAX],[2,2.45,TMAX]], [TMIN,2.4,2.75,TMAX]) # ⎺⎺⎺⎺⎺ NAND _/⎺\_ => ⎺⎺\_/⎺⎺ (neg pulse, .45@B -> .35@Z)
+    wave_assert([[TMIN,TMAX],[2,2.35,TMAX]], [TMIN,TMAX])          # ⎺⎺⎺⎺⎺ NAND _/⎺\_ => ⎺⎺⎺⎺⎺⎺⎺ (neg pulse, .35@B -> .25@Z, filtered)
+
+
+def test_tiny_circuit():
+    """Simulates a three-gate circuit with unit delays and checks input waveforms, output waveforms and captures."""
+    c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
+    lt = np.zeros((len(c.lines), 2, 2))
+    lt[:,0,:] = 1.0  # unit delay for all lines
+    wsim = WaveSim(c, lt)
+    print(wsim.prim_counts)
+    assert len(wsim.s) == 5
+
+    def check_waves(c_loc, expected):
+        # Compare the first three entries of the waveform at c_loc, one simulation after the other.
+        for sim, wave in enumerate(expected):
+            np.testing.assert_allclose(wsim.c[c_loc:c_loc+3, sim], wave)
+
+    def check_captures(s_loc, expected):
+        # Compare (initial value, EAT, LST, final value) of the given node, one simulation after the other.
+        for sim, capture in enumerate(expected):
+            np.testing.assert_allclose(wsim.s[s_loc, sim, 3:7], capture)
+
+    # (initial value, transition time, final value) of x and y for simulations 0 to 3
+    x_stimuli = [(0, 0.1, 0), (0, 0.2, 1), (1, 0.3, 0), (1, 0.4, 1)]
+    y_stimuli = [(1, 0.5, 0), (1, 0.6, 0), (1, 0.7, 0), (0, 0.8, 1)]
+    for sim, stimulus in enumerate(x_stimuli):
+        wsim.s[0, sim, :3] = stimulus
+    for sim, stimulus in enumerate(y_stimuli):
+        wsim.s[1, sim, :3] = stimulus
+
+    wsim.s_to_c()
+
+    # waveforms of input x
+    check_waves(wsim.vat[wsim.ppi_offset+0, 0], [
+        [TMAX, TMAX, TMAX],
+        [0.2, TMAX, TMAX],
+        [TMIN, 0.3, TMAX],
+        [TMIN, TMAX, TMAX],
+    ])
+
+    # waveforms of input y
+    check_waves(wsim.vat[wsim.ppi_offset+1, 0], [
+        [TMIN, 0.5, TMAX],
+        [TMIN, 0.6, TMAX],
+        [TMIN, 0.7, TMAX],
+        [0.8, TMAX, TMAX],
+    ])
+
+    wsim.c_prop()
+
+    # waveforms of output a
+    check_waves(wsim.vat[wsim.ppo_offset+2, 0], [
+        [TMAX, TMAX, TMAX],
+        [1.2, 1.6, TMAX],
+        [TMIN, 1.3, TMAX],
+        [1.8, TMAX, TMAX],
+    ])
+
+    # waveforms of output o
+    check_waves(wsim.vat[wsim.ppo_offset+3, 0], [
+        [TMIN, 1.5, TMAX],
+        [TMIN, TMAX, TMAX],
+        [TMIN, 1.7, TMAX],
+        [TMIN, TMAX, TMAX],
+    ])
+
+    # waveforms of output n
+    check_waves(wsim.vat[wsim.ppo_offset+4, 0], [
+        [TMIN, TMAX, TMAX],
+        [TMIN, 1.2, TMAX],
+        [1.3, TMAX, TMAX],
+        [TMAX, TMAX, TMAX],
+    ])
+
+    wsim.c_to_s()
+
+    # captures of output a
+    check_captures(2, [
+        [0, TMAX, TMIN, 0],
+        [0, 1.2, 1.6, 0],
+        [1, 1.3, 1.3, 0],
+        [0, 1.8, 1.8, 1],
+    ])
+
+    # captures of output o
+    check_captures(3, [
+        [1, 1.5, 1.5, 0],
+        [1, TMAX, TMIN, 1],
+        [1, 1.7, 1.7, 0],
+        [1, TMAX, TMIN, 1],
+    ])
+
+    # captures of output n
+    check_captures(4, [
+        [1, TMAX, TMIN, 1],
+        [1, 1.2, 1.2, 0],
+        [0, 1.3, 1.3, 1],
+        [0, TMAX, TMIN, 0],
+    ])
+
+
+def compare_to_logic_sim(wsim: WaveSim):
+    """Runs random transition tests through ``wsim`` and checks the responses against ``LogicSim``.
+
+    Random values (0, 1, rise, fall) are assigned to all nodes in ``wsim.s_nodes`` for every simulation.
+    After propagation and capture, the initial and final values found by the waveform simulator must match
+    the logic simulation results for all ``wsim.sims`` simulations.
+    """
+    tests = MVArray((len(wsim.s_nodes), wsim.sims))
+    choices = np.asarray([logic.ZERO, logic.ONE, logic.RISE, logic.FALL], dtype=np.uint8)
+    rng = np.random.default_rng(10)
+    tests.data[...] = rng.choice(choices, tests.data.shape)
+
+    # Bit 1 of the test data is used as initial value, bit 0 as final value.
+    # The output fields (3 and 6) are pre-loaded with the same values.
+    wsim.s[..., 0] = (tests.data & 2) >> 1
+    wsim.s[..., 3] = (tests.data & 2) >> 1
+    wsim.s[..., 1] = 0.0
+    wsim.s[..., 2] = tests.data & 1
+    wsim.s[..., 6] = tests.data & 1
+
+    wsim.s_to_c()
+    wsim.c_prop()
+    wsim.c_to_s()
+
+    resp = MVArray(tests)
+    resp.data[...] = wsim.s[..., 6].astype(np.uint8) | (wsim.s[..., 3].astype(np.uint8)<<1)
+    resp.data |= ((resp.data ^ (resp.data >> 1)) & 1) << 2  # transitions
+
+    tests_bp = BPArray(tests)
+    lsim = LogicSim(wsim.circuit, len(tests_bp))
+    lsim.assign(tests_bp)
+    lsim.propagate()
+    exp_bp = BPArray(tests_bp)
+    lsim.capture(exp_bp)
+    exp = MVArray(exp_bp)
+
+    # Compare every simulated vector (not just a fixed number of them); pulses are treated as their settled value.
+    for i in range(wsim.sims):
+        exp_str = exp[i].replace('P', '0').replace('N', '1')
+        res_str = resp[i].replace('P', '0').replace('N', '1')
+        assert res_str == exp_str
+
+
+def test_b14(mydir):
+    """Compares waveform simulation of the b14 benchmark (8 simulations) with logic simulation."""
+    circuit = verilog.load(mydir / 'b14.v.gz', branchforks=True)
+    delays = sdf.load(mydir / 'b14.sdf.gz').annotation(circuit)
+    compare_to_logic_sim(WaveSim(circuit, delays, 8))
+
+
+def test_b14_strip_forks(mydir):
+    """Compares waveform simulation of the b14 benchmark with logic simulation, with ``strip_forks`` enabled."""
+    circuit = verilog.load(mydir / 'b14.v.gz', branchforks=True)
+    delays = sdf.load(mydir / 'b14.sdf.gz').annotation(circuit)
+    compare_to_logic_sim(WaveSim(circuit, delays, 8, strip_forks=True))
+