diff --git a/src/kyupy/__init__.py b/src/kyupy/__init__.py
index 9be6485..ea967b4 100644
--- a/src/kyupy/__init__.py
+++ b/src/kyupy/__init__.py
@@ -17,6 +17,10 @@ import numpy as np
 _pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
 
 
+def cdiv(x, y):
+    return -(x // -y)
+
+
 def popcount(a):
     """Returns the number of 1-bits in a given packed numpy array."""
     return np.sum(_pop_count_lut[a])
diff --git a/src/kyupy/circuit.py b/src/kyupy/circuit.py
index 8802cec..5342a1a 100644
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@@ -228,10 +228,10 @@ class Circuit:
         Usually, nodes in the io_nodes list without any lines in their :py:attr:`Node.ins` list are primary inputs,
         and nodes without any lines in their :py:attr:`Node.outs` list are regarded as primary outputs.
         """
-        self.cells = {}
+        self.cells : dict[str, Node] = {}
         """A dictionary to access cells by name.
         """
-        self.forks = {}
+        self.forks : dict[str, Node] = {}
         """A dictionary to access forks by name.
         """
 
diff --git a/src/kyupy/sdf.py b/src/kyupy/sdf.py
index c1e3ebf..e2e9e0b 100644
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@@ -15,6 +15,7 @@ import numpy as np
 from lark import Lark, Transformer
 
 from . import log, readtext
+from .circuit import Circuit
 from .techlib import TechLib
 
 
@@ -27,17 +28,48 @@ class DelayFile:
     """
     def __init__(self, name, cells):
         self.name = name
-        if None in cells:
-            self.interconnects = cells[None]
-        else:
-            self.interconnects = None
+        self.interconnects = cells.get(None, None)
         self.cells = dict((n, l) for n, l in cells.items() if n)
 
     def __repr__(self):
         return '\n'.join(f'{n}: {l}' for n, l in self.cells.items()) + '\n' + \
                '\n'.join(str(i) for i in self.interconnects)
 
-    def annotation(self, circuit, tlib=TechLib(), dataset=1, interconnect=True, ffdelays=True):
+    def iopaths(self, circuit:Circuit, tlib=TechLib()):
+        """Constructs an ndarray containing all IOPATH delays.
+
+        All IOPATH delays for a node `n` are annotated to the line connected to the input pin specified in the IOPATH.
+
+        Axis 0: dataset (usually 3 datasets per SDF-file)
+        Axis 1: line index (e.g. `n.ins[0]`, `n.ins[1]`)
+        Axis 2: polarity of the transition at the IOPATH-input (e.g. at `n.ins[0]` or `n.ins[1]`), 0='rising/posedge', 1='falling/negedge'
+        Axis 3: polarity of the transition at the IOPATH-output (at `n.outs[0]`), 0='rising/posedge', 1='falling/negedge'
+        """
+
+        def find_cell(name:str):
+            if name not in circuit.cells: name = name.replace('\\', '')
+            if name not in circuit.cells: name = name.replace('[', '_').replace(']', '_')
+            return circuit.cells.get(name, None)
+
+        delays = np.zeros((len(circuit.lines), 2, 2, 3))  # dataset last during construction.
+
+        for name, iopaths in self.cells.items():
+            if cell := find_cell(name):
+                for i_pin_spec, o_pin_spec, *dels in iopaths:
+                    if i_pin_spec.startswith('(posedge '): i_pol_idxs = [0]
+                    elif i_pin_spec.startswith('(negedge '): i_pol_idxs = [1]
+                    else: i_pol_idxs = [0, 1]
+                    i_pin_spec = re.sub(r'\((neg|pos)edge ([^)]+)\)', r'\2', i_pin_spec)
+                    if line := cell.ins[tlib.pin_index(cell.kind, i_pin_spec)]:
+                        delays[line, i_pol_idxs] = [d if len(d) > 0 else [0, 0, 0] for d in dels]
+                    else:
+                        log.warn(f'No line to annotate in circuit: {i_pin_spec} for {cell}')
+            else:
+                log.warn(f'Name from SDF not found in circuit: {name}')
+
+        return np.moveaxis(delays, -1, 0)
+
+    def annotation(self, circuit:Circuit, tlib=TechLib(), dataset=1, interconnect=True, ffdelays=True):
         """Constructs an 3-dimensional ndarray with timing data for each line in ``circuit``.
 
         An IOPATH delay for a node is annotated to the line connected to the input pin specified in the IOPATH.
@@ -75,11 +107,9 @@ class DelayFile:
                 return sum(_delvals[idx][d] for d in dataset) / len(dataset)
             return _delvals[idx][dataset]
 
-        def find_cell(name):
-            if name not in circuit.cells:
-                name = name.replace('\\', '')
-            if name not in circuit.cells:
-                name = name.replace('[', '_').replace(']', '_')
+        def find_cell(name:str):
+            if name not in circuit.cells: name = name.replace('\\', '')
+            if name not in circuit.cells: name = name.replace('[', '_').replace(']', '_')
             if name not in circuit.cells:
                 return None
             return circuit.cells[name]
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index 0796968..a6feffd 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -47,7 +47,7 @@ class WaveSim(sim.SimOps):
     :param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
         memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
     """
-    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
+    def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
         assert c_caps > 0 and c_caps % 4 == 0
         super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
         self.sims = sims
@@ -56,8 +56,8 @@ class WaveSim(sim.SimOps):
         self.c_locs[...] *= 4
         self.c_caps[...] *= 4
 
-        self.timing = np.zeros((self.c_locs_len, 2, 2))
-        self.timing[:len(timing)] = timing
+        self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
+        self.delays[:, :delays.shape[1]] = delays
 
         self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
         self.s = np.zeros((11, self.s_len, sims), dtype=np.float32)
@@ -128,7 +128,7 @@ class WaveSim(sim.SimOps):
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims,
-                                         self.timing, self.params, sd, seed)
+                                         self.delays, self.params, sd, seed)
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         """Simulates a capture operation at all sequential elements and primary outputs.
@@ -173,7 +173,7 @@ def rand_gauss_cpu(seed, sd):
 
 
 @numba.njit
-def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
+def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
     lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
 
     # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
@@ -191,7 +191,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, s
     a_cur = int(0)
     b_cur = int(0)
     c_cur = int(0)
-    d_cur = int(0)                                          
+    d_cur = int(0)
     z_cur = lut & 1
     if z_cur == 1:
         cbuf[z_mem, st_idx] = TMIN
@@ -276,17 +276,116 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, s
 
         current_t = min(a, b, c, d)
 
-    # generate overflow flag or propagate from input
+    # generate or propagate overflow flag
     cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
 
 
+
 @numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, line_times, params, sd, seed):
+def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
+    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+
+    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
+    overflows = int(0)
+
+    if len(delays) > 1:
+        _rnd = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+        for _ in range(4):
+            _rnd = int(0xDEECE66D) * _rnd + 0xB
+        delays = delays[_rnd % len(delays)]
+    else:
+        delays = delays[0]
+
+    a_mem = c_locs[a_idx]
+    b_mem = c_locs[b_idx]
+    c_mem = c_locs[c_idx]
+    d_mem = c_locs[d_idx]
+    z_mem = c_locs[z_idx]
+    z_cap = c_caps[z_idx]
+
+    a_cur = int(0)
+    b_cur = int(0)
+    c_cur = int(0)
+    d_cur = int(0)
+    z_cur = lut & 1
+    if z_cur == 1:
+        cbuf[z_mem, st_idx] = TMIN
+
+    z_val = z_cur
+
+    a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
+    b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
+    c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
+    d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
+
+    previous_t = TMIN
+
+    current_t = min(a, b, c, d)
+    inputs = int(0)
+
+    while current_t < TMAX:
+        if a == current_t:
+            a_cur += 1
+            inputs ^= 1
+            thresh = delays[a_idx, 0, z_val]
+            a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
+            next_t = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val ^ 1]
+        elif b == current_t:
+            b_cur += 1
+            inputs ^= 2
+            thresh = delays[b_idx, 0, z_val]
+            b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
+            next_t = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val ^ 1]
+        elif c == current_t:
+            c_cur += 1
+            inputs ^= 4
+            thresh = delays[c_idx, 0, z_val]
+            c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
+            next_t = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val ^ 1]
+        else:
+            d_cur += 1
+            inputs ^= 8
+            thresh = delays[d_idx, 0, z_val]
+            d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
+            next_t = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val ^ 1]
+
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate an edge in z_mem, if ...
+            if (z_cur == 0                            # it is the first edge in z_mem ...
+                or next_t < current_t                 # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
+                or (current_t - previous_t) > thresh  # -OR- the generated hazard is wider than pulse threshold.
+                ):
+                if z_cur < (z_cap - 1):  # enough space in z_mem?
+                    cbuf[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    overflows += 1
+                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                z_cur -= 1
+                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
+
+            # output value of cell changed. update all delayed inputs.
+            z_val = z_val ^ 1
+            a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
+            b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
+            c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
+            d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
+
+        current_t = min(a, b, c, d)
+
+    # generate or propagate overflow flag
+    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
+
+@numba.njit
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
     overflows = 0
     for op_idx in range(op_start, op_stop):
         op = ops[op_idx]
         for st_idx in range(st_start, st_stop):
-            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, line_times, params[st_idx], sd, seed)
+            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, delays, params[st_idx], sd, seed)
 
 
 @numba.njit
@@ -342,15 +441,15 @@ class WaveSimCuda(WaveSim):
     All internal memories are mirrored into GPU memory upon construction.
     Some operations like access to single waveforms can involve large communication overheads.
     """
-    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
-        super().__init__(circuit, timing, sims, c_caps, c_reuse, strip_forks)
+    def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
+        super().__init__(circuit, delays, sims, c_caps, c_reuse, strip_forks)
 
         self.c = cuda.to_device(self.c)
         self.s = cuda.to_device(self.s)
         self.ops = cuda.to_device(self.ops)
         self.c_locs = cuda.to_device(self.c_locs)
         self.c_caps = cuda.to_device(self.c_caps)
-        self.timing = cuda.to_device(self.timing)
+        self.delays = cuda.to_device(self.delays)
         self.params = cuda.to_device(self.params)
 
         self._block_dim = (32, 16)
@@ -369,7 +468,7 @@ class WaveSimCuda(WaveSim):
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             grid_dim = self._grid_dim(sims, op_stop - op_start)
             wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0),
-                sims, self.timing, self.params, sd, seed)
+                sims, self.delays, self.params, sd, seed)
         cuda.synchronize()
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
@@ -423,7 +522,7 @@ def rand_gauss_gpu(seed, sd):
 
 
 @cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
+def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
     x, y = cuda.grid(2)
     st_idx = st_start + x
     op_idx = op_start + y
@@ -539,7 +638,119 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_sto
 
         current_t = min(a, b, c, d)
 
-    # generate overflow flag or propagate from input
+    # generate or propagate overflow flag
+    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
+
+
+@cuda.jit()
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
+    x, y = cuda.grid(2)
+    st_idx = st_start + x
+    op_idx = op_start + y
+    if st_idx >= st_stop: return
+    if op_idx >= op_stop: return
+
+    lut = ops[op_idx, 0]
+    z_idx = ops[op_idx, 1]
+    a_idx = ops[op_idx, 2]
+    b_idx = ops[op_idx, 3]
+    c_idx = ops[op_idx, 4]
+    d_idx = ops[op_idx, 5]
+
+    param = param[st_idx]
+
+    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
+    overflows = int(0)
+
+    if len(delays) > 1:
+        _rnd = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+        for _ in range(4):
+            _rnd = int(0xDEECE66D) * _rnd + 0xB
+        delays = delays[_rnd % len(delays)]
+    else:
+        delays = delays[0]
+
+    a_mem = c_locs[a_idx]
+    b_mem = c_locs[b_idx]
+    c_mem = c_locs[c_idx]
+    d_mem = c_locs[d_idx]
+    z_mem = c_locs[z_idx]
+    z_cap = c_caps[z_idx]
+
+    a_cur = int(0)
+    b_cur = int(0)
+    c_cur = int(0)
+    d_cur = int(0)
+    z_cur = lut & 1
+    if z_cur == 1:
+        cbuf[z_mem, st_idx] = TMIN
+
+    z_val = z_cur
+
+    a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
+    b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
+    c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
+    d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
+
+    previous_t = TMIN
+
+    current_t = min(a, b, c, d)
+    inputs = int(0)
+
+    while current_t < TMAX:
+        if a == current_t:
+            a_cur += 1
+            inputs ^= 1
+            thresh = delays[a_idx, 0, z_val]
+            a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
+            next_t = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val ^ 1]
+        elif b == current_t:
+            b_cur += 1
+            inputs ^= 2
+            thresh = delays[b_idx, 0, z_val]
+            b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
+            next_t = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val ^ 1]
+        elif c == current_t:
+            c_cur += 1
+            inputs ^= 4
+            thresh = delays[c_idx, 0, z_val]
+            c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
+            next_t = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val ^ 1]
+        else:
+            d_cur += 1
+            inputs ^= 8
+            thresh = delays[d_idx, 0, z_val]
+            d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
+            next_t = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val ^ 1]
+
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate an edge in z_mem, if ...
+            if (z_cur == 0                            # it is the first edge in z_mem ...
+                or next_t < current_t                 # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
+                or (current_t - previous_t) > thresh  # -OR- the generated hazard is wider than pulse threshold.
+                ):
+                if z_cur < (z_cap - 1):  # enough space in z_mem?
+                    cbuf[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    overflows += 1
+                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                z_cur -= 1
+                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
+
+            # output value of cell changed. update all delayed inputs.
+            z_val = z_val ^ 1
+            a = cbuf[a_mem + a_cur, st_idx] + delays[a_idx, 0, z_val]
+            b = cbuf[b_mem + b_cur, st_idx] + delays[b_idx, 0, z_val]
+            c = cbuf[c_mem + c_cur, st_idx] + delays[c_idx, 0, z_val]
+            d = cbuf[d_mem + d_cur, st_idx] + delays[d_idx, 0, z_val]
+
+        current_t = min(a, b, c, d)
+
+    # generate or propagate overflow flag
     cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 5d88c6b..e2b538e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,6 +13,6 @@ def b14_circuit(mydir):
     return verilog.load(mydir / 'b14.v.gz', branchforks=True)
 
 @pytest.fixture(scope='session')
-def b14_timing(mydir, b14_circuit):
+def b14_delays(mydir, b14_circuit):
     from kyupy import sdf
-    return sdf.load(mydir / 'b14.sdf.gz').annotation(b14_circuit)
+    return sdf.load(mydir / 'b14.sdf.gz').iopaths(b14_circuit)[1:2]
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
index 6a4bad3..1003689 100644
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -11,28 +11,28 @@ def test_nand_delays():
     c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
     c_locs = np.zeros((5,), dtype='int')
     c_caps = np.zeros((5,), dtype='int')
-    
+
     for i in range(5): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
 
     # SDF specifies IOPATH delays with respect to output polarity
     # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
-    line_times = np.zeros((5, 2, 2))
-    line_times[0, 0, 0] = 0.1  # A -> Z rise delay
-    line_times[0, 0, 1] = 0.2  # A -> Z fall delay
-    line_times[0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
-    line_times[0, 1, 1] = 0.2  # A -> Z positive pulse limit
-    line_times[1, :, 0] = 0.3  # as above for B -> Z
-    line_times[1, :, 1] = 0.4
-    line_times[2, :, 0] = 0.5  # as above for C -> Z
-    line_times[2, :, 1] = 0.6
-    line_times[3, :, 0] = 0.7  # as above for D -> Z
-    line_times[3, :, 1] = 0.8
+    delays = np.zeros((1, 5, 2, 2))
+    delays[0, 0, 0, 0] = 0.1  # A -> Z rise delay
+    delays[0, 0, 0, 1] = 0.2  # A -> Z fall delay
+    delays[0, 0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
+    delays[0, 0, 1, 1] = 0.2  # A -> Z positive pulse limit
+    delays[0, 1, :, 0] = 0.3  # as above for B -> Z
+    delays[0, 1, :, 1] = 0.4
+    delays[0, 2, :, 0] = 0.5  # as above for C -> Z
+    delays[0, 2, :, 1] = 0.6
+    delays[0, 3, :, 0] = 0.7  # as above for D -> Z
+    delays[0, 3, :, 1] = 0.8
 
     sdata = np.asarray([1, -1, 0, 0], dtype='float32')
 
     def wave_assert(inputs, output):
         for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval_cpu(op, c, c_locs, c_caps, 0, line_times, sdata)
+        wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, sdata)
         for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
 
     wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@@ -50,75 +50,74 @@ def test_nand_delays():
 
 def test_tiny_circuit():
     c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
-    lt = np.zeros((len(c.lines), 2, 2))
-    lt[:,0,:] = 1.0  # unit delay for all lines
-    wsim = WaveSim(c, lt)
+    delays = np.full((1, len(c.lines), 2, 2), 1.0)  # unit delay for all lines
+    wsim = WaveSim(c, delays)
     assert wsim.s.shape[1] == 5
 
     # values for x
-    wsim.s[:3,0,0] = 0, 0.1, 0
-    wsim.s[:3,0,1] = 0, 0.2, 1
-    wsim.s[:3,0,2] = 1, 0.3, 0
-    wsim.s[:3,0,3] = 1, 0.4, 1
-             
+    wsim.s[:3,0,0] = 0, 10, 0
+    wsim.s[:3,0,1] = 0, 20, 1
+    wsim.s[:3,0,2] = 1, 30, 0
+    wsim.s[:3,0,3] = 1, 40, 1
+
     # values for y
-    wsim.s[:3,1,0] = 1, 0.5, 0
-    wsim.s[:3,1,1] = 1, 0.6, 0
-    wsim.s[:3,1,2] = 1, 0.7, 0
-    wsim.s[:3,1,3] = 0, 0.8, 1
+    wsim.s[:3,1,0] = 1, 50, 0
+    wsim.s[:3,1,1] = 1, 60, 0
+    wsim.s[:3,1,2] = 1, 70, 0
+    wsim.s[:3,1,3] = 0, 80, 1
 
     wsim.s_to_c()
 
     x_c_loc = wsim.c_locs[wsim.ppi_offset+0] # check x waveforms
     np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 0], [TMAX, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 1], [0.2, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 2], [TMIN, 0.3, TMAX])
+    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 1], [20, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 2], [TMIN, 30, TMAX])
     np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 3], [TMIN, TMAX, TMAX])
 
     y_c_loc = wsim.c_locs[wsim.ppi_offset+1] # check y waveforms
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 0], [TMIN, 0.5, TMAX])
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 1], [TMIN, 0.6, TMAX])
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 2], [TMIN, 0.7, TMAX])
-    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 3], [0.8, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 0], [TMIN, 50, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 1], [TMIN, 60, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 2], [TMIN, 70, TMAX])
+    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 3], [80, TMAX, TMAX])
 
     wsim.c_prop()
 
     a_c_loc = wsim.c_locs[wsim.ppo_offset+2] # check a waveforms
     np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 0], [TMAX, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 1], [1.2, 1.6, TMAX])
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 2], [TMIN, 1.3, TMAX])
-    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 3], [1.8, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 1], [21, 61, TMAX])
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 2], [TMIN, 31, TMAX])
+    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 3], [81, TMAX, TMAX])
 
     o_c_loc = wsim.c_locs[wsim.ppo_offset+3] # check o waveforms
-    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 0], [TMIN, 1.5, TMAX])
+    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 0], [TMIN, 51, TMAX])
     np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 1], [TMIN, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 2], [TMIN, 1.7, TMAX])
+    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 2], [TMIN, 71, TMAX])
     np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 3], [TMIN, TMAX, TMAX])
 
     n_c_loc = wsim.c_locs[wsim.ppo_offset+4] # check n waveforms
     np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 0], [TMIN, TMAX, TMAX])
-    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 1], [TMIN, 1.2, TMAX])
-    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 2], [1.3, TMAX, TMAX])
+    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 1], [TMIN, 21, TMAX])
+    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 2], [31, TMAX, TMAX])
     np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 3], [TMAX, TMAX, TMAX])
 
     wsim.c_to_s()
 
     # check a captures
     np.testing.assert_allclose(wsim.s[3:7, 2, 0], [0, TMAX, TMIN, 0])
-    np.testing.assert_allclose(wsim.s[3:7, 2, 1], [0, 1.2, 1.6, 0])
-    np.testing.assert_allclose(wsim.s[3:7, 2, 2], [1, 1.3, 1.3, 0])
-    np.testing.assert_allclose(wsim.s[3:7, 2, 3], [0, 1.8, 1.8, 1])
-                                      
-    # check o captures                
-    np.testing.assert_allclose(wsim.s[3:7, 3, 0], [1, 1.5, 1.5, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 1], [0, 21, 61, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 2], [1, 31, 31, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 3], [0, 81, 81, 1])
+
+    # check o captures
+    np.testing.assert_allclose(wsim.s[3:7, 3, 0], [1, 51, 51, 0])
     np.testing.assert_allclose(wsim.s[3:7, 3, 1], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[3:7, 3, 2], [1, 1.7, 1.7, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 2], [1, 71, 71, 0])
     np.testing.assert_allclose(wsim.s[3:7, 3, 3], [1, TMAX, TMIN, 1])
-                                      
-    # check o captures                
+
+    # check n captures
     np.testing.assert_allclose(wsim.s[3:7, 4, 0], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[3:7, 4, 1], [1, 1.2, 1.2, 0])
-    np.testing.assert_allclose(wsim.s[3:7, 4, 2], [0, 1.3, 1.3, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 1], [1, 21, 21, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 2], [0, 31, 31, 1])
     np.testing.assert_allclose(wsim.s[3:7, 4, 3], [0, TMAX, TMIN, 0])
 
 
@@ -157,13 +156,16 @@ def compare_to_logic_sim(wsim: WaveSim):
     np.testing.assert_allclose(resp, exp)
 
 
-def test_b14(b14_circuit, b14_timing):
-    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8))
+def test_b14(b14_circuit, b14_delays):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_delays, 8))
+
 
+def test_b14_strip_forks(b14_circuit, b14_delays):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_delays, 8, strip_forks=True))
 
-def test_b14_strip_forks(b14_circuit, b14_timing):
-    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8, strip_forks=True))
 
+def test_b14_cuda(b14_circuit, b14_delays):
+    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_delays, 8, strip_forks=True))
 
-def test_b14_cuda(b14_circuit, b14_timing):
-    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_timing, 8, strip_forks=True))
+if __name__ == '__main__':
+    test_nand_delays()
\ No newline at end of file