21 changed files with 322 additions and 915 deletions
--- a/docs/conf.py
+++ b/docs/conf.py
@ -24,7 +24,7 @@ copyright = '2020-2023, Stefan Holst'
 author = 'Stefan Holst'
 # The full version, including alpha/beta/rc tags
-release = '0.0.5'
+release = '0.0.4'
 # -- General configuration ---------------------------------------------------
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ with open('README.rst', 'r') as f:
 setup(
    name='kyupy',
-    version='0.0.5',
+    version='0.0.4',
    description='High-performance processing and analysis of non-hierarchical VLSI designs',
    long_description=long_description,
    long_description_content_type='text/x-rst',
--- a/src/kyupy/init.py
+++ b/src/kyupy/init.py
@ -57,18 +57,6 @@ def hr_bytes(nbytes):
        multiplier += 1
    return f'{nbytes:.1f}{["", "ki", "Mi", "Gi", "Ti", "Pi"][multiplier]}B'
 def eng(number):
    """Formats a given number using engineering notation.

    The number is scaled by powers of 1000 until its magnitude lies in the
    range [1, 1000), then rendered without decimals. A non-zero exponent
    (always a multiple of 3) is appended as ``e<exponent>``.

    Zero and non-finite values (``inf``, ``nan``) cannot be scaled into that
    range and are formatted as they are, without an exponent.

    :param number: The number to format.
    :return: A string such as ``'42'``, ``'3e6'`` or ``'500e-3'``.
    """
    import math  # local import: keeps this block independent of the file's import section
    exponent = 0
    magnitude = abs(number)
    # Scaling only terminates for finite, non-zero values.
    if magnitude != 0 and math.isfinite(magnitude):
        while abs(number) < 1:  # small values: scale up
            number *= 1000
            exponent -= 3
        while abs(number) >= 1000:  # large values: scale down
            number /= 1000
            exponent += 3
    return f'{number:.0f}' + (f'e{exponent}' if exponent != 0 else '')
 def hr_time(seconds):
    """Formats a given time interval for human readability."""
@ -150,10 +138,10 @@ class Log:
        self._limit = limit
    def stop_limit(self):
        self._limit = -1
        if self.filtered > 0:
-            self.info(f'{self.filtered} more messages (filtered).')
+            log.info(f'{self.filtered} more messages (filtered).')
            self.filtered = 0
        self._limit = -1
    def __getstate__(self):
        return {'elapsed': time.perf_counter() - self.start}
@ -161,8 +149,6 @@ class Log:
    def __setstate__(self, state):
        self.logfile = sys.stdout
        self.indent = 0
        self._limit = -1
        self.filtered = 0
        self.start = time.perf_counter() - state['elapsed']
    def write(self, s, indent=0):
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@ -10,40 +10,20 @@ Circuit graphs also define an ordering of inputs, outputs and other nodes to eas
 """
 from __future__ import annotations
 from collections import deque, defaultdict
 import re
 from typing import Union
 import numpy as np
 class GrowingList(list):
    def __setitem__(self, index, value):
-        if value is None: self.has_nones = True
+        if index >= len(self):
-        if index == len(self): return super().append(value)
+            self.extend([None] * (index + 1 - len(self)))
        if index > len(self):
            super().extend([None] * (index + 1 - len(self)))
            self.has_nones = True
        super().__setitem__(index, value)
-    def __getitem__(self, index):
+    def free_index(self):
-        if isinstance(index, slice): return super().__getitem__(index)
+        return next((i for i, x in enumerate(self) if x is None), len(self))
        return super().__getitem__(index) if index < len(self) else None
    @property
    def free_idx(self):
        fi = len(self)
        if hasattr(self, 'has_nones') and self.has_nones:
            fi = next((i for i, x in enumerate(self) if x is None), len(self))
            self.has_nones = fi < len(self)
        return fi
    def without_nones(self):
        for item in self:
            if item is not None:
                yield item
 class IndexList(list):
@ -96,10 +76,10 @@ class Node:
        by allocating an array or list :code:`my_data` of length :code:`len(n.circuit.nodes)` and
        accessing it by :code:`my_data[n.index]` or simply by :code:`my_data[n]`.
        """
-        self.ins: GrowingList[Line] = GrowingList()
+        self.ins = GrowingList()
        """A list of input connections (:class:`Line` objects).
        """
-        self.outs: GrowingList[Line] = GrowingList()
+        self.outs = GrowingList()
        """A list of output connections (:class:`Line` objects).
        """
@ -155,7 +135,7 @@ class Line:
    Use the explicit case only if connections to specific pins are required.
    It may overwrite any previous line references in the connection list of the nodes.
    """
-    def __init__(self, circuit: Circuit, driver: Union[Node, tuple[Node, int]], reader: Union[Node, tuple[Node, int]]):
+    def __init__(self, circuit, driver, reader):
        self.circuit = circuit
        """The :class:`Circuit` object the line is part of.
        """
@ -167,7 +147,7 @@ class Line:
        by allocating an array or list :code:`my_data` of length :code:`len(l.circuit.lines)` and
        accessing it by :code:`my_data[l.index]` or simply by :code:`my_data[l]`.
        """
-        if not isinstance(driver, tuple): driver = (driver, driver.outs.free_idx)
+        if not isinstance(driver, tuple): driver = (driver, driver.outs.free_index())
        self.driver = driver[0]
        """The :class:`Node` object that drives this line.
        """
@ -177,7 +157,7 @@ class Line:
        This is the position in the list :py:attr:`Node.outs` of the driving node this line referenced from:
        :code:`self.driver.outs[self.driver_pin] == self`.
        """
-        if not isinstance(reader, tuple): reader = (reader, reader.ins.free_idx)
+        if not isinstance(reader, tuple): reader = (reader, reader.ins.free_index())
        self.reader = reader[0]
        """The :class:`Node` object that reads this line.
        """
@ -312,7 +292,7 @@ class Circuit:
    def _locs(self, prefix, nodes):
        d_top = dict()
        for i, n in enumerate(nodes):
-            if m := re.match(fr'({re.escape(prefix)}.*?)((?:[\d_\[\]])*$)', n.name):
+            if m := re.match(fr'({prefix}.*?)((?:[\d_\[\]])*$)', n.name):
                path = [m[1]] + [int(v) for v in re.split(r'[_\[\]]+', m[2]) if len(v) > 0]
                d = d_top
                for j in path[:-1]:
@ -354,16 +334,15 @@ class Circuit:
    def get_or_add_fork(self, name):
        return self.forks[name] if name in self.forks else Node(self, name)
-    def remove_dangling_nodes(self, root_node:Node, keep=[]):
+    def remove_dangling_nodes(self, root_node:Node):
        if len([l for l in root_node.outs if l is not None]) > 0: return
        lines = [l for l in root_node.ins if l is not None]
        drivers = [l.driver for l in lines]
        if root_node in keep: return
        root_node.remove()
        for l in lines:
            l.remove()
        for d in drivers:
-            self.remove_dangling_nodes(d, keep=keep)
+            self.remove_dangling_nodes(d)
    def eliminate_1to1_forks(self):
        """Removes all forks that drive only one node.
@ -391,21 +370,6 @@ class Circuit:
            in_line.reader_pin = out_reader_pin
            in_line.reader.ins[in_line.reader_pin] = in_line
    def remove_forks(self):
        """Removes all forks that are not I/O nodes and reconnects around them.

        For every such fork, the line at input pin 0 (if any) and all output lines
        are removed. If the fork had a driver, a new line is created from that
        driver to each former reader pin. Finally, the fork itself is removed.
        """
        protected = set(self.io_nodes)
        # Iterate over a snapshot, because forks are removed along the way.
        for fork in list(self.forks.values()):
            if fork in protected:
                continue
            source = None
            in_line = fork.ins[0]
            if in_line is not None:
                source = in_line.driver
                in_line.remove()
            for out_line in list(fork.outs):
                if out_line is None:
                    continue
                sink = (out_line.reader, out_line.reader_pin)
                out_line.remove()
                if source is not None:
                    Line(self, source, sink)
            fork.remove()
    def substitute(self, node, impl):
        """Replaces a given node with the given implementation circuit.
@ -464,7 +428,7 @@ class Circuit:
        for l, ll in zip(impl_out_lines, node_out_lines):  # connect outputs
            if ll is None:
                if l.driver in node_map:
-                    self.remove_dangling_nodes(node_map[l.driver], keep=ios)
+                    self.remove_dangling_nodes(node_map[l.driver])
                continue
            if len(l.reader.outs) > 0:  # output is also read by impl. circuit, connect to fork.
                ll.driver = node_map[l.reader]
@ -483,21 +447,6 @@ class Circuit:
            if n.kind in tlib.cells:
                self.substitute(n, tlib.cells[n.kind][0])
    def remove_constants(self):
        """Removes all constant nodes (kinds ``__const0__`` and ``__const1__``).

        * ``__const0__`` nodes are removed together with their output lines;
          unconnected inputs are defined 0.
        * ``__const1__`` nodes are removed and each of their readers is reconnected
          to one shared node ``__const1gen__`` of kind ``INV1``, which is created
          on first use.

        Nodes and output lines are iterated over snapshots and unused (``None``)
        output pins are skipped, in the same way as in :py:meth:`remove_forks`.
        """
        c1gen = None
        # Snapshot: nodes are removed (and possibly one is added) inside the loop.
        # NOTE(review): assumes Node()/Node.remove() modify self.nodes - confirm.
        for n in list(self.nodes):
            if n.kind == '__const0__':  # just remove, unconnected inputs are defined 0.
                for l in list(n.outs):
                    if l is None: continue  # unused output pin
                    l.remove()
                n.remove()
            elif n.kind == '__const1__':
                if c1gen is None: c1gen = Node(self, '__const1gen__', 'INV1')  # one unique const 1 generator
                for l in list(n.outs):
                    if l is None: continue  # unused output pin
                    r, rp = l.reader, l.reader_pin
                    l.remove()
                    Line(self, c1gen, (r, rp))
                n.remove()
    def copy(self):
        """Returns a deep copy of the circuit.
        """
@ -552,15 +501,14 @@ class Circuit:
        substrings 'dff' or 'latch' are yielded first.
        """
        visit_count = np.zeros(len(self.nodes), dtype=np.uint32)
-        start = set(n for n in self.nodes if len(n.ins) == 0 or 'dff' in n.kind.lower() or 'latch' in n.kind.lower())
+        queue = deque(n for n in self.nodes if len(n.ins) == 0 or 'dff' in n.kind.lower() or 'latch' in n.kind.lower())
        queue = deque(start)
        while len(queue) > 0:
            n = queue.popleft()
            for line in n.outs:
                if line is None: continue
                succ = line.reader
                visit_count[succ] += 1
-                if visit_count[succ] == len(succ.ins) and succ not in start:
+                if visit_count[succ] == len(succ.ins) and 'dff' not in succ.kind.lower() and 'latch' not in succ.kind.lower():
                    queue.append(succ)
            yield n
@ -615,21 +563,6 @@ class Circuit:
            if marks[n]:
                yield n
    def fanout(self, origin_nodes):
        """Generator function to iterate over the fan-out cone of a given list of origin nodes.
        Nodes are yielded in topological order.
        """
        marks = [False] * len(self.nodes)
        for n in origin_nodes:
            marks[n] = True
        for n in self.topological_order():
            if not marks[n]:
                for line in n.ins.without_nones():
                    marks[n] |= marks[line.driver]
            if marks[n]:
                yield n
    def fanout_free_regions(self):
        for stem in self.reversed_topological_order():
            if len(stem.outs) == 1 and 'dff' not in stem.kind.lower(): continue
--- a/src/kyupy/logic.py
+++ b/src/kyupy/logic.py
@ -241,8 +241,6 @@ def mv_latch(d, t, q_prev, out=None):
 def mv_transition(init, final, out=None):
    """Computes the logic transitions from the initial values of ``init`` to the final values of ``final``.
    Pulses in the input data are ignored. If any of the inputs are ``UNKNOWN``, the result is ``UNKNOWN``.
    If init is ``UNASSIGNED``, the result is the final value of ``final``.
    If final is ``UNASSIGNED``, the result is the initial value of ``init``.
    If both inputs are ``UNASSIGNED``, the result is ``UNASSIGNED``.
    :param init: A multi-valued array.
@ -253,9 +251,7 @@ def mv_transition(init, final, out=None):
    out = out or np.empty(np.broadcast(init, final).shape, dtype=np.uint8)
    out[...] = (init & 0b010) | (final & 0b001)
    out[...] |= ((out << 1) ^ (out << 2)) & 0b100
-    out[...] = np.choose(init == UNASSIGNED, [out, (final & 0b001) * ONE])
+    unknown = (init == UNKNOWN) | (init == UNASSIGNED) | (final == UNKNOWN) | (final == UNASSIGNED)
    out[...] = np.choose(final == UNASSIGNED, [out, ((init & 0b010) >> 1) * ONE])
    unknown = (init == UNKNOWN) | (final == UNKNOWN)
    unassigned = (init == UNASSIGNED) & (final == UNASSIGNED)
    np.putmask(out, unknown, UNKNOWN)
    np.putmask(out, unassigned, UNASSIGNED)
@ -269,18 +265,6 @@ def mv_to_bp(mva):
    return np.packbits(unpackbits(mva)[...,:3], axis=-2, bitorder='little').swapaxes(-1,-2)
 def mv_init(mva):
    """Returns the initial binary values for mva.

    The result is bit 1 of each value, gated by (bit 2 OR bit 0).
    For values 0..7 this yields 1 for the codes 3, 6 and 7, and 0 otherwise.
    """
    initial_bit = mva >> 1
    gate = (mva >> 2) | mva
    return initial_bit & gate & 1
 def mv_final(mva):
    """Returns the final binary value of mva.

    The result is bit 0 of each value, gated by (bit 2 OR bit 1).
    For values 0..7 this yields 1 for the codes 3, 5 and 7, and 0 otherwise.
    """
    gate = (mva >> 2) | (mva >> 1)
    return mva & gate & 1
 def bparray(*a):
    """Converts (lists of) Boolean values or strings into a bit-parallel array.
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@ -10,10 +10,9 @@ import math
 import numpy as np
-from . import numba, logic, hr_bytes, sim, eng, cdiv
+from . import numba, logic, hr_bytes, sim
 from .circuit import Circuit
 class LogicSim(sim.SimOps):
    """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.
@ -29,7 +28,7 @@ class LogicSim(sim.SimOps):
        self.m = m
        self.mdim = math.ceil(math.log2(m))
        self.sims = sims
-        nbytes = cdiv(sims, 8)
+        nbytes = (sims - 1) // 8 + 1
        self.c = np.zeros((self.c_len, self.mdim, nbytes), dtype=np.uint8)
        self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
@ -45,14 +44,14 @@ class LogicSim(sim.SimOps):
        self.s[:,:,1,:] = 255  # unassigned
    def __repr__(self):
-        return f'{{name: "{self.circuit.name}", sims: {self.sims}, m: {self.m}, c_bytes: {eng(self.c.nbytes)}}}'
+        return f'{{name: "{self.circuit.name}", sims: {self.sims}, m: {self.m}, c_bytes: {self.c.nbytes}}}'
    def s_to_c(self):
        """Copies the values from ``s[0]`` to the inputs of the combinational portion.
        """
        # Only the first ``mdim`` entries along the third axis of ``s`` are copied.
        self.c[self.pippi_c_locs] = self.s[0, self.pippi_s_locs, :self.mdim]
-    def c_prop(self, sims=None, inject_cb=None, flip_line=-1, flip_mask=None):
+    def c_prop(self, inject_cb=None):
        """Propagate the input values through the combinational circuit towards the outputs.
        Performs all logic operations in topological order.
@ -68,17 +67,10 @@ class LogicSim(sim.SimOps):
        t1 = self.c_locs[self.tmp2_idx]
        if self.m == 2:
            if inject_cb is None:
-                if flip_mask is None:
+                _prop_cpu(self.ops, self.c_locs, self.c)
                    flip_mask = np.full(self.c.shape[-1], 255, dtype=np.uint8)
                else:
                    if len(flip_mask) < self.c.shape[-1]:
                        flip_mask2 = np.full(self.c.shape[-1], 0, dtype=np.uint8)
                        flip_mask2[:len(flip_mask)] = flip_mask
                        flip_mask = flip_mask2
                _prop_cpu(self.ops, self.c_locs, self.c, int(flip_line), flip_mask)
            else:
-                for op, o0l, i0l, i1l, i2l, i3l in self.ops[:,:6]:
+                for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
-                    o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
+                    o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
                    if op == sim.BUF1: self.c[o0]=self.c[i0]
                    elif op == sim.INV1: self.c[o0] = ~self.c[i0]
                    elif op == sim.AND2: self.c[o0] = self.c[i0] & self.c[i1]
@ -113,10 +105,10 @@ class LogicSim(sim.SimOps):
                    elif op == sim.OAI211:self.c[o0] = ~((self.c[i0] | self.c[i1]) & self.c[i2] & self.c[i3])
                    elif op == sim.MUX21: self.c[o0] = (self.c[i0] & ~self.c[i2]) | (self.c[i1] & self.c[i2])
                    else: print(f'unknown op {op}')
-                    inject_cb(o0l, self.c[o0])
+                    inject_cb(o0, self.s[o0])
        elif self.m == 4:
-            for op, o0l, i0l, i1l, i2l, i3l in self.ops[:,:6]:
+            for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
-                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
+                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
                if op == sim.BUF1: self.c[o0]=self.c[i0]
                elif op == sim.INV1: logic.bp4v_not(self.c[o0], self.c[i0])
                elif op == sim.AND2: logic.bp4v_and(self.c[o0], self.c[i0], self.c[i1])
@ -189,10 +181,9 @@ class LogicSim(sim.SimOps):
                    logic.bp4v_and(self.c[t1], self.c[i1], self.c[i2])
                    logic.bp4v_or(self.c[o0], self.c[t0], self.c[t1])
                else: print(f'unknown op {op}')
                if inject_cb is not None: inject_cb(o0l, self.c[o0])
        else:
-            for op, o0l, i0l, i1l, i2l, i3l in self.ops[:,:6]:
+            for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
-                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
+                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
                if op == sim.BUF1: self.c[o0]=self.c[i0]
                elif op == sim.INV1: logic.bp8v_not(self.c[o0], self.c[i0])
                elif op == sim.AND2: logic.bp8v_and(self.c[o0], self.c[i0], self.c[i1])
@ -265,7 +256,7 @@ class LogicSim(sim.SimOps):
                    logic.bp8v_and(self.c[t1], self.c[i1], self.c[i2])
                    logic.bp8v_or(self.c[o0], self.c[t0], self.c[t1])
                else: print(f'unknown op {op}')
-                if inject_cb is not None: inject_cb(o0l, self.c[o0])
+                if inject_cb is not None: inject_cb(o0, self.s[o0])
    def c_to_s(self):
        """Copies (captures) the results of the combinational portion to ``s[1]``.
@ -305,9 +296,9 @@ class LogicSim(sim.SimOps):
@numba.njit
-def _prop_cpu(ops, c_locs, c, flip_line, flip_mask):
+def _prop_cpu(ops, c_locs, c):
-    for op, o0l, i0l, i1l, i2l, i3l in ops[:,:6]:
+    for op, o0, i0, i1, i2, i3 in ops[:,:6]:
-        o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
+        o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)]
        if op == sim.BUF1: c[o0]=c[i0]
        elif op == sim.INV1: c[o0] = ~c[i0]
        elif op == sim.AND2: c[o0] = c[i0] & c[i1]
@ -342,124 +333,3 @@ def _prop_cpu(ops, c_locs, c, flip_line, flip_mask):
        elif op == sim.OAI211: c[o0] = ~((c[i0] | c[i1]) & c[i2] & c[i3])
        elif op == sim.MUX21: c[o0] = (c[i0] & ~c[i2]) | (c[i1] & c[i2])
        else: print(f'unknown op {op}')
        if flip_line >= 0 and o0l == flip_line:
            #n = len(flip_mask)
            c[o0] = c[o0] ^ flip_mask
 class LogicSim6V(sim.SimOps):
    """A bit-parallel naïve combinational simulator for 6-valued logic.

    Signal values of the combinational portion are kept in ``c`` using three
    bit-planes per signal; the interface values are kept in ``s`` with one
    byte per simulation.

    :param circuit: The circuit to simulate.
    :param sims: The number of parallel logic simulations to perform.
    :param c_reuse: If True, intermediate signal values may get overwritten when not needed anymore to save memory.
    :param strip_forks: If True, forks are not included in the simulation model to save memory and simulation time.
    """
    def __init__(self, circuit: Circuit, sims: int = 8, c_reuse: bool = False, strip_forks: bool = False):
        super().__init__(circuit, c_reuse=c_reuse, strip_forks=strip_forks)
        self.sims = sims
        # One bit per simulation, packed into bytes.
        packed_len = cdiv(sims, 8)
        self.c = np.zeros(shape=(self.c_len, 3, packed_len), dtype=np.uint8)
        self.s = np.zeros(shape=(2, self.s_len, self.sims), dtype=np.uint8)
        """Logic values of the sequential elements (flip-flops) and ports.
        It is a pair of arrays in mv storage format:
        * ``s[0]`` Assigned values. Simulator will read (P)PI value from here.
        * ``s[1]`` Result values. Simulator will write (P)PO values here.
        Access this array to assign new values to the (P)PIs or read values from the (P)POs.
        """
    def __repr__(self):
        return f'{{name: "{self.circuit.name}", sims: {self.sims}, c_bytes: {eng(self.c.nbytes)}}}'
    def s_to_c(self):
        """Converts the values in ``s[0]`` to bit-parallel form and assigns them
        to the inputs of the combinational portion.
        """
        assigned = self.s[0, self.pippi_s_locs]
        self.c[self.pippi_c_locs] = logic.mv_to_bp(assigned)
    def c_prop(self):
        """Runs :py:func:`c_prop_cpu` over all operations of this simulator, updating ``c`` in place.
        """
        c_prop_cpu(self.ops, self.c, self.c_locs, self.tmp_idx, self.tmp2_idx)
    def c_to_s(self):
        """Captures the results of the combinational portion into ``s[1]``.
        """
        captured = logic.bp_to_mv(self.c[self.poppo_c_locs])
        # Drop the padding beyond the requested number of simulations.
        self.s[1, self.poppo_s_locs] = captured[:, :self.sims]
@numba.njit
 def c_prop_cpu(ops, c, c_locs, tmp_idx, tmp2_idx):
    """Evaluates all operations in ``ops`` in order, writing the results into ``c`` in place.

    Every signal is accessed as ``c[c_locs[x]]`` and is handled as three bit-planes
    (indices 0, 1 and 2). Inverting gates (INV1, NAND*, NOR*, XNOR2) are first
    computed like their non-inverting counterparts and then inverted at the end
    of the loop body. Operations not handled here are reported with a printed
    ``unknown op`` message.

    :param ops: Array of operations; the first six columns of each row are read as
        the op code, the output location index and four input location indices.
    :param c: Array of signal values; modified in place.
    :param c_locs: Maps location indices from ``ops`` to positions in ``c``.
    :param tmp_idx: Location index of the first scratch entry (used by MUX21).
    :param tmp2_idx: Location index of the second scratch entry (used by MUX21).
    """
    # Scratch entries for intermediate results of MUX21.
    t0 = c[c_locs[tmp_idx]]
    t1 = c[c_locs[tmp2_idx]]
    # XOR mask for inversion: flips all bits of planes 0 and 1, leaves plane 2 unchanged.
    inv_op = np.array([255, 255, 0], dtype=np.uint8)[np.newaxis, :, np.newaxis]
    for op, o0l, i0l, i1l, i2l, i3l in ops[:,:6]:
        # Resolve location indices to the actual entries of c.
        o0, i0, i1, i2, i3 = [c[c_locs[x]] for x in (o0l, i0l, i1l, i2l, i3l)]
        if op == sim.BUF1 or op == sim.INV1:
            o0[...] = i0
        # AND family: planes 0 and 1 are AND-ed bitwise. Plane 2 is set where one
        # input has plane 2 set and every other input has at least one plane set.
        elif op == sim.AND2 or op == sim.NAND2:
            o0[0] = i0[0] & i1[0]
            o0[1] = i0[1] & i1[1]
            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])|
                     i1[2]&(i0[0]|i0[1]|i0[2]))
        elif op == sim.AND3 or op == sim.NAND3:
            o0[0] = i0[0] & i1[0] & i2[0]
            o0[1] = i0[1] & i1[1] & i2[1]
            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2])|
                     i1[2]&(i0[0]|i0[1]|i0[2])&(i2[0]|i2[1]|i2[2])|
                     i2[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2]))
        elif op == sim.AND4 or op == sim.NAND4:
            o0[0] = i0[0] & i1[0] & i2[0] & i3[0]
            o0[1] = i0[1] & i1[1] & i2[1] & i3[1]
            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2])&(i3[0]|i3[1]|i3[2])|
                     i1[2]&(i0[0]|i0[1]|i0[2])&(i2[0]|i2[1]|i2[2])&(i3[0]|i3[1]|i3[2])|
                     i2[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2])&(i3[0]|i3[1]|i3[2])|
                     i3[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2]))
        # OR family: planes 0 and 1 are OR-ed bitwise. Plane 2 is set where one
        # input has plane 2 set and every other input has plane 0 clear, plane 1
        # clear, or plane 2 set.
        elif op == sim.OR2 or op == sim.NOR2:
            o0[0] = i0[0] | i1[0]
            o0[1] = i0[1] | i1[1]
            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])|
                     i1[2]&(~i0[0]|~i0[1]|i0[2]))
        elif op == sim.OR3 or op == sim.NOR3:
            o0[0] = i0[0] | i1[0] | i2[0]
            o0[1] = i0[1] | i1[1] | i2[1]
            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2])|
                     i1[2]&(~i0[0]|~i0[1]|i0[2])&(~i2[0]|~i2[1]|i2[2])|
                     i2[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2]))
        elif op == sim.OR4 or op == sim.NOR4:
            o0[0] = i0[0] | i1[0] | i2[0] | i3[0]
            o0[1] = i0[1] | i1[1] | i2[1] | i3[1]
            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2])&(~i3[0]|~i3[1]|i3[2])|
                     i1[2]&(~i0[0]|~i0[1]|i0[2])&(~i2[0]|~i2[1]|i2[2])&(~i3[0]|~i3[1]|i3[2])|
                     i2[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2])&(~i3[0]|~i3[1]|i3[2])|
                     i3[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2]))
        # XOR family: planes 0 and 1 are XOR-ed, plane 2 is the OR of both inputs' plane 2.
        elif op == sim.XOR2 or op == sim.XNOR2:
            o0[0] = i0[0] ^ i1[0]
            o0[1] = i0[1] ^ i1[1]
            o0[2] = i0[2] | i1[2]
        # MUX21 is composed from the NOT, AND and OR rules above using the scratch entries.
        elif op == sim.MUX21:
            # t1 = ~i2
            t1[...] = i2 ^ inv_op
            # t0 = i0 & t1
            t0[0] = i0[0] & t1[0]
            t0[1] = i0[1] & t1[1]
            t0[2] = (i0[2]&(t1[0]|t1[1]|t1[2])|
                     t1[2]&(i0[0]|i0[1]|i0[2]))
            # t1 = i1 & i2
            t1[0] = i1[0] & i2[0]
            t1[1] = i1[1] & i2[1]
            t1[2] = (i1[2]&(i2[0]|i2[1]|i2[2])|
                     i2[2]&(i1[0]|i1[1]|i1[2]))
            # o0 = t0 | t1
            o0[0] = t0[0] | t1[0]
            o0[1] = t0[1] | t1[1]
            o0[2] = (t0[2]&(~t1[0]|~t1[1]|t1[2])|
                     t1[2]&(~t0[0]|~t0[1]|t0[2]))
        else: print(f'unknown op {op}')
        # Inverting gates were computed as their non-inverting counterpart above;
        # apply the inversion mask to the result now.
        if (op == sim.INV1 or
            op == sim.NAND2 or op == sim.NAND3 or op == sim.NAND4 or
            op == sim.NOR2 or op == sim.NOR3 or op == sim.NOR4 or
            op == sim.XNOR2):
            o0[...] = o0 ^ inv_op
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@ -61,7 +61,6 @@ class DelayFile:
        delays = np.zeros((len(circuit.lines), 2, 2, 3))  # dataset last during construction.
        with log.limit(50):
        for name, iopaths in self.cells.items():
            name = name.replace('\\', '')
            if cell := circuit.cells.get(name, None):
@ -103,12 +102,11 @@ class DelayFile:
        delays = np.zeros((len(circuit.lines), 2, 2, 3))  # dataset last during construction.
        nonfork_annotations = 0
        for n1, n2, *delvals in self._interconnects:
            delvals = [d if len(d) > 0 else [0, 0, 0] for d in delvals]
            if max(max(delvals)) == 0: continue
-            cn1, pn1 = (n1, None) if (slash := n1.rfind('/')) < 0 else (n1[:slash], n1[slash+1:])
+            cn1, pn1 = n1.split('/') if '/' in n1 else (n1, None)
-            cn2, pn2 = (n2, None) if (slash := n2.rfind('/')) < 0 else (n2[:slash], n2[slash+1:])
+            cn2, pn2 = n2.split('/') if '/' in n2 else (n2, None)
            cn1 = cn1.replace('\\','')
            cn2 = cn2.replace('\\','')
            c1, c2 = circuit.cells[cn1], circuit.cells[cn2]
@ -121,27 +119,19 @@ class DelayFile:
                log.warn(f'No line to annotate pin {pn2} of {c2}')
                continue
            f1, f2 = c1.outs[p1].reader, c2.ins[p2].driver  # find the forks between cells.
            if f1 == c2 and f2 == c1:
                nonfork_annotations += 1
                if nonfork_annotations < 10:
                    log.warn(f'No fork between {c1.name}/{p1} and {c2.name}/{p2}, using {c2.name}/{p2}')
                line = c2.ins[p2]
            else:
            assert f1.kind == '__fork__'
            assert f2.kind == '__fork__'
-                if len(f2.outs) == 1:
+            if f1 != f2:  # at least two forks, make sure f2 is a branchfork connected to f1
-                    assert f1 == f2 or f1.outs[f2.ins[0].driver_pin] == f2.ins[0]
+                assert len(f2.outs) == 1
                assert f1.outs[f2.ins[0].driver_pin] == f2.ins[0]
                line = f2.ins[0]
            elif len(f2.outs) == 1:  # f1==f2, only OK when there is no fanout.
                line = f2.ins[0]
            else:
-                    nonfork_annotations += 1
+                log.warn(f'No branchfork to annotate interconnect delay {c1.name}/{p1}->{c2.name}/{p2}')
-                    if nonfork_annotations < 10:
+                continue
                        log.warn(f'No branchfork between {c1.name}/{p1} and {c2.name}/{p2}, using {c2.name}/{p2}')
                    line = c2.ins[p2]
            delays[line, :] = delvals
        if nonfork_annotations > 0:
            log.warn(f'{nonfork_annotations} interconnect annotations were moved to gate inputs due to missing forks.')
        return np.moveaxis(delays, -1, 0)
@ -166,10 +156,6 @@ class SdfTransformer(Transformer):
        entries = [e for a in args if hasattr(a, 'children') for e in a.children]
        return name, entries
    @staticmethod
    def cond(args):  # ignore conditions
        return args[1]
    @staticmethod
    def start(args):
        name = next((a for a in args if isinstance(a, str)), None)
@ -194,12 +180,9 @@ GRAMMAR = r"""
        | "(INSTANCE" ID? ")"
        | "(TIMINGCHECK" _ignore* ")"
        | delay )* ")"
-    delay: "(DELAY" "(ABSOLUTE" (interconnect | iopath | cond)* ")" ")"
+    delay: "(DELAY" "(ABSOLUTE" (interconnect | iopath)* ")" ")"
    interconnect: "(INTERCONNECT" ID ID triple* ")"
    iopath: "(IOPATH" ID_OR_EDGE ID_OR_EDGE triple* ")"
    cond: "(" "COND" cond_port_expr iopath ")"
    ?cond_port_expr: ID | "(" cond_port_expr ")" | cond_port_expr BINARY_OP cond_port_expr
    BINARY_OP: /&&/ | /==/
    NAME: /[^"]+/
    ID_OR_EDGE: ( /[^() ]+/ | "(" /[^)]+/ ")" )
    ID: ( /[^"() ]+/ | "\"" /[^"]+/ "\"" )
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@ -4,14 +4,9 @@ from bisect import bisect, insort_left
 import numpy as np
 from .circuit import Circuit
 BUF1 = np.uint16(0b1010_1010_1010_1010)
 INV1 = ~BUF1
 __const0__ = BUF1
 __const1__ = INV1
 AND2 = np.uint16(0b1000_1000_1000_1000)
 AND3 = np.uint16(0b1000_0000_1000_0000)
 AND4 = np.uint16(0b1000_0000_0000_0000)
@ -44,10 +39,7 @@ AOI211, OAI211 = ~AO211, ~OA211
 MUX21 = np.uint16(0b1100_1010_1100_1010)  # z = i1 if i2 else i0 (i2 is select)
-names = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16) and '__' not in k])
+names = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16)])
 prim2name = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16) and '__' not in k])
 name2prim = dict([(k, v) for k, v in globals().items() if isinstance(v, np.uint16)])
 kind_prefixes = {
    'nand': (NAND4, NAND3, NAND2),
@ -164,7 +156,7 @@ class SimOps:
    :param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
        memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
    """
-    def __init__(self, circuit: Circuit, c_caps=1, c_caps_min=1, a_ctrl=None, c_reuse=False, strip_forks=False):
+    def __init__(self, circuit, c_caps=1, c_caps_min=1, a_ctrl=None, c_reuse=False, strip_forks=False):
        self.circuit = circuit
        self.s_len = len(circuit.s_nodes)
@ -183,74 +175,84 @@ class SimOps:
        self.ppo_offset = self.ppi_offset + self.s_len
        self.c_locs_len = self.ppo_offset + self.s_len
-        # ALAP-toposort the circuit into self.ops
+        # translate circuit structure into self.ops
-        levels = []
+        ops = []
-
+        interface_dict = dict((n, i) for i, n in enumerate(circuit.s_nodes))
-        ppio2idx = dict((n, i) for i, n in enumerate(circuit.s_nodes))
+        for n in circuit.topological_order():
-        ppos = set([n for n in circuit.s_nodes if len(n.ins) > 0])
+            if n in interface_dict:
-        readers = np.array([1 if l.reader in ppos else len(l.reader.outs) for l in circuit.lines], dtype=np.int32)  # for ref-counting forks
+                inp_idx = self.ppi_offset + interface_dict[n]
-
+                if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
-        level_lines = [n.ins[0] for n in ppos]  # start from PPOs
+                    ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[0]]))
-        # FIXME: Should probably instanciate buffers for PPOs and attach DFF clocks
+                if 'dff' in n.kind.lower():  # second output of DFF is inverted
-
+                    if len(n.outs) > 1 and n.outs[1] is not None:
-        while len(level_lines) > 0:  # traverse the circuit level-wise back towards (P)PIs
+                        ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[1]]))
-            level_ops = []
+                else:  # if not DFF, no output is inverted.
-            prev_level_lines = []
+                    for o_line in n.outs[1:]:
-
+                        if o_line is not None:
-            for l in level_lines:
+                            ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[o_line]))
-                n = l.driver
+                continue
-                in_idxs = [n.ins[x].index if len(n.ins) > x and n.ins[x] is not None else self.zero_idx for x in [0,1,2,3]]
+            # regular node, not PI/PPI or PO/PPO
-                if n in ppio2idx:
+            o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
-                    in_idxs[0] = self.ppi_offset + ppio2idx[n]
+            i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx
-                    if l.driver_pin == 1 and 'dff' in n.kind.lower():  # second output of DFF is inverted
+            i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx
-                        level_ops.append((INV1, l.index, *in_idxs, *a_ctrl[l]))
+            i2_idx = n.ins[2].index if len(n.ins) > 2 and n.ins[2] is not None else self.zero_idx
-                    else:
+            i3_idx = n.ins[3].index if len(n.ins) > 3 and n.ins[3] is not None else self.zero_idx
                        level_ops.append((BUF1, l.index, *in_idxs, *a_ctrl[l]))
                elif n.kind == '__fork__':
                    readers[n.ins[0]] -= 1
                    if readers[n.ins[0]] == 0: prev_level_lines.append(n.ins[0])
                    if not strip_forks: level_ops.append((BUF1, l.index, *in_idxs, *a_ctrl[l]))
                else:
                    prev_level_lines += n.ins
                    sp = None
            kind = n.kind.lower()
            if kind == '__fork__':
                if not strip_forks:
                    for o_line in n.outs:
                        if o_line is not None:
                            ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o_line]))
                continue
            sp = None
            for prefix, prims in kind_prefixes.items():
                if kind.startswith(prefix):
                    sp = prims[0]
-                            if in_idxs[3] == self.zero_idx:
+                    if i3_idx == self.zero_idx:
                        sp = prims[1]
-                                if in_idxs[2] == self.zero_idx:
+                        if i2_idx == self.zero_idx:
                            sp = prims[2]
                    break
            if sp is None:
                print('unknown cell type', kind)
            else:
-                        level_ops.append((sp, l.index, *in_idxs, *a_ctrl[l]))
+                ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o0_idx]))
            if len(level_ops) > 0: levels.append(level_ops)
            level_lines = prev_level_lines
-        self.levels = [np.asarray(lv, dtype=np.int32) for lv in levels[::-1]]
+        self.ops = np.asarray(ops, dtype='int32')
        level_sums = np.cumsum([0]+[len(lv) for lv in self.levels], dtype=np.int32)
        self.level_starts, self.level_stops = level_sums[:-1], level_sums[1:]
        self.ops = np.vstack(self.levels)
        # create a map from fanout lines to stem lines for fork stripping
-        stems = np.full(self.c_locs_len, -1, dtype=np.int32)  # default to -1: 'no fanout line'
+        stems = np.zeros(self.c_locs_len, dtype='int32') - 1  # default to -1: 'no fanout line'
        if strip_forks:
            for f in circuit.forks.values():
                prev_line = f.ins[0]
                while prev_line.driver.kind == '__fork__':
                    prev_line = prev_line.driver.ins[0]
                stem_idx = prev_line.index
                for ol in f.outs:
                    if ol is not None:
-                        stems[ol] = prev_line.index
+                        stems[ol] = stem_idx
-
+
-        ref_count = np.zeros(self.c_locs_len, dtype=np.int32)
+        # calculate level (distance from PI/PPI) and reference count for each line
-
+        levels = np.zeros(self.c_locs_len, dtype='int32')
-        for op in self.ops:
+        ref_count = np.zeros(self.c_locs_len, dtype='int32')
-            for x in [2, 3, 4, 5]:
+        level_starts = [0]
-                ref_count[stems[op[x]] if stems[op[x]] >= 0 else op[x]] += 1
+        current_level = 1
        for i, op in enumerate(self.ops):
            # if we fork-strip, always take the stems for determining fan-in level
            i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
            i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
            i2_idx = stems[op[4]] if stems[op[4]] >= 0 else op[4]
            i3_idx = stems[op[5]] if stems[op[5]] >= 0 else op[5]
            if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level or levels[i2_idx] >= current_level or levels[i3_idx] >= current_level:
                current_level += 1
                level_starts.append(i)
            levels[op[1]] = current_level  # set level of the output line
            ref_count[i0_idx] += 1
            ref_count[i1_idx] += 1
            ref_count[i2_idx] += 1
            ref_count[i3_idx] += 1
        self.level_starts = np.asarray(level_starts, dtype='int32')
        self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
        # combinational signal allocation table. maps line and interface indices to self.c memory locations
        self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32)
@ -276,9 +278,9 @@ class SimOps:
                ref_count[i0_idx] += 1
        # allocate memory for the rest of the circuit
-        for ops in self.levels:
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
            free_set = set()
-            for op in ops:
+            for op in self.ops[op_start:op_stop]:
                # if we fork-strip, always take the stems
                i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
                i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
@ -297,7 +299,6 @@ class SimOps:
                self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
            if c_reuse:
                for loc in free_set:
                    if loc >= 0:  # DFF clocks are not allocated. Ignore for now.
                    h.free(loc)
        # copy memory location and capacity from stems to fanout lines
@ -310,15 +311,6 @@ class SimOps:
            if len(n.ins) > 0:
                self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]]
        # line use information
        self.line_use_start = np.full(self.c_locs_len, -1, dtype=np.int32)
        self.line_use_stop = np.full(self.c_locs_len, len(self.levels), dtype=np.int32)
        for i, lv in enumerate(self.levels):
            for op in lv:
                self.line_use_start[op[1]] = i
                for x in [2, 3, 4, 5]:
                    self.line_use_stop[op[x]] = i
        self.c_len = h.max_size
        d = defaultdict(int)
--- a/src/kyupy/stil.py
+++ b/src/kyupy/stil.py
@ -41,7 +41,7 @@ class StilFile:
                unload = {}
                for so_port in self.so_ports:
                    if so_port in call.parameters:
-                        unload[so_port] = call.parameters[so_port]
+                        unload[so_port] = call.parameters[so_port].replace('\n', '').replace('N', '-')
                if len(capture) > 0:
                    self.patterns.append(ScanPattern(sload, launch, capture, unload))
                    capture = {}
@ -49,9 +49,11 @@ class StilFile:
                sload = {}
                for si_port in self.si_ports:
                    if si_port in call.parameters:
-                        sload[si_port] = call.parameters[si_port]
+                        sload[si_port] = call.parameters[si_port].replace('\n', '').replace('N', '-')
-            if call.name.endswith('_launch'): launch = call.parameters
+            if call.name.endswith('_launch'):
-            if call.name.endswith('_capture'): capture = call.parameters
+                launch = dict((k, v.replace('\n', '').replace('N', '-')) for k, v in call.parameters.items())
            if call.name.endswith('_capture'):
                capture = dict((k, v.replace('\n', '').replace('N', '-')) for k, v in call.parameters.items())
    def _maps(self, c):
        interface = list(c.io_nodes) + [n for n in c.nodes if 'DFF' in n.kind]
@ -98,12 +100,12 @@ class StilFile:
        tests = np.full((len(interface), len(self.patterns)), logic.UNASSIGNED)
        for i, p in enumerate(self.patterns):
            for si_port in self.si_ports.keys():
-                pattern = logic.mvarray(p.load[si_port][0])
+                pattern = logic.mvarray(p.load[si_port])
                inversions = np.choose((pattern == logic.UNASSIGNED) | (pattern == logic.UNKNOWN),
                                       [scan_inversions[si_port], logic.ZERO]).astype(np.uint8)
                np.bitwise_xor(pattern, inversions, out=pattern)
                tests[scan_maps[si_port], i] = pattern
-            tests[pi_map, i] = logic.mvarray(p.capture['_pi'][0])
+            tests[pi_map, i] = logic.mvarray(p.capture['_pi'])
        return tests
    def tests_loc(self, circuit, init_filter=None, launch_filter=None):
@ -132,12 +134,12 @@ class StilFile:
        for i, p in enumerate(self.patterns):
            # init.set_values(i, '0' * len(interface))
            for si_port in self.si_ports.keys():
-                pattern = logic.mvarray(p.load[si_port][0])
+                pattern = logic.mvarray(p.load[si_port])
                inversions = np.choose((pattern == logic.UNASSIGNED) | (pattern == logic.UNKNOWN),
                                       [scan_inversions[si_port], logic.ZERO]).astype(np.uint8)
                np.bitwise_xor(pattern, inversions, out=pattern)
                init[scan_maps[si_port], i] = pattern
-            init[pi_map, i] = logic.mvarray(p.launch['_pi'][0] if '_pi' in p.launch else p.capture['_pi'][0])
+            init[pi_map, i] = logic.mvarray(p.launch['_pi'] if '_pi' in p.launch else p.capture['_pi'])
        if init_filter: init = init_filter(init)
        sim8v = LogicSim(circuit, init.shape[-1], m=8)
        sim8v.s[0] = logic.mv_to_bp(init)
@ -147,12 +149,12 @@ class StilFile:
        launch = logic.bp_to_mv(sim8v.s[1])[..., :init.shape[-1]]
        for i, p in enumerate(self.patterns):
            # if there was no launch cycle or launch clock, then init = launch
-            if '_pi' not in p.launch or 'P' not in p.launch['_pi'][0] or 'P' not in p.capture['_pi'][0]:
+            if '_pi' not in p.launch or 'P' not in p.launch['_pi'] or 'P' not in p.capture['_pi']:
                for si_port in self.si_ports.keys():
-                    pattern = logic.mv_xor(logic.mvarray(p.load[si_port][0]), scan_inversions[si_port])
+                    pattern = logic.mv_xor(logic.mvarray(p.load[si_port]), scan_inversions[si_port])
                    launch[scan_maps[si_port], i] = pattern
-            if '_pi' in p.capture and 'P' in p.capture['_pi'][0]:
+            if '_pi' in p.capture and 'P' in p.capture['_pi']:
-                launch[pi_map, i] = logic.mvarray(p.capture['_pi'][0])
+                launch[pi_map, i] = logic.mvarray(p.capture['_pi'])
            launch[po_map, i] = logic.UNASSIGNED
        if launch_filter: launch = launch_filter(launch)
@ -169,9 +171,9 @@ class StilFile:
        interface, _, po_map, scan_maps, scan_inversions = self._maps(circuit)
        resp = np.full((len(interface), len(self.patterns)), logic.UNASSIGNED)
        for i, p in enumerate(self.patterns):
-            resp[po_map, i] = logic.mvarray(p.capture['_po'][0] if len(p.capture) > 0 else p.launch['_po'][0])
+            resp[po_map, i] = logic.mvarray(p.capture['_po'] if len(p.capture) > 0 else p.launch['_po'])
            for so_port in self.so_ports.keys():
-                pattern = logic.mv_xor(logic.mvarray(p.unload[so_port][0]), scan_inversions[so_port])
+                pattern = logic.mv_xor(logic.mvarray(p.unload[so_port]), scan_inversions[so_port])
                resp[scan_maps[so_port], i] = pattern
        return resp
@ -190,7 +192,7 @@ class StilTransformer(Transformer):
    def call(args): return Call(args[0], dict(args[1:]))
    @staticmethod
-    def call_parameter(args): return args[0], (args[1].value.replace('\n', '').replace('N', '-'), args[1].start_pos)
+    def call_parameter(args): return args[0], args[1].value
    @staticmethod
    def signal_group(args): return args[0], args[1:]
--- a/src/kyupy/techlib.py
+++ b/src/kyupy/techlib.py
@ -11,6 +11,50 @@ from itertools import product
 from . import bench
 class TechLibOld:
    @staticmethod
    def pin_index(kind, pin):
        if isinstance(pin, int):
            return max(0, pin-1)
        if kind[:3] in ('OAI', 'AOI'):
            if pin[0] == 'A': return int(pin[1]) - 1
            if pin == 'B': return int(kind[3])
            if pin[0] == 'B': return int(pin[1]) - 1 + int(kind[3])
        for prefix, pins, index in [('HADD', ('B0', 'SO'), 1),
                                    ('HADD', ('A0', 'C1'), 0),
                                    ('MUX21', ('S', 'S0'), 2),
                                    ('MX2', ('S0',), 2),
                                    ('TBUF', ('OE',), 1),
                                    ('TINV', ('OE',), 1),
                                    ('LATCH', ('D',), 0),
                                    ('LATCH', ('QN',), 1),
                                    ('DFF', ('D',), 0),
                                    ('DFF', ('QN',), 1),
                                    ('SDFF', ('D',), 0),
                                    ('SDFF', ('QN',), 1),
                                    ('SDFF', ('CLK',), 3),
                                    ('SDFF', ('RSTB', 'RN'), 4),
                                    ('SDFF', ('SETB',), 5),
                                    ('ISOL', ('ISO',), 0),
                                    ('ISOL', ('D',), 1)]:
            if kind.startswith(prefix) and pin in pins: return index
        for index, pins in enumerate([('A1', 'IN1', 'A', 'S', 'INP', 'I', 'Q', 'QN', 'Y', 'Z', 'ZN'),
                                      ('A2', 'IN2', 'B', 'CK', 'CLK', 'CO', 'SE'),
                                      ('A3', 'IN3', 'C', 'RN', 'RSTB', 'CI', 'SI'),
                                      ('A4', 'IN4', 'D', 'SN', 'SETB'),
                                      ('A5', 'IN5', 'E'),
                                      ('A6', 'IN6', 'F')]):
            if pin in pins: return index
        raise ValueError(f'Unknown pin index for {kind}.{pin}')
    @staticmethod
    def pin_is_output(kind, pin):
        if isinstance(pin, int):
            return pin == 0
        if 'MUX' in kind and pin == 'S': return False
        return pin in ('Q', 'QN', 'Z', 'ZN', 'Y', 'CO', 'S', 'SO', 'C1')
 class TechLib:
    """Class for standard cell library definitions.
@ -49,14 +93,6 @@ class TechLib:
        assert pin in self.cells[kind][1], f'Unknown pin: {pin} for cell {kind}'
        return self.cells[kind][1][pin][0]
    def pin_name(self, kind, pos, output=False):
        """Returns the pin name for a given node kind, list position, and direction."""
        assert kind in self.cells, f'Unknown cell: {kind}'
        for name, (ppos, isout) in self.cells[kind][1].items():
            if isout == output and ppos == pos:
                return name
        return None
    def pin_is_output(self, kind, pin):
        """Returns True, if given pin name of a node kind is an output."""
        assert kind in self.cells, f'Unknown cell: {kind}'
@ -102,92 +138,21 @@ TLATX1   input(C,D)       output(Q,QN) Q=LATCH(D,C) QN=INV1(Q) ;
 """
-NANGATE = TechLib(r"""
+_nangate_common = r"""
 FILLTIE ;
 FILL_X{1,2,4,8,16} ;
 ANTENNA input(I)   ;
 TIEH output(Z) Z=__const1__() ;
 TIEL output(ZN) ZN=__const0__() ;
 BUF_X{1,2,4,8,12,16}  input(I) output(Z)  Z=BUF1(I)  ;
 INV_X{1,2,4,8,12,16}  input(I) output(ZN) ZN=INV1(I) ;
 CLKBUF_X{1,2,4,8,12,16} input(I) output(Z)  Z=BUF1(I)  ;
 CLKGATETST_X1 input(CLK,E,TE) output(Q) Q=OA21(CLK,E,TE) ;
 AND2_X{1,2}  input(A1,A2)       output(Z)  Z=AND2(A1,A2)         ;
 AND3_X{1,2}  input(A1,A2,A3)    output(Z)  Z=AND3(A1,A2,A3)      ;
 AND4_X{1,2}  input(A1,A2,A3,A4) output(Z)  Z=AND4(A1,A2,A3,A4)   ;
 NAND2_X{1,2} input(A1,A2)       output(ZN) ZN=NAND2(A1,A2)       ;
 NAND3_X{1,2} input(A1,A2,A3)    output(ZN) ZN=NAND3(A1,A2,A3)    ;
 NAND4_X{1,2} input(A1,A2,A3,A4) output(ZN) ZN=NAND4(A1,A2,A3,A4) ;
 OR2_X{1,2}   input(A1,A2)       output(Z)  Z=OR2(A1,A2)          ;
 OR3_X{1,2}   input(A1,A2,A3)    output(Z)  Z=OR3(A1,A2,A3)       ;
 OR4_X{1,2}   input(A1,A2,A3,A4) output(Z)  Z=OR4(A1,A2,A3,A4)    ;
 NOR2_X{1,2}  input(A1,A2)       output(ZN) ZN=NOR2(A1,A2)        ;
 NOR3_X{1,2}  input(A1,A2,A3)    output(ZN) ZN=NOR3(A1,A2,A3)     ;
 NOR4_X{1,2}  input(A1,A2,A3,A4) output(ZN) ZN=NOR4(A1,A2,A3,A4)  ;
 XOR2_X1      input(A1,A2)       output(Z)  Z=XOR2(A1,A2)         ;
 XNOR2_X1     input(A1,A2)       output(ZN) ZN=XNOR2(A1,A2)       ;
 MUX2_X1 input(I0,I1,S) output(Z) Z=MUX21(I0,I1,S) ;
 HA_X1 input(A,B) output(CO,S) CO=XOR2(A,B) S=AND2(A,B) ;
 FA_X1 input(A,B,CI) output(CO,S) AB=XOR2(A,B) CO=XOR2(AB,CI) S=AO22(CI,A,B) ;
 AOI21_X{1,2} input(A1,A2,B)     output(ZN) ZN=AOI21(A1,A2,B)     ;
 OAI21_X{1,2} input(A1,A2,B)     output(ZN) ZN=OAI21(A1,A2,B)     ;
 AOI22_X{1,2} input(A1,A2,B1,B2) output(ZN) ZN=AOI22(A1,A2,B1,B2) ;
 OAI22_X{1,2} input(A1,A2,B1,B2) output(ZN) ZN=OAI22(A1,A2,B1,B2) ;
 DFFRNQ_X1  input(D,RN,CLK)    output(Q)  DR=AND2(D,RN) Q=DFF(DR,CLK) ;
 DFFSNQ_X1  input(D,SN,CLK)    output(Q)  S=INV1(SN) DS=OR2(D,S) Q=DFF(DS,CLK) ;
 SDFFRNQ_X1 input(D,RN,SE,SI,CLK) output(Q) DR=AND2(D,RN) DI=MUX21(DR,SI,SE) Q=DFF(DI,CLK) ;
 SDFFSNQ_X1 input(D,SE,SI,SN,CLK) output(Q) S=INV1(SN) DS=OR2(D,S) DI=MUX21(DS,SI,SE) Q=DFF(DI,CLK) ;
 TBUF_X{1,2,4,8,12,16} input(EN,I)   output(Z)  Z=BUF1(I) ;
 LHQ_X1 input(D,E) output(Q)  Q=LATCH(D,E) ;
 """)
 """Nangate 15nm Open Cell Library (NanGate_15nm_OCL_v0.1_2014_06.A).
 """
 NANGATE45 = TechLib(r"""
 FILLCELL_X{1,2,4,8,16,32} ;
 ANTENNA_X1 input(A) ;
 LOGIC0_X1 output(Z) Z=__const0__() ;
 LOGIC1_X1 output(Z) Z=__const1__() ;
 BUF_X{1,2,4,8,16,32}  input(A) output(Z)  Z=BUF1(A)  ;
 INV_X{1,2,4,8,16,32}  input(A) output(ZN) ZN=INV1(A) ;
 CLKBUF_X{1,2,3}       input(A) output(Z)  Z=BUF1(A)  ;
 CLKGATETST_X{1,2,4,8} input(CK,E,SE) output(GCK) GCK=OA21(CK,E,SE) ;
 CLKGATE_X{1,2,4,8}    input(CK,E)    output(GCK) GCK=AND2(CK,E)    ;
 AND2_X{1,2,4}  input(A1,A2)       output(ZN) ZN=AND2(A1,A2)        ;
 AND3_X{1,2,4}  input(A1,A2,A3)    output(ZN) ZN=AND3(A1,A2,A3)     ;
 AND4_X{1,2,4}  input(A1,A2,A3,A4) output(ZN) ZN=AND4(A1,A2,A3,A4)  ;
 NAND2_X{1,2,4} input(A1,A2)       output(ZN) ZN=NAND2(A1,A2)       ;
 NAND3_X{1,2,4} input(A1,A2,A3)    output(ZN) ZN=NAND3(A1,A2,A3)    ;
 NAND4_X{1,2,4} input(A1,A2,A3,A4) output(ZN) ZN=NAND4(A1,A2,A3,A4) ;
 OR2_X{1,2,4}   input(A1,A2)       output(ZN) ZN=OR2(A1,A2)         ;
 OR3_X{1,2,4}   input(A1,A2,A3)    output(ZN) ZN=OR3(A1,A2,A3)      ;
 OR4_X{1,2,4}   input(A1,A2,A3,A4) output(ZN) ZN=OR4(A1,A2,A3,A4)   ;
 NOR2_X{1,2,4}  input(A1,A2)       output(ZN) ZN=NOR2(A1,A2)        ;
 NOR3_X{1,2,4}  input(A1,A2,A3)    output(ZN) ZN=NOR3(A1,A2,A3)     ;
 NOR4_X{1,2,4}  input(A1,A2,A3,A4) output(ZN) ZN=NOR4(A1,A2,A3,A4)  ;
 XOR2_X{1,2}    input(A,B)         output(Z)  Z=XOR2(A,B)           ;
 XNOR2_X{1,2}   input(A,B)         output(ZN) ZN=XNOR2(A,B)         ;
 MUX2_X{1,2} input(A,B,S) output(Z) Z=MUX21(A,B,S) ;
 HA_X1 input(A,B) output(CO,S) CO=XOR2(A,B) S=AND2(A,B) ;
 FA_X1 input(A,B,CI) output(CO,S) AB=XOR2(A,B) CO=XOR2(AB,CI) S=AO22(CI,A,B) ;
 AOI21_X{1,2,4} input(A,B1,B2)     output(ZN) ZN=AOI21(B1,B2,A)     ;
 OAI21_X{1,2,4} input(A,B1,B2)     output(ZN) ZN=OAI21(B1,B2,A)     ;
@ -197,6 +162,8 @@ OAI22_X{1,2,4} input(A1,A2,B1,B2) output(ZN) ZN=OAI22(A1,A2,B1,B2) ;
 OAI211_X{1,2,4} input(A,B,C1,C2) output(ZN) ZN=OAI211(C1,C2,A,B)   ;
 AOI211_X{1,2,4} input(A,B,C1,C2) output(ZN) ZN=AOI211(C1,C2,A,B)   ;
 MUX2_X{1,2} input(A,B,S) output(Z) Z=MUX21(A,B,S) ;
 AOI221_X{1,2,4} input(A,B1,B2,C1,C2) output(ZN) BC=AO22(B1,B2,C1,C2) ZN=NOR2(BC,A)  ;
 OAI221_X{1,2,4} input(A,B1,B2,C1,C2) output(ZN) BC=OA22(B1,B2,C1,C2) ZN=NAND2(BC,A) ;
@ -205,6 +172,14 @@ OAI222_X{1,2,4} input(A1,A2,B1,B2,C1,C2) output(ZN) BC=OA22(B1,B2,C1,C2) ZN=OAI2
 OAI33_X1 input(A1,A2,A3,B1,B2,B3) output(ZN) AA=OR2(A1,A2) BB=OR2(B1,B2) ZN=OAI22(AA,A3,BB,B3) ;
 HA_X1 input(A,B) output(CO,S) CO=XOR2(A,B) S=AND2(A,B) ;
 FA_X1 input(A,B,CI) output(CO,S) AB=XOR2(A,B) CO=XOR2(AB,CI) S=AO22(CI,A,B) ;
 CLKGATE_X{1,2,4,8} input(CK,E) output(GCK) GCK=AND2(CK,E) ;
 CLKGATETST_X{1,2,4,8} input(CK,E,SE) output(GCK) GCK=OA21(CK,E,SE) ;
 DFF_X{1,2}   input(D,CK)       output(Q,QN)  Q=DFF(D,CK) QN=INV1(Q) ;
 DFFR_X{1,2}  input(D,RN,CK)    output(Q,QN)  DR=AND2(D,RN) Q=DFF(DR,CK) QN=INV1(Q) ;
 DFFS_X{1,2}  input(D,SN,CK)    output(Q,QN)  S=INV1(SN) DS=OR2(D,S) Q=DFF(DS,CK) QN=INV1(Q) ;
@ -216,16 +191,43 @@ SDFFS_X{1,2}  input(D,SE,SI,SN,CK)    output(Q,QN)  S=INV1(SN) DS=OR2(D,S) DI=MU
 SDFFRS_X{1,2} input(D,RN,SE,SI,SN,CK) output(Q,QN)  S=INV1(SN) DS=OR2(D,S) DRS=AND2(DS,RN) DI=MUX21(DRS,SI,SE) Q=DFF(DI,CK) QN=INV1(Q) ;
 TBUF_X{1,2,4,8,16} input(A,EN)   output(Z)  Z=BUF1(A)    ;
 TINV_X1            input(I,EN)   output(ZN) ZN=INV1(I)   ;
 TLAT_X1            input(D,G,OE) output(Q)  Q=LATCH(D,G) ;
 DLH_X{1,2} input(D,G) output(Q)  Q=LATCH(D,G)            ;
 DLL_X{1,2} input(D,GN) output(Q) G=INV1(GN) Q=LATCH(D,G) ;
 """
 NANGATE = TechLib(_nangate_common + r"""
 INV_X{1,2,4,8,16,32}  input(I) output(ZN) ZN=INV1(I) ;
 AND2_X{1,2,4}  input(A1,A2)       output(Z)  Z=AND2(A1,A2)        ;
 AND3_X{1,2,4}  input(A1,A2,A3)    output(Z)  Z=AND3(A1,A2,A3)     ;
 AND4_X{1,2,4}  input(A1,A2,A3,A4) output(Z)  Z=AND4(A1,A2,A3,A4)  ;
 OR2_X{1,2,4}   input(A1,A2)       output(Z)  Z=OR2(A1,A2)         ;
 OR3_X{1,2,4}   input(A1,A2,A3)    output(Z)  Z=OR3(A1,A2,A3)      ;
 OR4_X{1,2,4}   input(A1,A2,A3,A4) output(Z)  Z=OR4(A1,A2,A3,A4)   ;
 XOR2_X{1,2}    input(A1,A2)       output(Z)  Z=XOR2(A1,A2)        ;
 XNOR2_X{1,2}   input(A1,A2)       output(ZN) ZN=XNOR2(A1,A2)      ;
 """)
 """An newer NANGATE-variant that uses 'Z' as output pin names for AND and OR gates.
 """
 NANGATE_ZN = TechLib(_nangate_common + r"""
 INV_X{1,2,4,8,16,32}  input(A) output(ZN) ZN=INV1(A) ;
 AND2_X{1,2,4}  input(A1,A2)       output(ZN) ZN=AND2(A1,A2)        ;
 AND3_X{1,2,4}  input(A1,A2,A3)    output(ZN) ZN=AND3(A1,A2,A3)     ;
 AND4_X{1,2,4}  input(A1,A2,A3,A4) output(ZN) ZN=AND4(A1,A2,A3,A4)  ;
 OR2_X{1,2,4}   input(A1,A2)       output(ZN) ZN=OR2(A1,A2)         ;
 OR3_X{1,2,4}   input(A1,A2,A3)    output(ZN) ZN=OR3(A1,A2,A3)      ;
 OR4_X{1,2,4}   input(A1,A2,A3,A4) output(ZN) ZN=OR4(A1,A2,A3,A4)   ;
 XOR2_X{1,2}    input(A,B)         output(Z)  Z=XOR2(A,B)           ;
 XNOR2_X{1,2}   input(A,B)         output(ZN) ZN=XNOR2(A,B)         ;
 """)
-"""Nangate 45nm Open Cell Library (NangateOpenCellLibrary_PDKv1_3_v2010_12).
+"""An older NANGATE-variant that uses 'ZN' as output pin names for AND and OR gates.
 This NANGATE-variant that uses 'ZN' as output pin names for AND and OR gates.
 """
--- a/src/kyupy/verilog.py
+++ b/src/kyupy/verilog.py
@ -123,9 +123,6 @@ class VerilogTransformer(Transformer):
        assignments = []
        for stmt in args[2:]:  # pass 1: instantiate cells and driven signals
            if isinstance(stmt, Instantiation):
                if stmt.type not in self.tlib.cells:
                    log.warn(f'Ignoring cell of unknown kind "{stmt.type}"')
                    continue
                n = Node(c, stmt.name, kind=stmt.type)
                for p, s in stmt.pins.items():
                    if self.tlib.pin_is_output(n.kind, p):
@ -144,8 +141,6 @@ class VerilogTransformer(Transformer):
                        c.io_nodes[positions[name]] = n
                    if sd.kind == 'input':
                        Line(c, n, Node(c, name))
        while len(assignments) > 0:
            more_assignments = []
        for target, source in assignments:  # pass 1.5: process signal assignments
            target_sigs = []
            if not isinstance(target, list): target = [target]
@ -172,13 +167,9 @@ class VerilogTransformer(Transformer):
                    cnode = Node(c, f'__const{s[3]}_{const_count}__', f'__const{s[3]}__')
                    const_count += 1
                    Line(c, cnode, Node(c, t))
                    else:
                        more_assignments.append((target, source))
            assignments = more_assignments
        for stmt in args[2:]:  # pass 2: connect signals to readers
            if isinstance(stmt, Instantiation):
                for p, s in stmt.pins.items():
                    if stmt.name not in c.cells: continue
                    n = c.cells[stmt.name]
                    if self.tlib.pin_is_output(n.kind, p): continue
                    if s.startswith("1'b"):
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@ -13,11 +13,10 @@ Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the deriv
 """
 import math
 from collections import defaultdict
 import numpy as np
-from . import log, numba, cuda, sim, cdiv, eng
+from . import numba, cuda, sim, cdiv
 TMAX = np.float32(2 ** 127)
@ -60,8 +59,8 @@ class WaveSim(sim.SimOps):
        self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
        self.delays[:, :delays.shape[1]] = delays
-        self.c = np.full((self.c_len, self.sims), TMAX, dtype=np.float32)
+        self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
-        self.s = np.zeros((11, self.s_len, self.sims), dtype=np.float32)
+        self.s = np.zeros((11, self.s_len, sims), dtype=np.float32)
        """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
        The first 3 values are read by :py:func:`s_to_c`.
@ -99,18 +98,12 @@ class WaveSim(sim.SimOps):
        self.simctl_int[0] = range(sims)  # unique seed for each sim by default, zero this to pick same delays for all sims.
        self.simctl_int[1] = 2  # random picking by default.
-        self.e = np.zeros((self.c_locs_len, sims, 2), dtype=np.uint8)  # aux data for each line and sim
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
        self.error_counts = np.zeros(self.s_len, dtype=np.uint32)  # number of capture errors by PPO
        self.lsts = np.zeros(self.s_len, dtype=np.float32)  # LST by PPO
        self.overflows = np.zeros(self.s_len, dtype=np.uint32)  # Overflows by PPO
        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.e, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
    def __repr__(self):
        dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU'
        return f'{{name: "{self.circuit.name}", device: "{dev}", sims: {self.sims}, ops: {len(self.ops)}, ' + \
-               f'levels: {len(self.level_starts)}, nbytes: {eng(self.nbytes)}}}'
+               f'levels: {len(self.level_starts)}, nbytes: {self.nbytes}}}'
    def s_to_c(self):
        """Transfers values of sequential elements and primary inputs to the combinational portion.
@ -123,7 +116,7 @@ class WaveSim(sim.SimOps):
        self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
        self.c[self.pippi_c_locs+2] = TMAX
-    def c_prop(self, sims=None, seed=1, delta=0):
+    def c_prop(self, sims=None, seed=1):
        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
@ -131,7 +124,7 @@ class WaveSim(sim.SimOps):
        """
        sims = min(sims or self.sims, self.sims)
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, seed, delta)
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
        """Simulates a capture operation at all sequential elements and primary outputs.
@ -159,7 +152,7 @@ class WaveSim(sim.SimOps):
        self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
-def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed, delta):
+def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
    overflows = int(0)
    lut = op[0]
@ -169,18 +162,6 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed, de
    c_idx = op[4]
    d_idx = op[5]
    input_epoch = (ebuf[a_idx, sim, 1]|
                   ebuf[b_idx, sim, 1]|
                   ebuf[c_idx, sim, 1]|
                   ebuf[d_idx, sim, 1])
    output_epoch = ebuf[z_idx, sim, 1]
    if (delta):
        if input_epoch == 0 and output_epoch == 0: return 0, 0
    out_changed = output_epoch
    if len(delays) > 1:
        if simctl_int[1] == 0:
            delays = delays[seed]
@ -225,25 +206,25 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed, de
        if a == current_t:
            a_cur += 1
            inputs ^= 1
-            thresh = delays[a_idx, a_cur & 1 ^ 1, z_val]
+            thresh = delays[a_idx, a_cur & 1, z_val]
            a = cbuf[a_mem + a_cur, sim] + delays[a_idx, a_cur & 1, z_val]
            next_t = cbuf[a_mem + a_cur, sim] + delays[a_idx, (a_cur & 1) ^ 1, z_val ^ 1]
        elif b == current_t:
            b_cur += 1
            inputs ^= 2
-            thresh = delays[b_idx, b_cur & 1 ^ 1, z_val]
+            thresh = delays[b_idx, b_cur & 1, z_val]
            b = cbuf[b_mem + b_cur, sim] + delays[b_idx, b_cur & 1, z_val]
            next_t = cbuf[b_mem + b_cur, sim] + delays[b_idx, (b_cur & 1) ^ 1, z_val ^ 1]
        elif c == current_t:
            c_cur += 1
            inputs ^= 4
-            thresh = delays[c_idx, c_cur & 1 ^ 1, z_val]
+            thresh = delays[c_idx, c_cur & 1, z_val]
            c = cbuf[c_mem + c_cur, sim] + delays[c_idx, c_cur & 1, z_val]
            next_t = cbuf[c_mem + c_cur, sim] + delays[c_idx, (c_cur & 1) ^ 1, z_val ^ 1]
        else:
            d_cur += 1
            inputs ^= 8
-            thresh = delays[d_idx, d_cur & 1 ^ 1, z_val]
+            thresh = delays[d_idx, d_cur & 1, z_val]
            d = cbuf[d_mem + d_cur, sim] + delays[d_idx, d_cur & 1, z_val]
            next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1]
@ -254,15 +235,13 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed, de
                or (current_t - previous_t) > thresh  # -OR- the generated hazard is wider than pulse threshold.
                ):
                if z_cur < (z_cap - 1):  # enough space in z_mem?
                    if delta and (cbuf[z_mem + z_cur, sim] != current_t):
                        out_changed = 1
                    cbuf[z_mem + z_cur, sim] = current_t
                    previous_t = current_t
                    z_cur += 1
                else:
                    overflows += 1
                    previous_t = cbuf[z_mem + z_cur - 1, sim]
                    z_cur -= 1
                    previous_t = cbuf[z_mem + z_cur, sim]
            else:
                z_cur -= 1
                previous_t = cbuf[z_mem + z_cur - 1, sim] if z_cur > 0 else TMIN
@ -276,23 +255,12 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed, de
        current_t = min(a, b, c, d)
    if delta and (cbuf[z_mem + z_cur, sim] != TMAX):
        out_changed = 1
    # generate or propagate overflow flag
    cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
    nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN))
    nfall = z_cur // 2
    e = int(((cbuf[z_mem, sim] == TMIN) << 1) & 2)  # initial value
    e |= z_val  # final value
    e |= (nrise + nfall)<<2  # number of transitions
    ebuf[z_idx, sim, 0] = e
    ebuf[z_idx, sim, 1] = input_epoch & out_changed
    return nrise, nfall
@ -300,11 +268,11 @@ wave_eval_cpu = numba.njit(_wave_eval)
@numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed, delta):
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed):
    for op_idx in range(op_start, op_stop):
        op = ops[op_idx]
        for sim in range(sim_start, sim_stop):
-            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed, delta)
+            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed)
            a_loc = op[6]
            a_wr = op[7]
            a_wf = op[8]
@ -377,18 +345,12 @@ class WaveSimCuda(WaveSim):
        self.delays = cuda.to_device(self.delays)
        self.simctl_int = cuda.to_device(self.simctl_int)
        self.abuf = cuda.to_device(self.abuf)
        self.e = cuda.to_device(self.e)
        self.error_counts = cuda.to_device(self.error_counts)
        self.lsts = cuda.to_device(self.lsts)
        self.overflows = cuda.to_device(self.overflows)
        self.aux = cuda.to_device(np.zeros(8*1024, dtype=np.int32))
        self._block_dim = (32, 16)
    def __getstate__(self):
        state = self.__dict__.copy()
-        del state['c']
+        state['c'] = np.array(self.c)
        state['s'] = np.array(self.s)
        state['ops'] = np.array(self.ops)
        state['c_locs'] = np.array(self.c_locs)
@ -396,16 +358,11 @@ class WaveSimCuda(WaveSim):
        state['delays'] = np.array(self.delays)
        state['simctl_int'] = np.array(self.simctl_int)
        state['abuf'] = np.array(self.abuf)
        state['e'] = np.array(self.e)
        state['error_counts'] = np.array(self.error_counts)
        state['lsts'] = np.array(self.lsts)
        state['overflows'] = np.array(self.overflows)
        state['aux'] = np.array(self.aux)
        return state
    def __setstate__(self, state):
        self.__dict__.update(state)
-        self.c = cuda.to_device(np.full((self.c_len, self.sims), TMAX, dtype=np.float32))
+        self.c = cuda.to_device(self.c)
        self.s = cuda.to_device(self.s)
        self.ops = cuda.to_device(self.ops)
        self.c_locs = cuda.to_device(self.c_locs)
@ -413,11 +370,6 @@ class WaveSimCuda(WaveSim):
        self.delays = cuda.to_device(self.delays)
        self.simctl_int = cuda.to_device(self.simctl_int)
        self.abuf = cuda.to_device(self.abuf)
        self.e = cuda.to_device(self.e)
        self.error_counts = cuda.to_device(self.error_counts)
        self.lsts = cuda.to_device(self.lsts)
        self.overflows = cuda.to_device(self.overflows)
        self.aux = cuda.to_device(self.aux)
    def s_to_c(self):
        grid_dim = self._grid_dim(self.sims, self.s_len)
@ -425,24 +377,14 @@ class WaveSimCuda(WaveSim):
    # Return a 2-tuple: x and y are each passed through cdiv together with the
    # matching entry of self._block_dim. cdiv is defined outside this view
    # (presumably ceiling division, i.e. blocks needed per axis -- confirm).
    def _grid_dim(self, x, y): return cdiv(x, self._block_dim[0]), cdiv(y, self._block_dim[1])
-    def c_prop(self, sims=None, seed=1, op_from=0, op_to=None, delta=0):
+    def c_prop(self, sims=None, seed=1):
        sims = min(sims or self.sims, self.sims)
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
            if op_from > op_start: continue
            if op_to is not None and op_to <= op_start: break
            grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0),
-                sims, self.delays, self.simctl_int, seed, delta)
+                sims, self.delays, self.simctl_int, seed)
        cuda.synchronize()
    # Launch wave_eval_gpu for the ops of a single level.
    #   level: index into self.level_starts / self.level_stops.
    #   sims:  number of simulations to process; None or 0 selects self.sims,
    #          and the value is capped at self.sims.
    #   seed, delta: handed to the kernel unchanged.
    # This method does not call cuda.synchronize() after the launch.
    def c_prop_level(self, level, sims=None, seed=1, delta=0):
        sims = min(sims or self.sims, self.sims)
        op_start = self.level_starts[level]
        op_stop = self.level_stops[level]
        # grid is sized from the simulation count and the number of ops in this level
        grid_dim = self._grid_dim(sims, op_stop - op_start)
        # int(0) fills the kernel's sim_start position, sims its sim_stop position
        wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
            sims, self.delays, self.simctl_int, seed, delta)
    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
        grid_dim = self._grid_dim(self.sims, self.s_len)
        wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset,
@ -452,77 +394,6 @@ class WaveSimCuda(WaveSim):
        grid_dim = self._grid_dim(self.sims, self.s_len)
        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)
    # Launch acc_error_counts_gpu on self.s with 256 threads per block and
    # cdiv(self.s_len, 256) blocks; the kernel receives self.error_counts as its
    # accumulation target.
    #   sims: number of simulations to scan; None or 0 selects self.sims, and the
    #         value is capped at self.sims.
    def acc_error_counts(self, sims=None):
        sims = min(sims or self.sims, self.sims)
        grid_dim = cdiv(self.s_len, 256)
        acc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)
    # Overwrite every entry of self.error_counts with 0 via in-place slice assignment.
    def reset_error_counts(self):
        self.error_counts[:] = 0
    # Return the contents of self.error_counts as a newly created NumPy array.
    def get_error_counts(self):
        return np.array(self.error_counts)
    # Launch acc_overflows_gpu on self.s with 256 threads per block and
    # cdiv(self.s_len, 256) blocks; the kernel receives self.overflows as its
    # accumulation target.
    #   sims: number of simulations to scan; None or 0 selects self.sims, and the
    #         value is capped at self.sims.
    def acc_overflows(self, sims=None):
        sims = min(sims or self.sims, self.sims)
        grid_dim = cdiv(self.s_len, 256)
        acc_overflows_gpu[grid_dim, 256](self.s, sims, self.overflows)
    # Overwrite every entry of self.overflows with 0 via in-place slice assignment.
    def reset_overflows(self):
        self.overflows[:] = 0
    # Return the contents of self.overflows as a newly created NumPy array.
    def get_overflows(self):
        return np.array(self.overflows)
    # Launch acc_lsts_gpu on self.s with 256 threads per block and
    # cdiv(self.s_len, 256) blocks; the kernel receives self.lsts as the array
    # it updates.
    #   sims: number of simulations to scan; None or 0 selects self.sims, and the
    #         value is capped at self.sims.
    def acc_lsts(self, sims=None):
        sims = min(sims or self.sims, self.sims)
        grid_dim = cdiv(self.s_len, 256)
        acc_lsts_gpu[grid_dim, 256](self.s, sims, self.lsts)
    # Overwrite every entry of self.lsts with 0.0 via in-place slice assignment.
    def reset_lsts(self):
        self.lsts[:] = 0.0
    # Return the contents of self.lsts as a newly created NumPy array.
    def get_lsts(self):
        return np.array(self.lsts)
# CUDA kernel: copy the first nitems elements of src into dst, addressing both
# through their .flat view. Each thread starts at its global 1-D index and
# advances by the total number of launched threads, so any launch size covers
# all nitems elements.
@cuda.jit()
 def memcpy_gpu (src, dst, nitems):
    tid = cuda.grid(1)
    # total thread count of the launch along x
    stride = cuda.gridDim.x * cuda.blockDim.x
    for i in range(tid, nitems, stride):
        dst.flat[i] = src.flat[i]
# CUDA kernel: one thread per index x along axis 1 of s. Counts the simulations
# i in [0, sims) for which s[6, x, i] differs from s[8, x, i] and adds that
# count to error_counts[x].
# NOTE(review): what rows 6 and 8 of s hold is not visible here -- confirm
# against the layout of s.
@cuda.jit()
 def acc_error_counts_gpu(s, sims, error_counts):
    x = cuda.grid(1)
    # threads past the end of axis 1 have nothing to do
    if x >= s.shape[1]: return
    cnt = 0
    for i in range(sims):
        cnt += (s[6,x,i] != s[8,x,i])
    error_counts[x] += cnt
# CUDA kernel: one thread per index x along axis 1 of s. Sums s[10, x, i] over
# the simulations i in [0, sims) and adds the sum to overflows[x].
# NOTE(review): row 10 of s is presumably an overflow count -- confirm against
# the layout of s.
@cuda.jit()
 def acc_overflows_gpu(s, sims, overflows):
    x = cuda.grid(1)
    # threads past the end of axis 1 have nothing to do
    if x >= s.shape[1]: return
    cnt = 0
    for i in range(sims):
        cnt += s[10,x,i]
    overflows[x] += cnt
# CUDA kernel: one thread per index x along axis 1 of s. Takes the maximum of
# s[5, x, i] over the simulations i in [0, sims) and raises lsts[x] to that
# value if it is larger than what lsts[x] already holds.
# NOTE(review): what row 5 of s holds is not visible here -- confirm against
# the layout of s.
@cuda.jit()
 def acc_lsts_gpu(s, sims, lsts):
    x = cuda.grid(1)
    # threads past the end of axis 1 have nothing to do
    if x >= s.shape[1]: return
    # the running maximum starts at 0, so values below 0 never affect the result
    lst = 0
    for i in range(sims):
        lst = max(lst, s[5,x,i])
    lsts[x] = max(lsts[x], lst)
@cuda.jit()
 def wave_assign_gpu(c, s, c_locs, ppi_offset):
@ -552,7 +423,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
@cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed, delta):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed):
    x, y = cuda.grid(2)
    sim = sim_start + x
    op_idx = op_start + y
@ -564,7 +435,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_
    a_wr = op[7]
    a_wf = op[8]
-    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed, delta)
+    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed)
    # accumulate WSA into abuf
    if a_loc >= 0:
--- a/tests/b15_4ig.sa_rf.stil.gz
+++ b/tests/b15_4ig.sa_rf.stil.gz
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -13,44 +13,8 @@ def b15_2ig_circuit(mydir):
    from kyupy.techlib import SAED32
    return verilog.load(mydir / 'b15_2ig.v.gz', branchforks=True, tlib=SAED32)
# Session-scoped fixture: takes a copy of the b15_2ig_circuit fixture, calls
# resolve_tlib_cells(SAED32) on that copy and returns it.
@pytest.fixture(scope='session')
 def b15_2ig_circuit_resolved(b15_2ig_circuit):
    from kyupy.techlib import SAED32
    # work on a copy; resolve_tlib_cells is called for its effect on cr
    cr = b15_2ig_circuit.copy()
    cr.resolve_tlib_cells(SAED32)
    return cr
# Session-scoped fixture: loads 'b15_4ig.v.gz' from the directory given by the
# mydir fixture via verilog.load, with branchforks=True and the SAED32 library.
@pytest.fixture(scope='session')
 def b15_4ig_circuit(mydir):
    from kyupy import verilog
    from kyupy.techlib import SAED32
    return verilog.load(mydir / 'b15_4ig.v.gz', branchforks=True, tlib=SAED32)
# Session-scoped fixture: takes a copy of the b15_4ig_circuit fixture, calls
# resolve_tlib_cells(SAED32) on that copy and returns it.
@pytest.fixture(scope='session')
 def b15_4ig_circuit_resolved(b15_4ig_circuit):
    from kyupy.techlib import SAED32
    # work on a copy; resolve_tlib_cells is called for its effect on cr
    cr = b15_4ig_circuit.copy()
    cr.resolve_tlib_cells(SAED32)
    return cr
# Session-scoped fixture: loads 'b15_2ig.sdf.gz' from mydir, computes its
# iopaths for b15_2ig_circuit with the SAED32 library, and returns the slice
# [1:2] of that result.
# NOTE(review): what index 1 of the iopaths result selects is not visible here.
@pytest.fixture(scope='session')
 def b15_2ig_delays(mydir, b15_2ig_circuit):
    from kyupy import sdf
    from kyupy.techlib import SAED32
    return sdf.load(mydir / 'b15_2ig.sdf.gz').iopaths(b15_2ig_circuit, tlib=SAED32)[1:2]
# Session-scoped fixture: loads 'b15_2ig.sa_nf.stil.gz' from mydir and returns
# the tuple (tests, resp) obtained from its tests() and responses() for the
# resolved b15_2ig circuit. Index 0 along the second axis is dropped from both.
# NOTE(review): why that first entry is skipped is not visible here.
@pytest.fixture(scope='session')
 def b15_2ig_sa_nf_test_resp(mydir, b15_2ig_circuit_resolved):
    from kyupy import stil
    s = stil.load(mydir / 'b15_2ig.sa_nf.stil.gz')
    tests = s.tests(b15_2ig_circuit_resolved)[:,1:]
    resp = s.responses(b15_2ig_circuit_resolved)[:,1:]
    return (tests, resp)
# Session-scoped fixture: loads 'b15_4ig.sa_rf.stil.gz' from mydir and returns
# the tuple (tests, resp) obtained from its tests() and responses() for the
# resolved b15_4ig circuit. Index 0 along the second axis is dropped from both.
# NOTE(review): why that first entry is skipped is not visible here.
@pytest.fixture(scope='session')
 def b15_4ig_sa_rf_test_resp(mydir, b15_4ig_circuit_resolved):
    from kyupy import stil
    s = stil.load(mydir / 'b15_4ig.sa_rf.stil.gz')
    tests = s.tests(b15_4ig_circuit_resolved)[:,1:]
    resp = s.responses(b15_4ig_circuit_resolved)[:,1:]
    return (tests, resp)
--- a/tests/gates.sdf
+++ b/tests/gates.sdf
@ -7,49 +7,22 @@
 (TEMPERATURE 25.00:25.00:25.00)
 (TIMESCALE 1ns)
 (CELL
-  (CELLTYPE "NAND2_X1")
+  (CELLTYPE "NAND2X1")
  (INSTANCE nandgate)
  (DELAY
    (ABSOLUTE
-    (IOPATH A1 ZN (0.099:0.103:0.103) (0.122:0.127:0.127))
+    (IOPATH IN1 QN (0.099:0.103:0.103) (0.122:0.127:0.127))
-    (IOPATH A2 ZN (0.083:0.086:0.086) (0.100:0.104:0.104))
+    (IOPATH IN2 QN (0.083:0.086:0.086) (0.100:0.104:0.104))
    )
  )
 )
 (CELL
-  (CELLTYPE "AND2_X1")
+  (CELLTYPE "AND2X1")
  (INSTANCE andgate)
  (DELAY
    (ABSOLUTE
-    (IOPATH A1 ZN (0.367:0.378:0.378) (0.351:0.377:0.377))
+    (IOPATH IN1 Q (0.367:0.378:0.378) (0.351:0.377:0.377))
-    (IOPATH A2 ZN (0.366:0.375:0.375) (0.359:0.370:0.370))
+    (IOPATH IN2 Q (0.366:0.375:0.375) (0.359:0.370:0.370))
    )
  )
 )
 (CELL
  (CELLTYPE "OAI21_X1")
  (INSTANCE oai21gate)
  (DELAY 
    (ABSOLUTE
    (IOPATH B1 ZN (0.000:0.025:0.025) (0.000:0.013:0.013))
    (IOPATH B2 ZN (0.000:0.030:0.030) (0.000:0.016:0.016))
    ( COND (B1 == 1'b0) && (B2 == 1'b1) (IOPATH A ZN (0.000:0.018:0.018)))
    (COND (B1 == 1'b1) && (B2 == 1'b0) (IOPATH A ZN (0.000:0.018:0.018) (0.000:0.016:0.016)))
    (COND (B1 == 1'b1) && (B2 == 1'b1) (IOPATH A ZN (0.000:0.019:0.019) (0.000:0.014:0.014)))
    )
  )
 )
 (CELL
  (CELLTYPE "MUX2_X1")
  (INSTANCE mux2gate)
  (DELAY 
    (ABSOLUTE
    (COND (B == 1'b0) && (S == 1'b0) (IOPATH A Z (0.000:0.037:0.037) (0.000:0.058:0.058)))
    (COND (B == 1'b1) && (S == 1'b0) (IOPATH A Z (0.000:0.037:0.037) (0.000:0.058:0.058)))
    (COND (A == 1'b0) && (S == 1'b1) (IOPATH B Z (0.000:0.035:0.035) (0.000:0.056:0.056)))
    (COND (A == 1'b1) && (S == 1'b1) (IOPATH B Z (0.000:0.035:0.035) (0.000:0.056:0.056)))
    (COND (A == 1'b0) && (B == 1'b1) (IOPATH S Z (0.000:0.047:0.047) (0.000:0.073:0.073)))
    (COND (A == 1'b1) && (B == 1'b0) (IOPATH S Z (0.000:0.072:0.072) (0.000:0.064:0.064)))
    )
  )
 )
--- a/tests/gates.v
+++ b/tests/gates.v
@ -1,15 +1,11 @@
-module gates (a, b, c, o0, o1, o2, o3 );
+module gates (a, b, o0, o1 );
 input a;
 input b;
 input c;
 output o0;
 output o1;
 output o2;
 output o3;
-AND2_X1 andgate (.A1 ( a ) , .A2 ( b ) , .ZN ( o0 ) ) ;
+AND2X1 andgate (.IN1 ( a ) , .IN2 ( b ) , .Q ( o0 ) ) ;
-NAND2_X1 nandgate (.A1 ( a ) , .A2 ( b ) , .ZN ( o1 ) ) ;
+NAND2X1 nandgate (.IN1 ( a ) , .IN2 ( b ) , .QN ( o1 ) ) ;
-OAI21_X1 oai21gate (.B1(a), .B2(b), .A(c), .ZN(o2) ) ;
+
 MUX2_X1 mux2gate (.A(a), .B(b), .S(c), .Z(o3)) ;
 endmodule
--- a/tests/test_circuit.py
+++ b/tests/test_circuit.py
@ -1,30 +1,9 @@
 import pickle
-from kyupy.circuit import GrowingList, Circuit, Node, Line
+from kyupy.circuit import Circuit, Node, Line
 from kyupy import verilog, bench
 from kyupy.techlib import SAED32
 # Checks GrowingList.free_idx after a sequence of item assignments, None
 # assignments and an append. The expected values are consistent with free_idx
 # being the lowest index that holds no item (or the length when there is no
 # gap) -- confirm against GrowingList itself.
 def test_growing_list():
    gl = GrowingList()
    assert gl.free_idx == 0
    gl[0] = 1
    assert gl.free_idx == 1
    # writing past the first free slot leaves free_idx where it was
    gl[2] = 1
    assert gl.free_idx == 1
    # assigning None to a slot makes it count as free again
    gl[0] = None
    assert gl.free_idx == 0
    gl[0] = 1
    assert gl.free_idx == 1
    # slots 0..2 are now all occupied
    gl[1] = 1
    assert gl.free_idx == 3
    gl.append(1)
    assert gl.free_idx == 4
    gl[2] = None
    assert gl.free_idx == 2
    # with slot 2 refilled and slot 1 cleared, the lower index is reported
    gl[2] = 1
    gl[1] = None
    assert gl.free_idx == 1
 def test_lines():
    c = Circuit()
    n1 = Node(c, 'n1')
--- a/tests/test_logic_sim.py
+++ b/tests/test_logic_sim.py
@ -1,6 +1,6 @@
 import numpy as np
-from kyupy.logic_sim import LogicSim, LogicSim6V
+from kyupy.logic_sim import LogicSim
 from kyupy import bench, logic, sim
 from kyupy.logic import mvarray, bparray, bp_to_mv, mv_to_bp
@ -94,30 +94,6 @@ def test_4v():
        '--0XX', '--X1X', '--XXX', '--XXX'))
 # Runs LogicSim6V with 36 simulations on a small circuit with inputs x, y and
 # outputs a=AND2, o=OR2, n=INV1(x), xo=XOR2, no=NOR2, and compares the
 # simulated responses with a hand-written table.
 def test_6v():
    c = bench.parse('input(x, y) output(a, o, n, xo, no) a=AND2(x,y) o=OR2(x,y) n=INV1(x) xo=XOR2(x,y) no=NOR2(x,y)')
    s = LogicSim6V(c, 36)
    assert s.s_len == 7
    # 36 strings of 7 symbols each, in the order x, y, a, o, n, xo, no
    mva = mvarray(
        '0000101', '0101110', '0R0R1RF', '0F0F1FR', '0P0P1PN', '0N0N1NP',
        '1001010', '1111000', '1RR10F0', '1FF10R0', '1PP10N0', '1NN10P0',
        'R00RFRF', 'R1R1FF0', 'RRRRFPF', 'RFPNFNP', 'RPPRFRF', 'RNRNFFP',
        'F00FRFR', 'F1F1RR0', 'FRPNRNP', 'FFFFRPR', 'FPPFRFR', 'FNFNRRP',
        'P00PNPN', 'P1P1NN0', 'PRPRNRF', 'PFPFNFR', 'PPPPNPN', 'PNPNNNP',
        'N00NPNP', 'N1N1PP0', 'NRRNPFP', 'NFFNPRP', 'NPPNPNP', 'NNNNPPP')
    # stimulus: keep the first two entries along axis 0 (x and y), set the rest to ZERO
    tests = np.copy(mva)
    tests[2:] = logic.ZERO
    s.s[0] = tests
    s.s_to_c()
    s.c_prop()
    s.c_to_s()
    resp = s.s[1].copy()
    # expectation: the table with its first two entries along axis 0 set to ZERO
    exp_resp = np.copy(mva)
    exp_resp[:2] = logic.ZERO
    np.testing.assert_allclose(resp, exp_resp)
 def test_8v():
    c = bench.parse('input(x, y) output(a, o, n, xo) a=and(x,y) o=or(x,y) n=not(x) xo=xor(x,y)')
    s = LogicSim(c, 64, m=8)
@ -197,64 +173,3 @@ def test_b01(mydir):
    s.c_prop()
    s.c_to_s()
    bp_to_mv(s.s[1])
 # Simulates the given tests on circuit c with LogicSim and asserts that the
 # result agrees with the expected responses.
 #   c:         circuit handed to LogicSim.
 #   test_resp: pair (tests, resp); tests.shape[1] is used as the number of sims.
 #   m:         forwarded to LogicSim as m.
 # A position counts as a mismatch only where resp is logic.ONE or logic.ZERO
 # and the simulated value differs; other expected values are not checked.
 def sim_and_compare(c, test_resp, m=8):
    tests, resp = test_resp
    lsim = LogicSim(c, m=m, sims=tests.shape[1])
    lsim.s[0] = logic.mv_to_bp(tests)
    lsim.s_to_c()
    lsim.c_prop()
    lsim.c_to_s()
    # convert back and keep only as many columns as there are tests
    resp_sim = logic.bp_to_mv(lsim.s[1])[:,:tests.shape[1]]
    idxs, pats = np.nonzero(((resp == logic.ONE) & (resp_sim != logic.ONE)) | ((resp == logic.ZERO) & (resp_sim != logic.ZERO)))
    # report at most the first 10 mismatches, then an ellipsis line
    for i, (idx, pat) in enumerate(zip(idxs, pats)):
        if i >= 10:
            print(f'...')
            break
        print(f'mismatch pattern:{pat} ppio:{idx} exp:{logic.mv_str(resp[idx,pat])} act:{logic.mv_str(resp_sim[idx,pat])}')
    assert len(idxs) == 0
 # Simulates the given tests on circuit c with LogicSim6V and asserts that the
 # result agrees with the expected responses.
 #   c:         circuit handed to LogicSim6V.
 #   test_resp: pair (tests, resp); tests.shape[1] is used as the number of sims.
 # tests is written to lsim.s[0] and lsim.s[1] is read back directly, with no
 # mv_to_bp / bp_to_mv conversion.
 # A position counts as a mismatch only where resp is logic.ONE or logic.ZERO
 # and the simulated value differs; other expected values are not checked.
 def sim_and_compare_6v(c, test_resp):
    tests, resp = test_resp
    lsim = LogicSim6V(c, sims=tests.shape[1])
    lsim.s[0] = tests
    lsim.s_to_c()
    lsim.c_prop()
    lsim.c_to_s()
    resp_sim = lsim.s[1]
    idxs, pats = np.nonzero(((resp == logic.ONE) & (resp_sim != logic.ONE)) | ((resp == logic.ZERO) & (resp_sim != logic.ZERO)))
    # report at most the first 10 mismatches, then an ellipsis line
    for i, (idx, pat) in enumerate(zip(idxs, pats)):
        if i >= 10:
            print(f'...')
            break
        print(f'mismatch pattern:{pat} ppio:{idx} exp:{logic.mv_str(resp[idx,pat])} act:{logic.mv_str(resp_sim[idx,pat])}')
    assert len(idxs) == 0
 # Runs sim_and_compare on the resolved b15_2ig circuit and its sa_nf
 # test/response fixture with m=2.
 def test_b15_2ig_sa_2v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
    sim_and_compare(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp, m=2)
 # Runs sim_and_compare on the resolved b15_2ig circuit and its sa_nf
 # test/response fixture with m=4.
 def test_b15_2ig_sa_4v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
    sim_and_compare(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp, m=4)
 # Runs sim_and_compare_6v (the LogicSim6V variant) on the resolved b15_2ig
 # circuit and its sa_nf test/response fixture.
 def test_b15_2ig_sa_6v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
    sim_and_compare_6v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp)
 # Runs sim_and_compare on the resolved b15_2ig circuit and its sa_nf
 # test/response fixture with m=8.
 def test_b15_2ig_sa_8v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
    sim_and_compare(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp, m=8)
 # Runs sim_and_compare on the resolved b15_4ig circuit and its sa_rf
 # test/response fixture with m=2.
 def test_b15_4ig_sa_2v(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp):
    sim_and_compare(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp, m=2)
 # Runs sim_and_compare on the resolved b15_4ig circuit and its sa_rf
 # test/response fixture with m=4.
 def test_b15_4ig_sa_4v(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp):
    sim_and_compare(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp, m=4)
 # Runs sim_and_compare on the resolved b15_4ig circuit and its sa_rf
 # test/response fixture with m=8.
 def test_b15_4ig_sa_8v(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp):
    sim_and_compare(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp, m=8)
--- a/tests/test_sdf.py
+++ b/tests/test_sdf.py
@ -2,7 +2,7 @@ import numpy as np
 from kyupy import sdf, verilog, bench
 from kyupy.wave_sim import WaveSim, TMAX, TMIN
-from kyupy.techlib import SAED32, NANGATE45
+from kyupy.techlib import SAED32, SAED90
 def test_parse():
    test = '''
@ -80,9 +80,9 @@ def test_b15(mydir):
 def test_gates(mydir):
-    c = verilog.load(mydir / 'gates.v', tlib=NANGATE45)
+    c = verilog.load(mydir / 'gates.v', tlib=SAED90)
    df = sdf.load(mydir / 'gates.sdf')
-    lt = df.iopaths(c, tlib=NANGATE45)[1]
+    lt = df.iopaths(c, tlib=SAED90)[1]
    nand_a = c.cells['nandgate'].ins[0]
    nand_b = c.cells['nandgate'].ins[1]
    and_a = c.cells['andgate'].ins[0]
--- a/tests/test_verilog.py
+++ b/tests/test_verilog.py
@ -1,5 +1,5 @@
 from kyupy import verilog
-from kyupy.techlib import SAED90, SAED32, NANGATE45
+from kyupy.techlib import SAED90, SAED32
 def test_b01(mydir):
    with open(mydir / 'b01.v', 'r') as f:
@ -26,12 +26,12 @@ def test_b15(mydir):
 def test_gates(mydir):
-    c = verilog.load(mydir / 'gates.v', tlib=NANGATE45)
+    c = verilog.load(mydir / 'gates.v', tlib=SAED90)
-    assert len(c.nodes) == 18
+    assert len(c.nodes) == 10
-    assert len(c.lines) == 21
+    assert len(c.lines) == 10
    stats = c.stats
-    assert stats['input'] == 3
+    assert stats['input'] == 2
-    assert stats['output'] == 4
+    assert stats['output'] == 2
    assert stats['__seq__'] == 0
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@ -5,56 +5,22 @@ from kyupy.logic_sim import LogicSim
 from kyupy import logic, bench, sim
 from kyupy.logic import mvarray
 # Drives wave_eval_cpu directly with a single XNOR2 op and checks the output
 # waveform for several input waveform pairs, including a short input pulse
 # that is expected to be filtered and a slightly longer one that is not.
 def test_xnor2_delays():
    # NOTE(review): the values after sim.XNOR2 presumably are the output index (2)
    # followed by the input indices (0, 1, 3, 3) -- confirm against _wave_eval.
    op = (sim.XNOR2, 2, 0, 1, 3, 3, -1, 0, 0)
    #op = (0b0111, 4, 0, 1)
    c = np.full((4*16, 1), TMAX, dtype=np.float32)  # 4 waveforms of capacity 16
    c_locs = np.zeros((4,), dtype='int')
    c_caps = np.zeros((4,), dtype='int')
    ebuf = np.zeros((4, 1, 2), dtype=np.int32)
    for i in range(4): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
    delays = np.zeros((1, 4, 2, 2))
    delays[0, 0, 0, 0] = 0.031  # A rise -> Z rise
    delays[0, 0, 0, 1] = 0.027  # A rise -> Z fall
    delays[0, 0, 1, 0] = 0.033  # A fall -> Z rise
    delays[0, 0, 1, 1] = 0.037  # A fall -> Z fall
    delays[0, 1, 0, 0] = 0.032  # B rise -> Z rise
    delays[0, 1, 0, 1] = 0.030  # B rise -> Z fall
    delays[0, 1, 1, 0] = 0.038  # B fall -> Z rise
    delays[0, 1, 1, 1] = 0.036  # B fall -> Z fall
    simctl_int = np.asarray([0], dtype=np.int32)
    # Helper: write each input list into the leading entries of waveforms 0, 1, ...
    # of c, evaluate the op once, then compare the leading entries of waveform 2
    # with the expected output. c is not cleared between calls; only the entries
    # covered by the given lists are overwritten.
    def wave_assert(inputs, output):
        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
        wave_eval_cpu(op, c, c_locs, c_caps, ebuf, 0, delays, simctl_int, 0, 0)
        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[2,i], v)
    wave_assert([[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX])      # XNOR(1,1) => 1
    wave_assert([[TMAX,TMAX],[TMIN,TMAX]], [TMAX])           # XNOR(0,1) => 0
    # using Afall/Zfall for pulse length, bug: was using Arise/Zfall
    #wave_assert([[0.07, 0.10, TMAX], [0.0, TMAX]], [TMIN, 0.03, 0.101, 0.137, TMAX])
    # input A toggles at 0.07 and 0.10: no further output transition is expected
    wave_assert([[0.07, 0.10, TMAX], [0.0, TMAX]], [TMIN, 0.03, TMAX])
    # input A toggles at 0.06 and 0.10: two further output transitions are expected
    wave_assert([[0.06, 0.10, TMAX], [0.0, TMAX]], [TMIN, 0.03, 0.091, 0.137, TMAX])
 def test_nand_delays():
    op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0)
    #op = (0b0111, 4, 0, 1)
-    c = np.full((5*16, 1), TMAX, dtype=np.float32)  # 5 waveforms of capacity 16
+    c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
    c_locs = np.zeros((5,), dtype='int')
    c_caps = np.zeros((5,), dtype='int')
    ebuf = np.zeros((4, 1, 2), dtype=np.int32)
    for i in range(5): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
    # SDF specifies IOPATH delays with respect to output polarity
    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
    delays = np.zeros((1, 5, 2, 2))
-    delays[0, 0, 0, 0] = 0.1  # A rise -> Z rise
+    delays[0, 0, 0, 0] = 0.1  # A -> Z rise delay
-    delays[0, 0, 0, 1] = 0.2  # A rise -> Z fall
+    delays[0, 0, 0, 1] = 0.2  # A -> Z fall delay
-    delays[0, 0, 1, 0] = 0.1  # A fall -> Z rise
+    delays[0, 0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
-    delays[0, 0, 1, 1] = 0.2  # A fall -> Z fall
+    delays[0, 0, 1, 1] = 0.2  # A -> Z positive pulse limit
    delays[0, 1, :, 0] = 0.3  # as above for B -> Z
    delays[0, 1, :, 1] = 0.4
    delays[0, 2, :, 0] = 0.5  # as above for C -> Z
@ -66,7 +32,7 @@ def test_nand_delays():
    def wave_assert(inputs, output):
        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval_cpu(op, c, c_locs, c_caps, ebuf, 0, delays, simctl_int, 0, 0)
+        wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, simctl_int)
        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
    wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@ -179,7 +145,7 @@ def compare_to_logic_sim(wsim: WaveSim):
    lsim.s_to_c()
    lsim.c_prop()
    lsim.c_to_s()
-    exp = logic.bp_to_mv(lsim.s[1])[:,:tests.shape[-1]]
+    exp = logic.bp_to_mv(lsim.s[1])
    resp[resp == logic.PPULSE] = logic.ZERO
    resp[resp == logic.NPULSE] = logic.ONE
@ -190,13 +156,13 @@ def compare_to_logic_sim(wsim: WaveSim):
    np.testing.assert_allclose(resp, exp)
-def test_b15(b15_2ig_circuit_resolved, b15_2ig_delays):
+def test_b15(b15_2ig_circuit, b15_2ig_delays):
-    compare_to_logic_sim(WaveSim(b15_2ig_circuit_resolved, b15_2ig_delays, 8))
+    compare_to_logic_sim(WaveSim(b15_2ig_circuit, b15_2ig_delays, 8))
-def test_b15_strip_forks(b15_2ig_circuit_resolved, b15_2ig_delays):
+def test_b15_strip_forks(b15_2ig_circuit, b15_2ig_delays):
-    compare_to_logic_sim(WaveSim(b15_2ig_circuit_resolved, b15_2ig_delays, 8, strip_forks=True))
+    compare_to_logic_sim(WaveSim(b15_2ig_circuit, b15_2ig_delays, 8, strip_forks=True))
-def test_b15_cuda(b15_2ig_circuit_resolved, b15_2ig_delays):
+def test_b15_cuda(b15_2ig_circuit, b15_2ig_delays):
-    compare_to_logic_sim(WaveSimCuda(b15_2ig_circuit_resolved, b15_2ig_delays, 8, strip_forks=True))
+    compare_to_logic_sim(WaveSimCuda(b15_2ig_circuit, b15_2ig_delays, 8, strip_forks=True))