diff --git a/docs/conf.py b/docs/conf.py
index 53f405e..83cafc9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -24,7 +24,7 @@ copyright = '2020-2023, Stefan Holst'
 author = 'Stefan Holst'
 
 # The full version, including alpha/beta/rc tags
-release = '0.0.4'
+release = '0.0.5'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b8aaecb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,32 @@
+[project]
+name = "kyupy"
+version = "0.0.5"
+authors = [
+  { name="Stefan Holst", email="mail@s-holst.de" },
+]
+description = 'High-performance processing and analysis of non-hierarchical VLSI designs'
+readme = "README.rst"
+requires_python = ">=3.8"
+dependencies = [
+    "numpy>=1.17.0",
+    "lark-parser>=0.8.0",
+]
+classifiers = [
+        "Development Status :: 3 - Alpha",
+        "Environment :: GPU :: NVIDIA CUDA",
+        "Intended Audience :: Science/Research",
+        "Topic :: Scientific/Engineering :: Electronic Design Automation (EDA)",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+]
+license = "MIT"
+license-files = ["LICENSE.txt"]
+
+[project.urls]
+homepage = "https://github.com/s-holst/kyupy"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 47ef1f7..0000000
--- a/setup.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from setuptools import setup, find_packages
-
-with open('README.rst', 'r') as f:
-    long_description = f.read()
-
-setup(
-    name='kyupy',
-    version='0.0.4',
-    description='High-performance processing and analysis of non-hierarchical VLSI designs',
-    long_description=long_description,
-    long_description_content_type='text/x-rst',
-    packages=find_packages(where='src'),
-    package_dir={'': 'src'},
-    url='https://github.com/s-holst/kyupy',
-    author='Stefan Holst',
-    author_email='mail@s-holst.de',
-    python_requires='>=3.8',
-    install_requires=[
-        'numpy>=1.17.0',
-        'lark-parser>=0.8.0'
-    ],
-    extras_requires={
-        'dev': [
-            'pytest>=6.1',
-        ],
-    },
-    classifiers=[
-        'Development Status :: 3 - Alpha',
-        'Environment :: GPU :: NVIDIA CUDA',
-        'Intended Audience :: Science/Research',
-        'Topic :: Scientific/Engineering :: Electronic Design Automation (EDA)',
-        'License :: OSI Approved :: MIT License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3 :: Only',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        'Programming Language :: Python :: 3.10',
-    ],
-)
diff --git a/src/kyupy/__init__.py b/src/kyupy/__init__.py
index 123bd03..b473546 100644
--- a/src/kyupy/__init__.py
+++ b/src/kyupy/__init__.py
@@ -57,6 +57,18 @@ def hr_bytes(nbytes):
         multiplier += 1
     return f'{nbytes:.1f}{["", "ki", "Mi", "Gi", "Ti", "Pi"][multiplier]}B'
 
+def eng(number):
+    """Formats a given number using engineering notation."""
+    exponent = 0
+    if abs(number) < 1:
+        while abs(number) >= 1000:
+            number *= 1000
+            exponent -= 3
+    else:
+        while abs(number) >= 1000:
+            number /= 1000
+            exponent += 3
+    return f'{number:.0f}' + (f'e{exponent}' if exponent != 0 else '')
 
 def hr_time(seconds):
     """Formats a given time interval for human readability."""
@@ -138,10 +150,10 @@ class Log:
         self._limit = limit
 
     def stop_limit(self):
+        self._limit = -1
         if self.filtered > 0:
-            log.info(f'{self.filtered} more messages (filtered).')
+            self.info(f'{self.filtered} more messages (filtered).')
             self.filtered = 0
-        self._limit = -1
 
     def __getstate__(self):
         return {'elapsed': time.perf_counter() - self.start}
@@ -149,6 +161,8 @@ class Log:
     def __setstate__(self, state):
         self.logfile = sys.stdout
         self.indent = 0
+        self._limit = -1
+        self.filtered = 0
         self.start = time.perf_counter() - state['elapsed']
 
     def write(self, s, indent=0):
@@ -169,7 +183,7 @@ class Log:
             return
         t = time.perf_counter() - self.start
         self.logfile.write(f'# {t:011.3f} {level} {message}\n')
-        self.logfile.flush()
+        #self.logfile.flush()
         self._limit -= 1
 
     def info(self, message):
diff --git a/src/kyupy/circuit.py b/src/kyupy/circuit.py
index 46c9f38..e387ffc 100644
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@@ -10,20 +10,40 @@ Circuit graphs also define an ordering of inputs, outputs and other nodes to eas
 
 """
 
+from __future__ import annotations
+
 from collections import deque, defaultdict
 import re
+from typing import Union
 
 import numpy as np
 
 
 class GrowingList(list):
     def __setitem__(self, index, value):
-        if index >= len(self):
-            self.extend([None] * (index + 1 - len(self)))
+        if value is None: self.has_nones = True
+        if index == len(self): return super().append(value)
+        if index > len(self):
+            super().extend([None] * (index + 1 - len(self)))
+            self.has_nones = True
         super().__setitem__(index, value)
 
-    def free_index(self):
-        return next((i for i, x in enumerate(self) if x is None), len(self))
+    def __getitem__(self, index):
+        if isinstance(index, slice): return super().__getitem__(index)
+        return super().__getitem__(index) if index < len(self) else None
+
+    @property
+    def free_idx(self):
+        fi = len(self)
+        if hasattr(self, 'has_nones') and self.has_nones:
+            fi = next((i for i, x in enumerate(self) if x is None), len(self))
+            self.has_nones = fi < len(self)
+        return fi
+
+    def without_nones(self):
+        for item in self:
+            if item is not None:
+                yield item
 
 
 class IndexList(list):
@@ -76,10 +96,10 @@ class Node:
         by allocating an array or list :code:`my_data` of length :code:`len(n.circuit.nodes)` and
         accessing it by :code:`my_data[n.index]` or simply by :code:`my_data[n]`.
         """
-        self.ins = GrowingList()
+        self.ins: GrowingList[Line] = GrowingList()
         """A list of input connections (:class:`Line` objects).
         """
-        self.outs = GrowingList()
+        self.outs: GrowingList[Line] = GrowingList()
         """A list of output connections (:class:`Line` objects).
         """
 
@@ -135,7 +155,7 @@ class Line:
     Use the explicit case only if connections to specific pins are required.
     It may overwrite any previous line references in the connection list of the nodes.
     """
-    def __init__(self, circuit, driver, reader):
+    def __init__(self, circuit: Circuit, driver: Union[Node, tuple[Node, int]], reader: Union[Node, tuple[Node, int]]):
         self.circuit = circuit
         """The :class:`Circuit` object the line is part of.
         """
@@ -147,7 +167,7 @@ class Line:
         by allocating an array or list :code:`my_data` of length :code:`len(l.circuit.lines)` and
         accessing it by :code:`my_data[l.index]` or simply by :code:`my_data[l]`.
         """
-        if not isinstance(driver, tuple): driver = (driver, driver.outs.free_index())
+        if not isinstance(driver, tuple): driver = (driver, driver.outs.free_idx)
         self.driver = driver[0]
         """The :class:`Node` object that drives this line.
         """
@@ -157,7 +177,7 @@ class Line:
         This is the position in the list :py:attr:`Node.outs` of the driving node this line referenced from:
         :code:`self.driver.outs[self.driver_pin] == self`.
         """
-        if not isinstance(reader, tuple): reader = (reader, reader.ins.free_index())
+        if not isinstance(reader, tuple): reader = (reader, reader.ins.free_idx)
         self.reader = reader[0]
         """The :class:`Node` object that reads this line.
         """
@@ -292,7 +312,7 @@ class Circuit:
     def _locs(self, prefix, nodes):
         d_top = dict()
         for i, n in enumerate(nodes):
-            if m := re.match(fr'({prefix}.*?)((?:[\d_\[\]])*$)', n.name):
+            if m := re.match(fr'({re.escape(prefix)}.*?)((?:[\d_\[\]])*$)', n.name):
                 path = [m[1]] + [int(v) for v in re.split(r'[_\[\]]+', m[2]) if len(v) > 0]
                 d = d_top
                 for j in path[:-1]:
@@ -334,15 +354,16 @@ class Circuit:
     def get_or_add_fork(self, name):
         return self.forks[name] if name in self.forks else Node(self, name)
 
-    def remove_dangling_nodes(self, root_node:Node):
+    def remove_dangling_nodes(self, root_node:Node, keep=[]):
         if len([l for l in root_node.outs if l is not None]) > 0: return
         lines = [l for l in root_node.ins if l is not None]
         drivers = [l.driver for l in lines]
+        if root_node in keep: return
         root_node.remove()
         for l in lines:
             l.remove()
         for d in drivers:
-            self.remove_dangling_nodes(d)
+            self.remove_dangling_nodes(d, keep=keep)
 
     def eliminate_1to1_forks(self):
         """Removes all forks that drive only one node.
@@ -370,6 +391,21 @@ class Circuit:
             in_line.reader_pin = out_reader_pin
             in_line.reader.ins[in_line.reader_pin] = in_line
 
+    def remove_forks(self):
+        ios = set(self.io_nodes)
+        for n in list(self.forks.values()):
+            if n in ios: continue
+            d = None
+            if (l := n.ins[0]) is not None:
+                d = l.driver
+                l.remove()
+            for l in list(n.outs):
+                if l is None: continue
+                r, rp = l.reader, l.reader_pin
+                l.remove()
+                if d is not None: Line(self, d, (r, rp))
+            n.remove()
+
     def substitute(self, node, impl):
         """Replaces a given node with the given implementation circuit.
 
@@ -428,7 +464,7 @@ class Circuit:
         for l, ll in zip(impl_out_lines, node_out_lines):  # connect outputs
             if ll is None:
                 if l.driver in node_map:
-                    self.remove_dangling_nodes(node_map[l.driver])
+                    self.remove_dangling_nodes(node_map[l.driver], keep=ios)
                 continue
             if len(l.reader.outs) > 0:  # output is also read by impl. circuit, connect to fork.
                 ll.driver = node_map[l.reader]
@@ -447,6 +483,21 @@ class Circuit:
             if n.kind in tlib.cells:
                 self.substitute(n, tlib.cells[n.kind][0])
 
+    def remove_constants(self):
+        c1gen = None
+        for n in self.nodes:
+            if n.kind == '__const0__':  # just remove, unconnected inputs are defined 0.
+                for l in n.outs:
+                    l.remove()
+                n.remove()
+            elif n.kind == '__const1__':
+                if c1gen is None: c1gen = Node(self, '__const1gen__', 'INV1')  # one unique const 1 generator
+                for l in n.outs:
+                    r, rp = l.reader, l.reader_pin
+                    l.remove()
+                    Line(self, c1gen, (r, rp))
+                n.remove()
+
     def copy(self):
         """Returns a deep copy of the circuit.
         """
@@ -501,14 +552,15 @@ class Circuit:
         substrings 'dff' or 'latch' are yielded first.
         """
         visit_count = np.zeros(len(self.nodes), dtype=np.uint32)
-        queue = deque(n for n in self.nodes if len(n.ins) == 0 or 'dff' in n.kind.lower() or 'latch' in n.kind.lower())
+        start = set(n for n in self.nodes if len(n.ins) == 0 or 'dff' in n.kind.lower() or 'latch' in n.kind.lower())
+        queue = deque(start)
         while len(queue) > 0:
             n = queue.popleft()
             for line in n.outs:
                 if line is None: continue
                 succ = line.reader
                 visit_count[succ] += 1
-                if visit_count[succ] == len(succ.ins) and 'dff' not in succ.kind.lower() and 'latch' not in succ.kind.lower():
+                if visit_count[succ] == len(succ.ins) and succ not in start:
                     queue.append(succ)
             yield n
 
@@ -563,6 +615,21 @@ class Circuit:
             if marks[n]:
                 yield n
 
+    def fanout(self, origin_nodes):
+        """Generator function to iterate over the fan-out cone of a given list of origin nodes.
+
+        Nodes are yielded in topological order.
+        """
+        marks = [False] * len(self.nodes)
+        for n in origin_nodes:
+            marks[n] = True
+        for n in self.topological_order():
+            if not marks[n]:
+                for line in n.ins.without_nones():
+                    marks[n] |= marks[line.driver]
+            if marks[n]:
+                yield n
+
     def fanout_free_regions(self):
         for stem in self.reversed_topological_order():
             if len(stem.outs) == 1 and 'dff' not in stem.kind.lower(): continue
diff --git a/src/kyupy/logic.py b/src/kyupy/logic.py
index ff49344..3e97f8e 100644
--- a/src/kyupy/logic.py
+++ b/src/kyupy/logic.py
@@ -241,6 +241,8 @@ def mv_latch(d, t, q_prev, out=None):
 def mv_transition(init, final, out=None):
     """Computes the logic transitions from the initial values of ``init`` to the final values of ``final``.
     Pulses in the input data are ignored. If any of the inputs are ``UNKNOWN``, the result is ``UNKNOWN``.
+    If init is ``UNASSIGNED``, the result is the final value of ``final``.
+    If final is ``UNASSIGNED``, the result is the initial value of ``init``.
     If both inputs are ``UNASSIGNED``, the result is ``UNASSIGNED``.
 
     :param init: A multi-valued array.
@@ -251,7 +253,9 @@ def mv_transition(init, final, out=None):
     out = out or np.empty(np.broadcast(init, final).shape, dtype=np.uint8)
     out[...] = (init & 0b010) | (final & 0b001)
     out[...] |= ((out << 1) ^ (out << 2)) & 0b100
-    unknown = (init == UNKNOWN) | (init == UNASSIGNED) | (final == UNKNOWN) | (final == UNASSIGNED)
+    out[...] = np.choose(init == UNASSIGNED, [out, (final & 0b001) * ONE])
+    out[...] = np.choose(final == UNASSIGNED, [out, ((init & 0b010) >> 1) * ONE])
+    unknown = (init == UNKNOWN) | (final == UNKNOWN)
     unassigned = (init == UNASSIGNED) & (final == UNASSIGNED)
     np.putmask(out, unknown, UNKNOWN)
     np.putmask(out, unassigned, UNASSIGNED)
@@ -265,6 +269,18 @@ def mv_to_bp(mva):
     return np.packbits(unpackbits(mva)[...,:3], axis=-2, bitorder='little').swapaxes(-1,-2)
 
 
+def mv_init(mva):
+    """Returns the initial binary values for mva.
+    """
+    return (mva>>1) & ((mva>>2)|mva) & 1
+
+
+def mv_final(mva):
+    """Returns the final binary value of mva.
+    """
+    return mva & ((mva>>2)|(mva>>1)) & 1
+
+
 def bparray(*a):
     """Converts (lists of) Boolean values or strings into a bit-parallel array.
 
diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py
index 81ef44b..3ac5233 100644
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -10,9 +10,10 @@ import math
 
 import numpy as np
 
-from . import numba, logic, hr_bytes, sim
+from . import numba, logic, hr_bytes, sim, eng, cdiv
 from .circuit import Circuit
 
+
 class LogicSim(sim.SimOps):
     """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.
 
@@ -28,7 +29,7 @@ class LogicSim(sim.SimOps):
         self.m = m
         self.mdim = math.ceil(math.log2(m))
         self.sims = sims
-        nbytes = (sims - 1) // 8 + 1
+        nbytes = cdiv(sims, 8)
 
         self.c = np.zeros((self.c_len, self.mdim, nbytes), dtype=np.uint8)
         self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
@@ -44,14 +45,14 @@ class LogicSim(sim.SimOps):
         self.s[:,:,1,:] = 255  # unassigned
 
     def __repr__(self):
-        return f'{{name: "{self.circuit.name}", sims: {self.sims}, m: {self.m}, c_bytes: {self.c.nbytes}}}'
+        return f'{{name: "{self.circuit.name}", sims: {self.sims}, m: {self.m}, c_bytes: {eng(self.c.nbytes)}}}'
 
     def s_to_c(self):
         """Copies the values from ``s[0]`` the inputs of the combinational portion.
         """
         self.c[self.pippi_c_locs] = self.s[0, self.pippi_s_locs, :self.mdim]
 
-    def c_prop(self, inject_cb=None):
+    def c_prop(self, sims=None, inject_cb=None, fault_line=-1, fault_mask=None, fault_model=2):
         """Propagate the input values through the combinational circuit towards the outputs.
 
         Performs all logic operations in topological order.
@@ -67,10 +68,17 @@ class LogicSim(sim.SimOps):
         t1 = self.c_locs[self.tmp2_idx]
         if self.m == 2:
             if inject_cb is None:
-                _prop_cpu(self.ops, self.c_locs, self.c)
+                if fault_mask is None:
+                    fault_mask = np.full(self.c.shape[-1], 255, dtype=np.uint8)
+                else:
+                    if len(fault_mask) < self.c.shape[-1]:
+                        fault_mask2 = np.full(self.c.shape[-1], 0, dtype=np.uint8)
+                        fault_mask2[:len(fault_mask)] = fault_mask
+                        fault_mask = fault_mask2
+                _prop_cpu(self.ops, self.c_locs, self.c, int(fault_line), fault_mask, int(fault_model))
             else:
-                for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
-                    o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
+                for op, o0l, i0l, i1l, i2l, i3l in self.ops[:,:6]:
+                    o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
                     if op == sim.BUF1: self.c[o0]=self.c[i0]
                     elif op == sim.INV1: self.c[o0] = ~self.c[i0]
                     elif op == sim.AND2: self.c[o0] = self.c[i0] & self.c[i1]
@@ -105,10 +113,10 @@ class LogicSim(sim.SimOps):
                     elif op == sim.OAI211:self.c[o0] = ~((self.c[i0] | self.c[i1]) & self.c[i2] & self.c[i3])
                     elif op == sim.MUX21: self.c[o0] = (self.c[i0] & ~self.c[i2]) | (self.c[i1] & self.c[i2])
                     else: print(f'unknown op {op}')
-                    inject_cb(o0, self.s[o0])
+                    inject_cb(o0l, self.c[o0])
         elif self.m == 4:
-            for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
-                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
+            for op, o0l, i0l, i1l, i2l, i3l in self.ops[:,:6]:
+                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
                 if op == sim.BUF1: self.c[o0]=self.c[i0]
                 elif op == sim.INV1: logic.bp4v_not(self.c[o0], self.c[i0])
                 elif op == sim.AND2: logic.bp4v_and(self.c[o0], self.c[i0], self.c[i1])
@@ -181,9 +189,10 @@ class LogicSim(sim.SimOps):
                     logic.bp4v_and(self.c[t1], self.c[i1], self.c[i2])
                     logic.bp4v_or(self.c[o0], self.c[t0], self.c[t1])
                 else: print(f'unknown op {op}')
+                if inject_cb is not None: inject_cb(o0l, self.c[o0])
         else:
-            for op, o0, i0, i1, i2, i3 in self.ops[:,:6]:
-                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
+            for op, o0l, i0l, i1l, i2l, i3l in self.ops[:,:6]:
+                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
                 if op == sim.BUF1: self.c[o0]=self.c[i0]
                 elif op == sim.INV1: logic.bp8v_not(self.c[o0], self.c[i0])
                 elif op == sim.AND2: logic.bp8v_and(self.c[o0], self.c[i0], self.c[i1])
@@ -256,7 +265,7 @@ class LogicSim(sim.SimOps):
                     logic.bp8v_and(self.c[t1], self.c[i1], self.c[i2])
                     logic.bp8v_or(self.c[o0], self.c[t0], self.c[t1])
                 else: print(f'unknown op {op}')
-                if inject_cb is not None: inject_cb(o0, self.s[o0])
+                if inject_cb is not None: inject_cb(o0l, self.c[o0])
 
     def c_to_s(self):
         """Copies (captures) the results of the combinational portion to ``s[1]``.
@@ -296,9 +305,9 @@ class LogicSim(sim.SimOps):
 
 
 @numba.njit
-def _prop_cpu(ops, c_locs, c):
-    for op, o0, i0, i1, i2, i3 in ops[:,:6]:
-        o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)]
+def _prop_cpu(ops, c_locs, c, fault_line, fault_mask, fault_model):
+    for op, o0l, i0l, i1l, i2l, i3l in ops[:,:6]:
+        o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0l, i0l, i1l, i2l, i3l)]
         if op == sim.BUF1: c[o0]=c[i0]
         elif op == sim.INV1: c[o0] = ~c[i0]
         elif op == sim.AND2: c[o0] = c[i0] & c[i1]
@@ -333,3 +342,129 @@ def _prop_cpu(ops, c_locs, c):
         elif op == sim.OAI211: c[o0] = ~((c[i0] | c[i1]) & c[i2] & c[i3])
         elif op == sim.MUX21: c[o0] = (c[i0] & ~c[i2]) | (c[i1] & c[i2])
         else: print(f'unknown op {op}')
+        if fault_line >= 0 and o0l == fault_line:
+            #n = len(fault_mask)
+            if fault_model == 0:
+                c[o0] = c[o0] & ~fault_mask
+            elif fault_model == 1:
+                c[o0] = c[o0] | fault_mask
+            else:
+                c[o0] = c[o0] ^ fault_mask
+
+
+class LogicSim6V(sim.SimOps):
+    """A bit-parallel naïve combinational simulator for 6-valued logic.
+
+    :param circuit: The circuit to simulate.
+    :param sims: The number of parallel logic simulations to perform.
+    :param c_reuse: If True, intermediate signal values may get overwritten when not needed anymore to save memory.
+    :param strip_forks: If True, forks are not included in the simulation model to save memory and simulation time.
+    """
+    def __init__(self, circuit: Circuit, sims: int = 8, c_reuse: bool = False, strip_forks: bool = False):
+        super().__init__(circuit, c_reuse=c_reuse, strip_forks=strip_forks)
+        self.sims = sims
+        nbytes = cdiv(sims, 8)
+
+        self.c = np.zeros((self.c_len, 3, nbytes), dtype=np.uint8)
+        self.s = np.zeros((2, self.s_len, self.sims), dtype=np.uint8)
+        """Logic values of the sequential elements (flip-flops) and ports.
+
+        It is a pair of arrays in mv storage format:
+
+        * ``s[0]`` Assigned values. Simulator will read (P)PI value from here.
+        * ``s[1]`` Result values. Simulator will write (P)PO values here.
+
+        Access this array to assign new values to the (P)PIs or read values from the (P)POs.
+        """
+
+    def __repr__(self):
+        return f'{{name: "{self.circuit.name}", sims: {self.sims}, c_bytes: {eng(self.c.nbytes)}}}'
+
+    def s_to_c(self):
+        """Assigns the values from ``s[0]`` to the inputs of the combinational portion.
+        """
+        self.c[self.pippi_c_locs] = logic.mv_to_bp(self.s[0, self.pippi_s_locs])
+
+    def c_prop(self):
+        c_prop_cpu(self.ops, self.c, self.c_locs, self.tmp_idx, self.tmp2_idx)
+
+    def c_to_s(self):
+        """Captures the results of the combinational portion into ``s[1]``.
+        """
+        self.s[1, self.poppo_s_locs] = logic.bp_to_mv(self.c[self.poppo_c_locs])[:,:self.sims]
+
+
+@numba.njit
+def c_prop_cpu(ops, c, c_locs, tmp_idx, tmp2_idx):
+    t0 = c[c_locs[tmp_idx]]
+    t1 = c[c_locs[tmp2_idx]]
+    inv_op = np.array([255, 255, 0], dtype=np.uint8)[np.newaxis, :, np.newaxis]
+    for op, o0l, i0l, i1l, i2l, i3l in ops[:,:6]:
+        o0, i0, i1, i2, i3 = [c[c_locs[x]] for x in (o0l, i0l, i1l, i2l, i3l)]
+        if op == sim.BUF1 or op == sim.INV1:
+            o0[...] = i0
+        elif op == sim.AND2 or op == sim.NAND2:
+            o0[0] = i0[0] & i1[0]
+            o0[1] = i0[1] & i1[1]
+            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])|
+                     i1[2]&(i0[0]|i0[1]|i0[2]))
+        elif op == sim.AND3 or op == sim.NAND3:
+            o0[0] = i0[0] & i1[0] & i2[0]
+            o0[1] = i0[1] & i1[1] & i2[1]
+            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2])|
+                     i1[2]&(i0[0]|i0[1]|i0[2])&(i2[0]|i2[1]|i2[2])|
+                     i2[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2]))
+        elif op == sim.AND4 or op == sim.NAND4:
+            o0[0] = i0[0] & i1[0] & i2[0] & i3[0]
+            o0[1] = i0[1] & i1[1] & i2[1] & i3[1]
+            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2])&(i3[0]|i3[1]|i3[2])|
+                     i1[2]&(i0[0]|i0[1]|i0[2])&(i2[0]|i2[1]|i2[2])&(i3[0]|i3[1]|i3[2])|
+                     i2[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2])&(i3[0]|i3[1]|i3[2])|
+                     i3[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2]))
+        elif op == sim.OR2 or op == sim.NOR2:
+            o0[0] = i0[0] | i1[0]
+            o0[1] = i0[1] | i1[1]
+            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])|
+                     i1[2]&(~i0[0]|~i0[1]|i0[2]))
+        elif op == sim.OR3 or op == sim.NOR3:
+            o0[0] = i0[0] | i1[0] | i2[0]
+            o0[1] = i0[1] | i1[1] | i2[1]
+            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2])|
+                     i1[2]&(~i0[0]|~i0[1]|i0[2])&(~i2[0]|~i2[1]|i2[2])|
+                     i2[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2]))
+        elif op == sim.OR4 or op == sim.NOR4:
+            o0[0] = i0[0] | i1[0] | i2[0] | i3[0]
+            o0[1] = i0[1] | i1[1] | i2[1] | i3[1]
+            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2])&(~i3[0]|~i3[1]|i3[2])|
+                     i1[2]&(~i0[0]|~i0[1]|i0[2])&(~i2[0]|~i2[1]|i2[2])&(~i3[0]|~i3[1]|i3[2])|
+                     i2[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2])&(~i3[0]|~i3[1]|i3[2])|
+                     i3[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2]))
+        elif op == sim.XOR2 or op == sim.XNOR2:
+            o0[0] = i0[0] ^ i1[0]
+            o0[1] = i0[1] ^ i1[1]
+            o0[2] = i0[2] | i1[2]
+        elif op == sim.MUX21:
+            # t1 = ~i2
+            t1[...] = i2 ^ inv_op
+            # t0 = i0 & t1
+            t0[0] = i0[0] & t1[0]
+            t0[1] = i0[1] & t1[1]
+            t0[2] = (i0[2]&(t1[0]|t1[1]|t1[2])|
+                     t1[2]&(i0[0]|i0[1]|i0[2]))
+            # t1 = i1 & i2
+            t1[0] = i1[0] & i2[0]
+            t1[1] = i1[1] & i2[1]
+            t1[2] = (i1[2]&(i2[0]|i2[1]|i2[2])|
+                     i2[2]&(i1[0]|i1[1]|i1[2]))
+            # o0 = t0 | t1
+            o0[0] = t0[0] | t1[0]
+            o0[1] = t0[1] | t1[1]
+            o0[2] = (t0[2]&(~t1[0]|~t1[1]|t1[2])|
+                     t1[2]&(~t0[0]|~t0[1]|t0[2]))
+        else: print(f'unknown op {op}')
+
+        if (op == sim.INV1 or
+            op == sim.NAND2 or op == sim.NAND3 or op == sim.NAND4 or
+            op == sim.NOR2 or op == sim.NOR3 or op == sim.NOR4 or
+            op == sim.XNOR2):
+            o0[...] = o0 ^ inv_op
diff --git a/src/kyupy/sdf.py b/src/kyupy/sdf.py
index 015f975..5314efb 100644
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@@ -61,20 +61,21 @@ class DelayFile:
 
         delays = np.zeros((len(circuit.lines), 2, 2, 3))  # dataset last during construction.
 
-        for name, iopaths in self.cells.items():
-            name = name.replace('\\', '')
-            if cell := circuit.cells.get(name, None):
-                for i_pin_spec, o_pin_spec, *dels in iopaths:
-                    if i_pin_spec.startswith('(posedge '): i_pol_idxs = [0]
-                    elif i_pin_spec.startswith('(negedge '): i_pol_idxs = [1]
-                    else: i_pol_idxs = [0, 1]
-                    i_pin_spec = re.sub(r'\((neg|pos)edge ([^)]+)\)', r'\2', i_pin_spec)
-                    if line := cell.ins[tlib.pin_index(cell.kind, i_pin_spec)]:
-                        delays[line, i_pol_idxs] = [d if len(d) > 0 else [0, 0, 0] for d in dels]
-                    else:
-                        log.warn(f'No line to annotate in circuit: {i_pin_spec} for {cell}')
-            else:
-                log.warn(f'Name from SDF not found in circuit: {name}')
+        with log.limit(50):
+            for name, iopaths in self.cells.items():
+                name = name.replace('\\', '')
+                if cell := circuit.cells.get(name, None):
+                    for i_pin_spec, o_pin_spec, *dels in iopaths:
+                        if i_pin_spec.startswith('(posedge '): i_pol_idxs = [0]
+                        elif i_pin_spec.startswith('(negedge '): i_pol_idxs = [1]
+                        else: i_pol_idxs = [0, 1]
+                        i_pin_spec = re.sub(r'\((neg|pos)edge ([^)]+)\)', r'\2', i_pin_spec)
+                        if line := cell.ins[tlib.pin_index(cell.kind, i_pin_spec)]:
+                            delays[line, i_pol_idxs] = [d if len(d) > 0 else [0, 0, 0] for d in dels]
+                        else:
+                            log.warn(f'No line to annotate in circuit: {i_pin_spec} for {cell}')
+                else:
+                    log.warn(f'Name from SDF not found in circuit: {name}')
 
         return np.moveaxis(delays, -1, 0)
 
@@ -102,11 +103,12 @@ class DelayFile:
 
         delays = np.zeros((len(circuit.lines), 2, 2, 3))  # dataset last during construction.
 
+        nonfork_annotations = 0
         for n1, n2, *delvals in self._interconnects:
             delvals = [d if len(d) > 0 else [0, 0, 0] for d in delvals]
             if max(max(delvals)) == 0: continue
-            cn1, pn1 = n1.split('/') if '/' in n1 else (n1, None)
-            cn2, pn2 = n2.split('/') if '/' in n2 else (n2, None)
+            cn1, pn1 = (n1, None) if (slash := n1.rfind('/')) < 0 else (n1[:slash], n1[slash+1:])
+            cn2, pn2 = (n2, None) if (slash := n2.rfind('/')) < 0 else (n2[:slash], n2[slash+1:])
             cn1 = cn1.replace('\\','')
             cn2 = cn2.replace('\\','')
             c1, c2 = circuit.cells[cn1], circuit.cells[cn2]
@@ -119,19 +121,27 @@ class DelayFile:
                 log.warn(f'No line to annotate pin {pn2} of {c2}')
                 continue
             f1, f2 = c1.outs[p1].reader, c2.ins[p2].driver  # find the forks between cells.
-            assert f1.kind == '__fork__'
-            assert f2.kind == '__fork__'
-            if f1 != f2:  # at least two forks, make sure f2 is a branchfork connected to f1
-                assert len(f2.outs) == 1
-                assert f1.outs[f2.ins[0].driver_pin] == f2.ins[0]
-                line = f2.ins[0]
-            elif len(f2.outs) == 1:  # f1==f2, only OK when there is no fanout.
-                line = f2.ins[0]
+            if f1 == c2 and f2 == c1:
+                nonfork_annotations += 1
+                if nonfork_annotations < 10:
+                    log.warn(f'No fork between {c1.name}/{p1} and {c2.name}/{p2}, using {c2.name}/{p2}')
+                line = c2.ins[p2]
             else:
-                log.warn(f'No branchfork to annotate interconnect delay {c1.name}/{p1}->{c2.name}/{p2}')
-                continue
+                assert f1.kind == '__fork__'
+                assert f2.kind == '__fork__'
+                if len(f2.outs) == 1:
+                    assert f1 == f2 or f1.outs[f2.ins[0].driver_pin] == f2.ins[0]
+                    line = f2.ins[0]
+                else:
+                    nonfork_annotations += 1
+                    if nonfork_annotations < 10:
+                        log.warn(f'No branchfork between {c1.name}/{p1} and {c2.name}/{p2}, using {c2.name}/{p2}')
+                    line = c2.ins[p2]
             delays[line, :] = delvals
 
+        if nonfork_annotations > 0:
+            log.warn(f'{nonfork_annotations} interconnect annotations were moved to gate inputs due to missing forks.')
+
         return np.moveaxis(delays, -1, 0)
 
 
@@ -156,6 +166,10 @@ class SdfTransformer(Transformer):
         entries = [e for a in args if hasattr(a, 'children') for e in a.children]
         return name, entries
 
+    @staticmethod
+    def cond(args):  # ignore conditions
+        return args[1]
+
     @staticmethod
     def start(args):
         name = next((a for a in args if isinstance(a, str)), None)
@@ -180,9 +194,12 @@ GRAMMAR = r"""
         | "(INSTANCE" ID? ")"
         | "(TIMINGCHECK" _ignore* ")"
         | delay )* ")"
-    delay: "(DELAY" "(ABSOLUTE" (interconnect | iopath)* ")" ")"
+    delay: "(DELAY" "(ABSOLUTE" (interconnect | iopath | cond)* ")" ")"
     interconnect: "(INTERCONNECT" ID ID triple* ")"
     iopath: "(IOPATH" ID_OR_EDGE ID_OR_EDGE triple* ")"
+    cond: "(" "COND" cond_port_expr iopath ")"
+    ?cond_port_expr: ID | "(" cond_port_expr ")" | cond_port_expr BINARY_OP cond_port_expr
+    BINARY_OP: /&&/ | /==/
     NAME: /[^"]+/
     ID_OR_EDGE: ( /[^() ]+/ | "(" /[^)]+/ ")" )
     ID: ( /[^"() ]+/ | "\"" /[^"]+/ "\"" )
diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py
index de21b27..32b7459 100644
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@@ -4,9 +4,14 @@ from bisect import bisect, insort_left
 
 import numpy as np
 
+from .circuit import Circuit
+
 BUF1 = np.uint16(0b1010_1010_1010_1010)
 INV1 = ~BUF1
 
+__const0__ = BUF1
+__const1__ = INV1
+
 AND2 = np.uint16(0b1000_1000_1000_1000)
 AND3 = np.uint16(0b1000_0000_1000_0000)
 AND4 = np.uint16(0b1000_0000_0000_0000)
@@ -39,7 +44,10 @@ AOI211, OAI211 = ~AO211, ~OA211
 
 MUX21 = np.uint16(0b1100_1010_1100_1010)  # z = i1 if i2 else i0 (i2 is select)
 
-names = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16)])
+names = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16) and '__' not in k])
+
+prim2name = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16) and '__' not in k])
+name2prim = dict([(k, v) for k, v in globals().items() if isinstance(v, np.uint16)])
 
 kind_prefixes = {
     'nand': (NAND4, NAND3, NAND2),
@@ -156,7 +164,7 @@ class SimOps:
     :param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
         memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
     """
-    def __init__(self, circuit, c_caps=1, c_caps_min=1, a_ctrl=None, c_reuse=False, strip_forks=False):
+    def __init__(self, circuit: Circuit, c_caps=1, c_caps_min=1, a_ctrl=None, c_reuse=False, strip_forks=False):
         self.circuit = circuit
         self.s_len = len(circuit.s_nodes)
 
@@ -175,84 +183,74 @@ class SimOps:
         self.ppo_offset = self.ppi_offset + self.s_len
         self.c_locs_len = self.ppo_offset + self.s_len
 
-        # translate circuit structure into self.ops
-        ops = []
-        interface_dict = dict((n, i) for i, n in enumerate(circuit.s_nodes))
-        for n in circuit.topological_order():
-            if n in interface_dict:
-                inp_idx = self.ppi_offset + interface_dict[n]
-                if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
-                    ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[0]]))
-                if 'dff' in n.kind.lower():  # second output of DFF is inverted
-                    if len(n.outs) > 1 and n.outs[1] is not None:
-                        ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[n.outs[1]]))
-                else:  # if not DFF, no output is inverted.
-                    for o_line in n.outs[1:]:
-                        if o_line is not None:
-                            ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx, *a_ctrl[o_line]))
-                continue
-            # regular node, not PI/PPI or PO/PPO
-            o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
-            i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx
-            i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx
-            i2_idx = n.ins[2].index if len(n.ins) > 2 and n.ins[2] is not None else self.zero_idx
-            i3_idx = n.ins[3].index if len(n.ins) > 3 and n.ins[3] is not None else self.zero_idx
-            kind = n.kind.lower()
-            if kind == '__fork__':
-                if not strip_forks:
-                    for o_line in n.outs:
-                        if o_line is not None:
-                            ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o_line]))
-                continue
-            sp = None
-            for prefix, prims in kind_prefixes.items():
-                if kind.startswith(prefix):
-                    sp = prims[0]
-                    if i3_idx == self.zero_idx:
-                        sp = prims[1]
-                        if i2_idx == self.zero_idx:
-                            sp = prims[2]
-                    break
-            if sp is None:
-                print('unknown cell type', kind)
-            else:
-                ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx, *a_ctrl[o0_idx]))
-
-        self.ops = np.asarray(ops, dtype='int32')
+        # ALAP-toposort the circuit into self.ops
+        levels = []
+
+        ppio2idx = dict((n, i) for i, n in enumerate(circuit.s_nodes))
+        ppos = set([n for n in circuit.s_nodes if len(n.ins) > 0])
+        readers = np.array([1 if l.reader in ppos else len(l.reader.outs) for l in circuit.lines], dtype=np.int32)  # for ref-counting forks
+
+        level_lines = [n.ins[0] for n in ppos]  # start from PPOs
+        # FIXME: Should probably instanciate buffers for PPOs and attach DFF clocks
+
+        while len(level_lines) > 0:  # traverse the circuit level-wise back towards (P)PIs
+            level_ops = []
+            prev_level_lines = []
+
+            for l in level_lines:
+                n = l.driver
+                in_idxs = [n.ins[x].index if len(n.ins) > x and n.ins[x] is not None else self.zero_idx for x in [0,1,2,3]]
+                if n in ppio2idx:
+                    in_idxs[0] = self.ppi_offset + ppio2idx[n]
+                    if l.driver_pin == 1 and 'dff' in n.kind.lower():  # second output of DFF is inverted
+                        level_ops.append((INV1, l.index, *in_idxs, *a_ctrl[l]))
+                    else:
+                        level_ops.append((BUF1, l.index, *in_idxs, *a_ctrl[l]))
+                elif n.kind == '__fork__':
+                    readers[n.ins[0]] -= 1
+                    if readers[n.ins[0]] == 0: prev_level_lines.append(n.ins[0])
+                    if not strip_forks: level_ops.append((BUF1, l.index, *in_idxs, *a_ctrl[l]))
+                else:
+                    prev_level_lines += n.ins
+                    sp = None
+                    kind = n.kind.lower()
+                    for prefix, prims in kind_prefixes.items():
+                        if kind.startswith(prefix):
+                            sp = prims[0]
+                            if in_idxs[3] == self.zero_idx:
+                                sp = prims[1]
+                                if in_idxs[2] == self.zero_idx:
+                                    sp = prims[2]
+                            break
+                    if sp is None:
+                        print('unknown cell type', kind)
+                    else:
+                        level_ops.append((sp, l.index, *in_idxs, *a_ctrl[l]))
+
+            if len(level_ops) > 0: levels.append(level_ops)
+            level_lines = prev_level_lines
+
+        self.levels = [np.asarray(lv, dtype=np.int32) for lv in levels[::-1]]
+        level_sums = np.cumsum([0]+[len(lv) for lv in self.levels], dtype=np.int32)
+        self.level_starts, self.level_stops = level_sums[:-1], level_sums[1:]
+        self.ops = np.vstack(self.levels)
 
         # create a map from fanout lines to stem lines for fork stripping
-        stems = np.zeros(self.c_locs_len, dtype='int32') - 1  # default to -1: 'no fanout line'
+        stems = np.full(self.c_locs_len, -1, dtype=np.int32)  # default to -1: 'no fanout line'
         if strip_forks:
             for f in circuit.forks.values():
                 prev_line = f.ins[0]
                 while prev_line.driver.kind == '__fork__':
                     prev_line = prev_line.driver.ins[0]
-                stem_idx = prev_line.index
                 for ol in f.outs:
                     if ol is not None:
-                        stems[ol] = stem_idx
-
-        # calculate level (distance from PI/PPI) and reference count for each line
-        levels = np.zeros(self.c_locs_len, dtype='int32')
-        ref_count = np.zeros(self.c_locs_len, dtype='int32')
-        level_starts = [0]
-        current_level = 1
-        for i, op in enumerate(self.ops):
-            # if we fork-strip, always take the stems for determining fan-in level
-            i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
-            i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
-            i2_idx = stems[op[4]] if stems[op[4]] >= 0 else op[4]
-            i3_idx = stems[op[5]] if stems[op[5]] >= 0 else op[5]
-            if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level or levels[i2_idx] >= current_level or levels[i3_idx] >= current_level:
-                current_level += 1
-                level_starts.append(i)
-            levels[op[1]] = current_level  # set level of the output line
-            ref_count[i0_idx] += 1
-            ref_count[i1_idx] += 1
-            ref_count[i2_idx] += 1
-            ref_count[i3_idx] += 1
-        self.level_starts = np.asarray(level_starts, dtype='int32')
-        self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
+                        stems[ol] = prev_line.index
+
+        ref_count = np.zeros(self.c_locs_len, dtype=np.int32)
+
+        for op in self.ops:
+            for x in [2, 3, 4, 5]:
+                ref_count[stems[op[x]] if stems[op[x]] >= 0 else op[x]] += 1
 
         # combinational signal allocation table. maps line and interface indices to self.c memory locations
         self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32)
@@ -278,9 +276,9 @@ class SimOps:
                 ref_count[i0_idx] += 1
 
         # allocate memory for the rest of the circuit
-        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+        for ops in self.levels:
             free_set = set()
-            for op in self.ops[op_start:op_stop]:
+            for op in ops:
                 # if we fork-strip, always take the stems
                 i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
                 i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
@@ -299,7 +297,8 @@ class SimOps:
                 self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
             if c_reuse:
                 for loc in free_set:
-                    h.free(loc)
+                    if loc >= 0:  # DFF clocks are not allocated. Ignore for now.
+                        h.free(loc)
 
         # copy memory location and capacity from stems to fanout lines
         for lidx, stem in enumerate(stems):
@@ -311,6 +310,15 @@ class SimOps:
             if len(n.ins) > 0:
                 self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]]
 
+        # line use information
+        self.line_use_start = np.full(self.c_locs_len, -1, dtype=np.int32)
+        self.line_use_stop = np.full(self.c_locs_len, len(self.levels), dtype=np.int32)
+        for i, lv in enumerate(self.levels):
+            for op in lv:
+                self.line_use_start[op[1]] = i
+                for x in [2, 3, 4, 5]:
+                    self.line_use_stop[op[x]] = i
+
         self.c_len = h.max_size
 
         d = defaultdict(int)
diff --git a/src/kyupy/stil.py b/src/kyupy/stil.py
index 98cc2df..0b217ee 100644
--- a/src/kyupy/stil.py
+++ b/src/kyupy/stil.py
@@ -41,7 +41,7 @@ class StilFile:
                 unload = {}
                 for so_port in self.so_ports:
                     if so_port in call.parameters:
-                        unload[so_port] = call.parameters[so_port].replace('\n', '').replace('N', '-')
+                        unload[so_port] = call.parameters[so_port]
                 if len(capture) > 0:
                     self.patterns.append(ScanPattern(sload, launch, capture, unload))
                     capture = {}
@@ -49,11 +49,9 @@ class StilFile:
                 sload = {}
                 for si_port in self.si_ports:
                     if si_port in call.parameters:
-                        sload[si_port] = call.parameters[si_port].replace('\n', '').replace('N', '-')
-            if call.name.endswith('_launch'):
-                launch = dict((k, v.replace('\n', '').replace('N', '-')) for k, v in call.parameters.items())
-            if call.name.endswith('_capture'):
-                capture = dict((k, v.replace('\n', '').replace('N', '-')) for k, v in call.parameters.items())
+                        sload[si_port] = call.parameters[si_port]
+            if call.name.endswith('_launch'): launch = call.parameters
+            if call.name.endswith('_capture'): capture = call.parameters
 
     def _maps(self, c):
         interface = list(c.io_nodes) + [n for n in c.nodes if 'DFF' in n.kind]
@@ -100,12 +98,12 @@ class StilFile:
         tests = np.full((len(interface), len(self.patterns)), logic.UNASSIGNED)
         for i, p in enumerate(self.patterns):
             for si_port in self.si_ports.keys():
-                pattern = logic.mvarray(p.load[si_port])
+                pattern = logic.mvarray(p.load[si_port][0])
                 inversions = np.choose((pattern == logic.UNASSIGNED) | (pattern == logic.UNKNOWN),
                                        [scan_inversions[si_port], logic.ZERO]).astype(np.uint8)
                 np.bitwise_xor(pattern, inversions, out=pattern)
                 tests[scan_maps[si_port], i] = pattern
-            tests[pi_map, i] = logic.mvarray(p.capture['_pi'])
+            tests[pi_map, i] = logic.mvarray(p.capture['_pi'][0])
         return tests
 
     def tests_loc(self, circuit, init_filter=None, launch_filter=None):
@@ -134,12 +132,12 @@ class StilFile:
         for i, p in enumerate(self.patterns):
             # init.set_values(i, '0' * len(interface))
             for si_port in self.si_ports.keys():
-                pattern = logic.mvarray(p.load[si_port])
+                pattern = logic.mvarray(p.load[si_port][0])
                 inversions = np.choose((pattern == logic.UNASSIGNED) | (pattern == logic.UNKNOWN),
                                        [scan_inversions[si_port], logic.ZERO]).astype(np.uint8)
                 np.bitwise_xor(pattern, inversions, out=pattern)
                 init[scan_maps[si_port], i] = pattern
-            init[pi_map, i] = logic.mvarray(p.launch['_pi'] if '_pi' in p.launch else p.capture['_pi'])
+            init[pi_map, i] = logic.mvarray(p.launch['_pi'][0] if '_pi' in p.launch else p.capture['_pi'][0])
         if init_filter: init = init_filter(init)
         sim8v = LogicSim(circuit, init.shape[-1], m=8)
         sim8v.s[0] = logic.mv_to_bp(init)
@@ -149,12 +147,12 @@ class StilFile:
         launch = logic.bp_to_mv(sim8v.s[1])[..., :init.shape[-1]]
         for i, p in enumerate(self.patterns):
             # if there was no launch cycle or launch clock, then init = launch
-            if '_pi' not in p.launch or 'P' not in p.launch['_pi'] or 'P' not in p.capture['_pi']:
+            if '_pi' not in p.launch or 'P' not in p.launch['_pi'][0] or 'P' not in p.capture['_pi'][0]:
                 for si_port in self.si_ports.keys():
-                    pattern = logic.mv_xor(logic.mvarray(p.load[si_port]), scan_inversions[si_port])
+                    pattern = logic.mv_xor(logic.mvarray(p.load[si_port][0]), scan_inversions[si_port])
                     launch[scan_maps[si_port], i] = pattern
-            if '_pi' in p.capture and 'P' in p.capture['_pi']:
-                launch[pi_map, i] = logic.mvarray(p.capture['_pi'])
+            if '_pi' in p.capture and 'P' in p.capture['_pi'][0]:
+                launch[pi_map, i] = logic.mvarray(p.capture['_pi'][0])
             launch[po_map, i] = logic.UNASSIGNED
         if launch_filter: launch = launch_filter(launch)
 
@@ -171,9 +169,9 @@ class StilFile:
         interface, _, po_map, scan_maps, scan_inversions = self._maps(circuit)
         resp = np.full((len(interface), len(self.patterns)), logic.UNASSIGNED)
         for i, p in enumerate(self.patterns):
-            resp[po_map, i] = logic.mvarray(p.capture['_po'] if len(p.capture) > 0 else p.launch['_po'])
+            resp[po_map, i] = logic.mvarray(p.capture['_po'][0] if len(p.capture) > 0 else p.launch['_po'][0])
             for so_port in self.so_ports.keys():
-                pattern = logic.mv_xor(logic.mvarray(p.unload[so_port]), scan_inversions[so_port])
+                pattern = logic.mv_xor(logic.mvarray(p.unload[so_port][0]), scan_inversions[so_port])
                 resp[scan_maps[so_port], i] = pattern
         return resp
 
@@ -192,7 +190,7 @@ class StilTransformer(Transformer):
     def call(args): return Call(args[0], dict(args[1:]))
 
     @staticmethod
-    def call_parameter(args): return args[0], args[1].value
+    def call_parameter(args): return args[0], (args[1].value.replace('\n', '').replace('N', '-'), args[1].start_pos)
 
     @staticmethod
     def signal_group(args): return args[0], args[1:]
diff --git a/src/kyupy/techlib.py b/src/kyupy/techlib.py
index ce15ed1..6304a56 100644
--- a/src/kyupy/techlib.py
+++ b/src/kyupy/techlib.py
@@ -11,50 +11,6 @@ from itertools import product
 from . import bench
 
 
-class TechLibOld:
-    @staticmethod
-    def pin_index(kind, pin):
-        if isinstance(pin, int):
-            return max(0, pin-1)
-        if kind[:3] in ('OAI', 'AOI'):
-            if pin[0] == 'A': return int(pin[1]) - 1
-            if pin == 'B': return int(kind[3])
-            if pin[0] == 'B': return int(pin[1]) - 1 + int(kind[3])
-        for prefix, pins, index in [('HADD', ('B0', 'SO'), 1),
-                                    ('HADD', ('A0', 'C1'), 0),
-                                    ('MUX21', ('S', 'S0'), 2),
-                                    ('MX2', ('S0',), 2),
-                                    ('TBUF', ('OE',), 1),
-                                    ('TINV', ('OE',), 1),
-                                    ('LATCH', ('D',), 0),
-                                    ('LATCH', ('QN',), 1),
-                                    ('DFF', ('D',), 0),
-                                    ('DFF', ('QN',), 1),
-                                    ('SDFF', ('D',), 0),
-                                    ('SDFF', ('QN',), 1),
-                                    ('SDFF', ('CLK',), 3),
-                                    ('SDFF', ('RSTB', 'RN'), 4),
-                                    ('SDFF', ('SETB',), 5),
-                                    ('ISOL', ('ISO',), 0),
-                                    ('ISOL', ('D',), 1)]:
-            if kind.startswith(prefix) and pin in pins: return index
-        for index, pins in enumerate([('A1', 'IN1', 'A', 'S', 'INP', 'I', 'Q', 'QN', 'Y', 'Z', 'ZN'),
-                                      ('A2', 'IN2', 'B', 'CK', 'CLK', 'CO', 'SE'),
-                                      ('A3', 'IN3', 'C', 'RN', 'RSTB', 'CI', 'SI'),
-                                      ('A4', 'IN4', 'D', 'SN', 'SETB'),
-                                      ('A5', 'IN5', 'E'),
-                                      ('A6', 'IN6', 'F')]):
-            if pin in pins: return index
-        raise ValueError(f'Unknown pin index for {kind}.{pin}')
-
-    @staticmethod
-    def pin_is_output(kind, pin):
-        if isinstance(pin, int):
-            return pin == 0
-        if 'MUX' in kind and pin == 'S': return False
-        return pin in ('Q', 'QN', 'Z', 'ZN', 'Y', 'CO', 'S', 'SO', 'C1')
-
-
 class TechLib:
     """Class for standard cell library definitions.
 
@@ -93,6 +49,14 @@ class TechLib:
         assert pin in self.cells[kind][1], f'Unknown pin: {pin} for cell {kind}'
         return self.cells[kind][1][pin][0]
 
+    def pin_name(self, kind, pos, output=False):
+        """Returns the pin name for a given node kind, list position, and direction."""
+        assert kind in self.cells, f'Unknown cell: {kind}'
+        for name, (ppos, isout) in self.cells[kind][1].items():
+            if isout == output and ppos == pos:
+                return name
+        return None
+
     def pin_is_output(self, kind, pin):
         """Returns True, if given pin name of a node kind is an output."""
         assert kind in self.cells, f'Unknown cell: {kind}'
@@ -138,21 +102,92 @@ TLATX1   input(C,D)       output(Q,QN) Q=LATCH(D,C) QN=INV1(Q) ;
 """
 
 
-_nangate_common = r"""
+NANGATE = TechLib(r"""
+FILLTIE ;
+FILL_X{1,2,4,8,16} ;
+ANTENNA input(I)   ;
+
+TIEH output(Z) Z=__const1__() ;
+TIEL output(ZN) ZN=__const0__() ;
+
+BUF_X{1,2,4,8,12,16}  input(I) output(Z)  Z=BUF1(I)  ;
+INV_X{1,2,4,8,12,16}  input(I) output(ZN) ZN=INV1(I) ;
+
+CLKBUF_X{1,2,4,8,12,16} input(I) output(Z)  Z=BUF1(I)  ;
+CLKGATETST_X1 input(CLK,E,TE) output(Q) Q=OA21(CLK,E,TE) ;
+
+AND2_X{1,2}  input(A1,A2)       output(Z)  Z=AND2(A1,A2)         ;
+AND3_X{1,2}  input(A1,A2,A3)    output(Z)  Z=AND3(A1,A2,A3)      ;
+AND4_X{1,2}  input(A1,A2,A3,A4) output(Z)  Z=AND4(A1,A2,A3,A4)   ;
+NAND2_X{1,2} input(A1,A2)       output(ZN) ZN=NAND2(A1,A2)       ;
+NAND3_X{1,2} input(A1,A2,A3)    output(ZN) ZN=NAND3(A1,A2,A3)    ;
+NAND4_X{1,2} input(A1,A2,A3,A4) output(ZN) ZN=NAND4(A1,A2,A3,A4) ;
+OR2_X{1,2}   input(A1,A2)       output(Z)  Z=OR2(A1,A2)          ;
+OR3_X{1,2}   input(A1,A2,A3)    output(Z)  Z=OR3(A1,A2,A3)       ;
+OR4_X{1,2}   input(A1,A2,A3,A4) output(Z)  Z=OR4(A1,A2,A3,A4)    ;
+NOR2_X{1,2}  input(A1,A2)       output(ZN) ZN=NOR2(A1,A2)        ;
+NOR3_X{1,2}  input(A1,A2,A3)    output(ZN) ZN=NOR3(A1,A2,A3)     ;
+NOR4_X{1,2}  input(A1,A2,A3,A4) output(ZN) ZN=NOR4(A1,A2,A3,A4)  ;
+XOR2_X1      input(A1,A2)       output(Z)  Z=XOR2(A1,A2)         ;
+XNOR2_X1     input(A1,A2)       output(ZN) ZN=XNOR2(A1,A2)       ;
+
+MUX2_X1 input(I0,I1,S) output(Z) Z=MUX21(I0,I1,S) ;
+
+HA_X1 input(A,B) output(CO,S) CO=XOR2(A,B) S=AND2(A,B) ;
+FA_X1 input(A,B,CI) output(CO,S) AB=XOR2(A,B) CO=XOR2(AB,CI) S=AO22(CI,A,B) ;
+
+AOI21_X{1,2} input(A1,A2,B)     output(ZN) ZN=AOI21(A1,A2,B)     ;
+OAI21_X{1,2} input(A1,A2,B)     output(ZN) ZN=OAI21(A1,A2,B)     ;
+AOI22_X{1,2} input(A1,A2,B1,B2) output(ZN) ZN=AOI22(A1,A2,B1,B2) ;
+OAI22_X{1,2} input(A1,A2,B1,B2) output(ZN) ZN=OAI22(A1,A2,B1,B2) ;
+
+DFFRNQ_X1  input(D,RN,CLK)    output(Q)  DR=AND2(D,RN) Q=DFF(DR,CLK) ;
+DFFSNQ_X1  input(D,SN,CLK)    output(Q)  S=INV1(SN) DS=OR2(D,S) Q=DFF(DS,CLK) ;
+
+SDFFRNQ_X1 input(D,RN,SE,SI,CLK) output(Q) DR=AND2(D,RN) DI=MUX21(DR,SI,SE) Q=DFF(DI,CLK) ;
+SDFFSNQ_X1 input(D,SE,SI,SN,CLK) output(Q) S=INV1(SN) DS=OR2(D,S) DI=MUX21(DS,SI,SE) Q=DFF(DI,CLK) ;
+
+TBUF_X{1,2,4,8,12,16} input(EN,I)   output(Z)  Z=BUF1(I) ;
+
+LHQ_X1 input(D,E) output(Q)  Q=LATCH(D,E) ;
+""")
+"""Nangate 15nm Open Cell Library (NanGate_15nm_OCL_v0.1_2014_06.A).
+"""
+
+
+NANGATE45 = TechLib(r"""
 FILLCELL_X{1,2,4,8,16,32} ;
+ANTENNA_X1 input(A) ;
 
 LOGIC0_X1 output(Z) Z=__const0__() ;
 LOGIC1_X1 output(Z) Z=__const1__() ;
 
 BUF_X{1,2,4,8,16,32}  input(A) output(Z)  Z=BUF1(A)  ;
-CLKBUF_X{1,2,3}       input(A) output(Z)  Z=BUF1(A)  ;
+INV_X{1,2,4,8,16,32}  input(A) output(ZN) ZN=INV1(A) ;
+
+CLKBUF_X{1,2,3}       input(A)       output(Z)   Z=BUF1(A)         ;
+CLKGATETST_X{1,2,4,8} input(CK,E,SE) output(GCK) GCK=OA21(CK,E,SE) ;
+CLKGATE_X{1,2,4,8}    input(CK,E)    output(GCK) GCK=AND2(CK,E)    ;
 
+AND2_X{1,2,4}  input(A1,A2)       output(ZN) ZN=AND2(A1,A2)        ;
+AND3_X{1,2,4}  input(A1,A2,A3)    output(ZN) ZN=AND3(A1,A2,A3)     ;
+AND4_X{1,2,4}  input(A1,A2,A3,A4) output(ZN) ZN=AND4(A1,A2,A3,A4)  ;
 NAND2_X{1,2,4} input(A1,A2)       output(ZN) ZN=NAND2(A1,A2)       ;
 NAND3_X{1,2,4} input(A1,A2,A3)    output(ZN) ZN=NAND3(A1,A2,A3)    ;
 NAND4_X{1,2,4} input(A1,A2,A3,A4) output(ZN) ZN=NAND4(A1,A2,A3,A4) ;
+OR2_X{1,2,4}   input(A1,A2)       output(ZN) ZN=OR2(A1,A2)         ;
+OR3_X{1,2,4}   input(A1,A2,A3)    output(ZN) ZN=OR3(A1,A2,A3)      ;
+OR4_X{1,2,4}   input(A1,A2,A3,A4) output(ZN) ZN=OR4(A1,A2,A3,A4)   ;
 NOR2_X{1,2,4}  input(A1,A2)       output(ZN) ZN=NOR2(A1,A2)        ;
 NOR3_X{1,2,4}  input(A1,A2,A3)    output(ZN) ZN=NOR3(A1,A2,A3)     ;
 NOR4_X{1,2,4}  input(A1,A2,A3,A4) output(ZN) ZN=NOR4(A1,A2,A3,A4)  ;
+XOR2_X{1,2}    input(A,B)         output(Z)  Z=XOR2(A,B)           ;
+XNOR2_X{1,2}   input(A,B)         output(ZN) ZN=XNOR2(A,B)         ;
+
+MUX2_X{1,2} input(A,B,S) output(Z) Z=MUX21(A,B,S) ;
+
+HA_X1 input(A,B) output(CO,S) CO=XOR2(A,B) S=AND2(A,B) ;
+FA_X1 input(A,B,CI) output(CO,S) AB=XOR2(A,B) CO=XOR2(AB,CI) S=AO22(CI,A,B) ;
 
 AOI21_X{1,2,4} input(A,B1,B2)     output(ZN) ZN=AOI21(B1,B2,A)     ;
 OAI21_X{1,2,4} input(A,B1,B2)     output(ZN) ZN=OAI21(B1,B2,A)     ;
@@ -162,8 +197,6 @@ OAI22_X{1,2,4} input(A1,A2,B1,B2) output(ZN) ZN=OAI22(A1,A2,B1,B2) ;
 OAI211_X{1,2,4} input(A,B,C1,C2) output(ZN) ZN=OAI211(C1,C2,A,B)   ;
 AOI211_X{1,2,4} input(A,B,C1,C2) output(ZN) ZN=AOI211(C1,C2,A,B)   ;
 
-MUX2_X{1,2} input(A,B,S) output(Z) Z=MUX21(A,B,S) ;
-
 AOI221_X{1,2,4} input(A,B1,B2,C1,C2) output(ZN) BC=AO22(B1,B2,C1,C2) ZN=NOR2(BC,A)  ;
 OAI221_X{1,2,4} input(A,B1,B2,C1,C2) output(ZN) BC=OA22(B1,B2,C1,C2) ZN=NAND2(BC,A) ;
 
@@ -172,14 +205,6 @@ OAI222_X{1,2,4} input(A1,A2,B1,B2,C1,C2) output(ZN) BC=OA22(B1,B2,C1,C2) ZN=OAI2
 
 OAI33_X1 input(A1,A2,A3,B1,B2,B3) output(ZN) AA=OR2(A1,A2) BB=OR2(B1,B2) ZN=OAI22(AA,A3,BB,B3) ;
 
-HA_X1 input(A,B) output(CO,S) CO=XOR2(A,B) S=AND2(A,B) ;
-
-FA_X1 input(A,B,CI) output(CO,S) AB=XOR2(A,B) CO=XOR2(AB,CI) S=AO22(CI,A,B) ;
-
-CLKGATE_X{1,2,4,8} input(CK,E) output(GCK) GCK=AND2(CK,E) ;
-
-CLKGATETST_X{1,2,4,8} input(CK,E,SE) output(GCK) GCK=OA21(CK,E,SE) ;
-
 DFF_X{1,2}   input(D,CK)       output(Q,QN)  Q=DFF(D,CK) QN=INV1(Q) ;
 DFFR_X{1,2}  input(D,RN,CK)    output(Q,QN)  DR=AND2(D,RN) Q=DFF(DR,CK) QN=INV1(Q) ;
 DFFS_X{1,2}  input(D,SN,CK)    output(Q,QN)  S=INV1(SN) DS=OR2(D,S) Q=DFF(DS,CK) QN=INV1(Q) ;
@@ -191,43 +216,16 @@ SDFFS_X{1,2}  input(D,SE,SI,SN,CK)    output(Q,QN)  S=INV1(SN) DS=OR2(D,S) DI=MU
 SDFFRS_X{1,2} input(D,RN,SE,SI,SN,CK) output(Q,QN)  S=INV1(SN) DS=OR2(D,S) DRS=AND2(DS,RN) DI=MUX21(DRS,SI,SE) Q=DFF(DI,CK) QN=INV1(Q) ;
 
 TBUF_X{1,2,4,8,16} input(A,EN)   output(Z)  Z=BUF1(A)    ;
+
 TINV_X1            input(I,EN)   output(ZN) ZN=INV1(I)   ;
 TLAT_X1            input(D,G,OE) output(Q)  Q=LATCH(D,G) ;
 
 DLH_X{1,2} input(D,G) output(Q)  Q=LATCH(D,G)            ;
 DLL_X{1,2} input(D,GN) output(Q) G=INV1(GN) Q=LATCH(D,G) ;
-"""
 
-
-NANGATE = TechLib(_nangate_common + r"""
-INV_X{1,2,4,8,16,32}  input(I) output(ZN) ZN=INV1(I) ;
-
-AND2_X{1,2,4}  input(A1,A2)       output(Z)  Z=AND2(A1,A2)        ;
-AND3_X{1,2,4}  input(A1,A2,A3)    output(Z)  Z=AND3(A1,A2,A3)     ;
-AND4_X{1,2,4}  input(A1,A2,A3,A4) output(Z)  Z=AND4(A1,A2,A3,A4)  ;
-OR2_X{1,2,4}   input(A1,A2)       output(Z)  Z=OR2(A1,A2)         ;
-OR3_X{1,2,4}   input(A1,A2,A3)    output(Z)  Z=OR3(A1,A2,A3)      ;
-OR4_X{1,2,4}   input(A1,A2,A3,A4) output(Z)  Z=OR4(A1,A2,A3,A4)   ;
-XOR2_X{1,2}    input(A1,A2)       output(Z)  Z=XOR2(A1,A2)        ;
-XNOR2_X{1,2}   input(A1,A2)       output(ZN) ZN=XNOR2(A1,A2)      ;
-""")
-"""An newer NANGATE-variant that uses 'Z' as output pin names for AND and OR gates.
-"""
-
-
-NANGATE_ZN = TechLib(_nangate_common + r"""
-INV_X{1,2,4,8,16,32}  input(A) output(ZN) ZN=INV1(A) ;
-
-AND2_X{1,2,4}  input(A1,A2)       output(ZN) ZN=AND2(A1,A2)        ;
-AND3_X{1,2,4}  input(A1,A2,A3)    output(ZN) ZN=AND3(A1,A2,A3)     ;
-AND4_X{1,2,4}  input(A1,A2,A3,A4) output(ZN) ZN=AND4(A1,A2,A3,A4)  ;
-OR2_X{1,2,4}   input(A1,A2)       output(ZN) ZN=OR2(A1,A2)         ;
-OR3_X{1,2,4}   input(A1,A2,A3)    output(ZN) ZN=OR3(A1,A2,A3)      ;
-OR4_X{1,2,4}   input(A1,A2,A3,A4) output(ZN) ZN=OR4(A1,A2,A3,A4)   ;
-XOR2_X{1,2}    input(A,B)         output(Z)  Z=XOR2(A,B)           ;
-XNOR2_X{1,2}   input(A,B)         output(ZN) ZN=XNOR2(A,B)         ;
 """)
-"""An older NANGATE-variant that uses 'ZN' as output pin names for AND and OR gates.
+"""Nangate 45nm Open Cell Library (NangateOpenCellLibrary_PDKv1_3_v2010_12).
+This NANGATE-variant that uses 'ZN' as output pin names for AND and OR gates.
 """
 
 
diff --git a/src/kyupy/verilog.py b/src/kyupy/verilog.py
index 1c6a0d1..bcb982f 100644
--- a/src/kyupy/verilog.py
+++ b/src/kyupy/verilog.py
@@ -73,7 +73,7 @@ class VerilogTransformer(Transformer):
         elif "'" in args[0]:
             width, rest = args[0].split("'")
             width = int(width)
-            base, const = rest[0], rest[1:]
+            base, const = rest[0], rest[1:].replace('x','0')
             const = int(const, {'b': 2, 'd':10, 'h':16}[base.lower()])
             l = []
             for _ in range(width):
@@ -92,6 +92,14 @@ class VerilogTransformer(Transformer):
                 sigs.append(a)
         return sigs
 
+    def ternaryif(self, args):
+        sel = args[0]
+        ctrue = args[1]
+        cfalse = args[2]
+        print(f"got ternary if {args[0]} {args[1]}")
+        
+        return args[1]
+
     def declaration(self, kind, args):
         rnge = None
         if isinstance(args[0], range):
@@ -123,6 +131,9 @@ class VerilogTransformer(Transformer):
         assignments = []
         for stmt in args[2:]:  # pass 1: instantiate cells and driven signals
             if isinstance(stmt, Instantiation):
+                if stmt.type not in self.tlib.cells:
+                    log.warn(f'Ignoring cell of unknown kind "{stmt.type}"')
+                    continue
                 n = Node(c, stmt.name, kind=stmt.type)
                 for p, s in stmt.pins.items():
                     if self.tlib.pin_is_output(n.kind, p):
@@ -141,35 +152,50 @@ class VerilogTransformer(Transformer):
                         c.io_nodes[positions[name]] = n
                     if sd.kind == 'input':
                         Line(c, n, Node(c, name))
-        for target, source in assignments:  # pass 1.5: process signal assignments
-            target_sigs = []
-            if not isinstance(target, list): target = [target]
-            for s in target:
-                if s in sig_decls:
-                    target_sigs += sig_decls[s].names
-                else:
-                    target_sigs.append(s)
-            source_sigs = []
-            if not isinstance(source, list): source = [source]
-            for s in source:
-                if s in sig_decls:
-                    source_sigs += sig_decls[s].names
-                else:
-                    source_sigs.append(s)
-            for t, s in zip(target_sigs, source_sigs):
-                if t in c.forks:
-                    assert s not in c.forks, 'assignment between two driven signals'
-                    Line(c, c.forks[t], Node(c, s))
-                elif s in c.forks:
-                    assert t not in c.forks, 'assignment between two driven signals'
-                    Line(c, c.forks[s], Node(c, t))
-                elif s.startswith("1'b"):
-                    cnode = Node(c, f'__const{s[3]}_{const_count}__', f'__const{s[3]}__')
-                    const_count += 1
-                    Line(c, cnode, Node(c, t))
+        deferred_assignments = set()
+        ignored = 0
+        while len(assignments) > 0:
+            more_assignments = []
+            for target, source in assignments:  # pass 1.5: process signal assignments
+                target_sigs = []
+                if not isinstance(target, list): target = [target]
+                for s in target:
+                    if s in sig_decls:
+                        target_sigs += sig_decls[s].names
+                    else:
+                        target_sigs.append(s)
+                source_sigs = []
+                if not isinstance(source, list): source = [source]
+                for s in source:
+                    if s in sig_decls:
+                        source_sigs += sig_decls[s].names
+                    else:
+                        source_sigs.append(s)
+                for t, s in zip(target_sigs, source_sigs):
+                    if t in c.forks:
+                        assert s not in c.forks, f'assignment between two driven signals: source={s} target={t}'
+                        Line(c, c.forks[t], Node(c, s))
+                    elif s in c.forks:
+                        assert t not in c.forks, 'assignment between two driven signals'
+                        Line(c, c.forks[s], Node(c, t))
+                    elif s.startswith("1'b"):
+                        cnode = Node(c, f'__const{s[3]}_{const_count}__', f'__const{s[3]}__')
+                        const_count += 1
+                        Line(c, cnode, Node(c, t))
+                    else:
+                        if (t, s) in deferred_assignments:
+                            #log.info(f'ignoring: assign {t} = {s}')
+                            ignored += 1
+                        else:
+                            more_assignments.append((t, s))
+                            deferred_assignments.add((t, s)) 
+            assignments = more_assignments
+        if ignored > 0:
+            log.warn(f'ignored {ignored} assignments')
         for stmt in args[2:]:  # pass 2: connect signals to readers
             if isinstance(stmt, Instantiation):
                 for p, s in stmt.pins.items():
+                    if stmt.name not in c.cells: continue
                     n = c.cells[stmt.name]
                     if self.tlib.pin_is_output(n.kind, p): continue
                     if s.startswith("1'b"):
@@ -221,10 +247,11 @@ GRAMMAR = r"""
     pin: namedpin | sigsel
     namedpin: "." name "(" sigsel? ")"
     range: "[" /[0-9]+/ (":" /[0-9]+/)? "]"
-    sigsel: name range? | concat
+    sigsel: name range? | concat | ternaryif
     concat: "{" sigsel ( "," sigsel )*  "}"
+    ternaryif: sigsel "?" sigsel ":" sigsel
     _namelist: name ( "," name )*
-    name: ( /[a-z_][a-z0-9_]*/i | /\\[^\t \r\n]+[\t \r\n]/i | /[0-9]+'[bdh][0-9a-f]+/i )
+    name: ( /[a-z_][a-z0-9_]*/i | /\\[^\t \r\n]+[\t \r\n]/i | /[0-9]+'[bdh][x0-9a-f]+/i )
     %import common.NEWLINE
     COMMENT: /\/\*(\*(?!\/)|[^*])*\*\// | /\(\*(\*(?!\))|[^*])*\*\)/ |  "//" /(.)*/ NEWLINE
     %ignore ( /\r?\n/ | COMMENT )+
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index 93a107f..170445b 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -13,10 +13,11 @@ Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the deriv
 """
 
 import math
+from collections import defaultdict
 
 import numpy as np
 
-from . import numba, cuda, sim, cdiv
+from . import log, numba, cuda, sim, cdiv, eng
 
 
 TMAX = np.float32(2 ** 127)
@@ -59,8 +60,8 @@ class WaveSim(sim.SimOps):
         self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
         self.delays[:, :delays.shape[1]] = delays
 
-        self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
-        self.s = np.zeros((11, self.s_len, sims), dtype=np.float32)
+        self.c = np.full((self.c_len, self.sims), TMAX, dtype=np.float32)
+        self.s = np.zeros((11, self.s_len, self.sims), dtype=np.float32)
         """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
 
         The first 3 values are read by :py:func:`s_to_c`.
@@ -98,12 +99,24 @@ class WaveSim(sim.SimOps):
         self.simctl_int[0] = range(sims)  # unique seed for each sim by default, zero this to pick same delays for all sims.
         self.simctl_int[1] = 2  # random picking by default.
 
-        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
+        self.simctl_float = np.zeros((1, sims), dtype=np.float32) + 1
+        """Float array for per-simulation delay configuration.
+
+        * ``simctl_float[0]`` factor to be multiplied with each delay (default=1.0).
+        """
+
+        self.e = np.zeros((self.c_locs_len, sims, 2), dtype=np.uint8)  # aux data for each line and sim
+
+        self.error_counts = np.zeros(self.s_len, dtype=np.uint32)  # number of capture errors by PPO
+        self.lsts = np.zeros(self.s_len, dtype=np.float32)  # LST by PPO
+        self.overflows = np.zeros(self.s_len, dtype=np.uint32)  # Overflows by PPO
+
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.e, self.c_locs, self.c_caps, self.ops, self.simctl_int, self.simctl_float)])
 
     def __repr__(self):
         dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU'
         return f'{{name: "{self.circuit.name}", device: "{dev}", sims: {self.sims}, ops: {len(self.ops)}, ' + \
-               f'levels: {len(self.level_starts)}, nbytes: {self.nbytes}}}'
+               f'levels: {len(self.level_starts)}, nbytes: {eng(self.nbytes)}}}'
 
     def s_to_c(self):
         """Transfers values of sequential elements and primary inputs to the combinational portion.
@@ -116,7 +129,7 @@ class WaveSim(sim.SimOps):
         self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
         self.c[self.pippi_c_locs+2] = TMAX
 
-    def c_prop(self, sims=None, seed=1):
+    def c_prop(self, sims=None, seed=1, delta=0):
         """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
 
         :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
@@ -124,7 +137,7 @@ class WaveSim(sim.SimOps):
         """
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, self.simctl_float, seed, delta)
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         """Simulates a capture operation at all sequential elements and primary outputs.
@@ -152,7 +165,7 @@ class WaveSim(sim.SimOps):
         self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
 
 
-def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
+def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, simctl_float, seed, delta):
     overflows = int(0)
 
     lut = op[0]
@@ -162,6 +175,18 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
     c_idx = op[4]
     d_idx = op[5]
 
+    input_epoch = (ebuf[a_idx, sim, 1]|
+                   ebuf[b_idx, sim, 1]|
+                   ebuf[c_idx, sim, 1]|
+                   ebuf[d_idx, sim, 1])
+
+    output_epoch = ebuf[z_idx, sim, 1]
+
+    if (delta):
+        if input_epoch == 0 and output_epoch == 0: return 0, 0
+
+    #out_changed = output_epoch
+
     if len(delays) > 1:
         if simctl_int[1] == 0:
             delays = delays[seed]
@@ -174,6 +199,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
             delays = delays[_rnd % len(delays)]
     else:
         delays = delays[0]
+        
+    
 
     a_mem = c_locs[a_idx]
     b_mem = c_locs[b_idx]
@@ -192,10 +219,10 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
 
     z_val = z_cur
 
-    a = cbuf[a_mem + a_cur, sim] + delays[a_idx, 0, z_val]
-    b = cbuf[b_mem + b_cur, sim] + delays[b_idx, 0, z_val]
-    c = cbuf[c_mem + c_cur, sim] + delays[c_idx, 0, z_val]
-    d = cbuf[d_mem + d_cur, sim] + delays[d_idx, 0, z_val]
+    a = cbuf[a_mem + a_cur, sim] + delays[a_idx, 0, z_val] * simctl_float[0]
+    b = cbuf[b_mem + b_cur, sim] + delays[b_idx, 0, z_val] * simctl_float[0]
+    c = cbuf[c_mem + c_cur, sim] + delays[c_idx, 0, z_val] * simctl_float[0]
+    d = cbuf[d_mem + d_cur, sim] + delays[d_idx, 0, z_val] * simctl_float[0]
 
     previous_t = TMIN
 
@@ -206,27 +233,27 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
         if a == current_t:
             a_cur += 1
             inputs ^= 1
-            thresh = delays[a_idx, a_cur & 1, z_val]
-            a = cbuf[a_mem + a_cur, sim] + delays[a_idx, a_cur & 1, z_val]
-            next_t = cbuf[a_mem + a_cur, sim] + delays[a_idx, (a_cur & 1) ^ 1, z_val ^ 1]
+            thresh = delays[a_idx, a_cur & 1 ^ 1, z_val] * simctl_float[0]
+            a = cbuf[a_mem + a_cur, sim] + delays[a_idx, a_cur & 1, z_val] * simctl_float[0]
+            next_t = cbuf[a_mem + a_cur, sim] + delays[a_idx, (a_cur & 1) ^ 1, z_val ^ 1] * simctl_float[0]
         elif b == current_t:
             b_cur += 1
             inputs ^= 2
-            thresh = delays[b_idx, b_cur & 1, z_val]
-            b = cbuf[b_mem + b_cur, sim] + delays[b_idx, b_cur & 1, z_val]
-            next_t = cbuf[b_mem + b_cur, sim] + delays[b_idx, (b_cur & 1) ^ 1, z_val ^ 1]
+            thresh = delays[b_idx, b_cur & 1 ^ 1, z_val] * simctl_float[0]
+            b = cbuf[b_mem + b_cur, sim] + delays[b_idx, b_cur & 1, z_val] * simctl_float[0]
+            next_t = cbuf[b_mem + b_cur, sim] + delays[b_idx, (b_cur & 1) ^ 1, z_val ^ 1] * simctl_float[0]
         elif c == current_t:
             c_cur += 1
             inputs ^= 4
-            thresh = delays[c_idx, c_cur & 1, z_val]
-            c = cbuf[c_mem + c_cur, sim] + delays[c_idx, c_cur & 1, z_val]
-            next_t = cbuf[c_mem + c_cur, sim] + delays[c_idx, (c_cur & 1) ^ 1, z_val ^ 1]
+            thresh = delays[c_idx, c_cur & 1 ^ 1, z_val] * simctl_float[0]
+            c = cbuf[c_mem + c_cur, sim] + delays[c_idx, c_cur & 1, z_val] * simctl_float[0]
+            next_t = cbuf[c_mem + c_cur, sim] + delays[c_idx, (c_cur & 1) ^ 1, z_val ^ 1] * simctl_float[0]
         else:
             d_cur += 1
             inputs ^= 8
-            thresh = delays[d_idx, d_cur & 1, z_val]
-            d = cbuf[d_mem + d_cur, sim] + delays[d_idx, d_cur & 1, z_val]
-            next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1]
+            thresh = delays[d_idx, d_cur & 1 ^ 1, z_val] * simctl_float[0]
+            d = cbuf[d_mem + d_cur, sim] + delays[d_idx, d_cur & 1, z_val] * simctl_float[0]
+            next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1] * simctl_float[0]
 
         if (z_cur & 1) != ((lut >> inputs) & 1):
             # we generate an edge in z_mem, if ...
@@ -235,32 +262,45 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
                 or (current_t - previous_t) > thresh  # -OR- the generated hazard is wider than pulse threshold.
                 ):
                 if z_cur < (z_cap - 1):  # enough space in z_mem?
+                    #if delta and (cbuf[z_mem + z_cur, sim] != current_t):
+                    #    out_changed = 1
                     cbuf[z_mem + z_cur, sim] = current_t
                     previous_t = current_t
                     z_cur += 1
                 else:
                     overflows += 1
-                    previous_t = cbuf[z_mem + z_cur - 1, sim]
                     z_cur -= 1
+                    previous_t = cbuf[z_mem + z_cur, sim]
             else:
                 z_cur -= 1
                 previous_t = cbuf[z_mem + z_cur - 1, sim] if z_cur > 0 else TMIN
 
             # output value of cell changed. update all delayed inputs.
             z_val = z_val ^ 1
-            a = cbuf[a_mem + a_cur, sim] + delays[a_idx, a_cur & 1, z_val]
-            b = cbuf[b_mem + b_cur, sim] + delays[b_idx, b_cur & 1, z_val]
-            c = cbuf[c_mem + c_cur, sim] + delays[c_idx, c_cur & 1, z_val]
-            d = cbuf[d_mem + d_cur, sim] + delays[d_idx, d_cur & 1, z_val]
+            a = cbuf[a_mem + a_cur, sim] + delays[a_idx, a_cur & 1, z_val] * simctl_float[0]
+            b = cbuf[b_mem + b_cur, sim] + delays[b_idx, b_cur & 1, z_val] * simctl_float[0]
+            c = cbuf[c_mem + c_cur, sim] + delays[c_idx, c_cur & 1, z_val] * simctl_float[0]
+            d = cbuf[d_mem + d_cur, sim] + delays[d_idx, d_cur & 1, z_val] * simctl_float[0]
 
         current_t = min(a, b, c, d)
 
+    #if delta and (cbuf[z_mem + z_cur, sim] != TMAX):
+    #    out_changed = 1
+
     # generate or propagate overflow flag
     cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
 
     nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN))
     nfall = z_cur // 2
 
+    e = int(((cbuf[z_mem, sim] == TMIN) << 1) & 2)  # initial value
+    e |= z_val  # final value
+    e |= (nrise + nfall)<<2  # number of transitions
+
+    ebuf[z_idx, sim, 0] = e
+
+    ebuf[z_idx, sim, 1] = input_epoch #& out_changed
+
     return nrise, nfall
 
 
@@ -268,11 +308,11 @@ wave_eval_cpu = numba.njit(_wave_eval)
 
 
 @numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed):
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, simctl_float, seed, delta):
     for op_idx in range(op_start, op_stop):
         op = ops[op_idx]
         for sim in range(sim_start, sim_stop):
-            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed)
+            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], simctl_float[:, sim], seed, delta)
             a_loc = op[6]
             a_wr = op[7]
             a_wf = op[8]
@@ -344,32 +384,51 @@ class WaveSimCuda(WaveSim):
         self.c_caps = cuda.to_device(self.c_caps)
         self.delays = cuda.to_device(self.delays)
         self.simctl_int = cuda.to_device(self.simctl_int)
+        self.simctl_float = cuda.to_device(self.simctl_float)
         self.abuf = cuda.to_device(self.abuf)
+        self.e = cuda.to_device(self.e)
+        self.error_counts = cuda.to_device(self.error_counts)
+        self.lsts = cuda.to_device(self.lsts)
+        self.overflows = cuda.to_device(self.overflows)
+
+        self.aux = cuda.to_device(np.zeros(8*1024, dtype=np.int32))
 
         self._block_dim = (32, 16)
 
     def __getstate__(self):
         state = self.__dict__.copy()
-        state['c'] = np.array(self.c)
+        del state['c']
         state['s'] = np.array(self.s)
         state['ops'] = np.array(self.ops)
         state['c_locs'] = np.array(self.c_locs)
         state['c_caps'] = np.array(self.c_caps)
         state['delays'] = np.array(self.delays)
         state['simctl_int'] = np.array(self.simctl_int)
+        state['simctl_float'] = np.array(self.simctl_float)
         state['abuf'] = np.array(self.abuf)
+        state['e'] = np.array(self.e)
+        state['error_counts'] = np.array(self.error_counts)
+        state['lsts'] = np.array(self.lsts)
+        state['overflows'] = np.array(self.overflows)
+        state['aux'] = np.array(self.aux)
         return state
 
     def __setstate__(self, state):
         self.__dict__.update(state)
-        self.c = cuda.to_device(self.c)
+        self.c = cuda.to_device(np.full((self.c_len, self.sims), TMAX, dtype=np.float32))
         self.s = cuda.to_device(self.s)
         self.ops = cuda.to_device(self.ops)
         self.c_locs = cuda.to_device(self.c_locs)
         self.c_caps = cuda.to_device(self.c_caps)
         self.delays = cuda.to_device(self.delays)
         self.simctl_int = cuda.to_device(self.simctl_int)
+        self.simctl_float = cuda.to_device(self.simctl_float)
         self.abuf = cuda.to_device(self.abuf)
+        self.e = cuda.to_device(self.e)
+        self.error_counts = cuda.to_device(self.error_counts)
+        self.lsts = cuda.to_device(self.lsts)
+        self.overflows = cuda.to_device(self.overflows)
+        self.aux = cuda.to_device(self.aux)
 
     def s_to_c(self):
         grid_dim = self._grid_dim(self.sims, self.s_len)
@@ -377,14 +436,24 @@ class WaveSimCuda(WaveSim):
 
     def _grid_dim(self, x, y): return cdiv(x, self._block_dim[0]), cdiv(y, self._block_dim[1])
 
-    def c_prop(self, sims=None, seed=1):
+    def c_prop(self, sims=None, seed=1, op_from=0, op_to=None, delta=0):
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            if op_from > op_start: continue
+            if op_to is not None and op_to <= op_start: break
             grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0),
-                sims, self.delays, self.simctl_int, seed)
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
+                sims, self.delays, self.simctl_int, self.simctl_float, seed, delta)
         cuda.synchronize()
 
+    def c_prop_level(self, level, sims=None, seed=1, delta=0):
+        sims = min(sims or self.sims, self.sims)
+        op_start = self.level_starts[level]
+        op_stop = self.level_stops[level]
+        grid_dim = self._grid_dim(sims, op_stop - op_start)
+        wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
+            sims, self.delays, self.simctl_int, self.simctl_float, seed, delta)
+
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         grid_dim = self._grid_dim(self.sims, self.s_len)
         wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset,
@@ -394,6 +463,77 @@ class WaveSimCuda(WaveSim):
         grid_dim = self._grid_dim(self.sims, self.s_len)
         ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)
 
+    def acc_error_counts(self, sims=None):
+        sims = min(sims or self.sims, self.sims)
+        grid_dim = cdiv(self.s_len, 256)
+        acc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)
+
+    def reset_error_counts(self):
+        self.error_counts[:] = 0
+
+    def get_error_counts(self):
+        return np.array(self.error_counts)
+
+    def acc_overflows(self, sims=None):
+        sims = min(sims or self.sims, self.sims)
+        grid_dim = cdiv(self.s_len, 256)
+        acc_overflows_gpu[grid_dim, 256](self.s, sims, self.overflows)
+
+    def reset_overflows(self):
+        self.overflows[:] = 0
+
+    def get_overflows(self):
+        return np.array(self.overflows)
+
+    def acc_lsts(self, sims=None):
+        sims = min(sims or self.sims, self.sims)
+        grid_dim = cdiv(self.s_len, 256)
+        acc_lsts_gpu[grid_dim, 256](self.s, sims, self.lsts)
+
+    def reset_lsts(self):
+        self.lsts[:] = 0.0
+
+    def get_lsts(self):
+        return np.array(self.lsts)
+
+
+@cuda.jit()
+def memcpy_gpu (src, dst, nitems):
+    tid = cuda.grid(1)
+    stride = cuda.gridDim.x * cuda.blockDim.x
+    for i in range(tid, nitems, stride):
+        dst.flat[i] = src.flat[i]
+
+
+@cuda.jit()
+def acc_error_counts_gpu(s, sims, error_counts):
+    x = cuda.grid(1)
+    if x >= s.shape[1]: return
+    cnt = 0
+    for i in range(sims):
+        cnt += (s[6,x,i] != s[8,x,i])
+    error_counts[x] += cnt
+
+
+@cuda.jit()
+def acc_overflows_gpu(s, sims, overflows):
+    x = cuda.grid(1)
+    if x >= s.shape[1]: return
+    cnt = 0
+    for i in range(sims):
+        cnt += s[10,x,i]
+    overflows[x] += cnt
+
+
+@cuda.jit()
+def acc_lsts_gpu(s, sims, lsts):
+    x = cuda.grid(1)
+    if x >= s.shape[1]: return
+    lst = 0
+    for i in range(sims):
+        lst = max(lst, s[5,x,i])
+    lsts[x] = max(lsts[x], lst)
+
 
 @cuda.jit()
 def wave_assign_gpu(c, s, c_locs, ppi_offset):
@@ -423,7 +563,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
 
 
 @cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, simctl_float, seed, delta):
     x, y = cuda.grid(2)
     sim = sim_start + x
     op_idx = op_start + y
@@ -435,7 +575,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start,
     a_wr = op[7]
     a_wf = op[8]
 
-    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed)
+    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], simctl_float[:, sim], seed, delta)
 
     # accumulate WSA into abuf
     if a_loc >= 0:
diff --git a/tests/b15_4ig.sa_rf.stil.gz b/tests/b15_4ig.sa_rf.stil.gz
new file mode 100644
index 0000000..437a3e0
Binary files /dev/null and b/tests/b15_4ig.sa_rf.stil.gz differ
diff --git a/tests/conftest.py b/tests/conftest.py
index c73bdc5..7bf9779 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,8 +13,44 @@ def b15_2ig_circuit(mydir):
     from kyupy.techlib import SAED32
     return verilog.load(mydir / 'b15_2ig.v.gz', branchforks=True, tlib=SAED32)
 
+@pytest.fixture(scope='session')
+def b15_2ig_circuit_resolved(b15_2ig_circuit):
+    from kyupy.techlib import SAED32
+    cr = b15_2ig_circuit.copy()
+    cr.resolve_tlib_cells(SAED32)
+    return cr
+
+@pytest.fixture(scope='session')
+def b15_4ig_circuit(mydir):
+    from kyupy import verilog
+    from kyupy.techlib import SAED32
+    return verilog.load(mydir / 'b15_4ig.v.gz', branchforks=True, tlib=SAED32)
+
+@pytest.fixture(scope='session')
+def b15_4ig_circuit_resolved(b15_4ig_circuit):
+    from kyupy.techlib import SAED32
+    cr = b15_4ig_circuit.copy()
+    cr.resolve_tlib_cells(SAED32)
+    return cr
+
 @pytest.fixture(scope='session')
 def b15_2ig_delays(mydir, b15_2ig_circuit):
     from kyupy import sdf
     from kyupy.techlib import SAED32
     return sdf.load(mydir / 'b15_2ig.sdf.gz').iopaths(b15_2ig_circuit, tlib=SAED32)[1:2]
+
+@pytest.fixture(scope='session')
+def b15_2ig_sa_nf_test_resp(mydir, b15_2ig_circuit_resolved):
+    from kyupy import stil
+    s = stil.load(mydir / 'b15_2ig.sa_nf.stil.gz')
+    tests = s.tests(b15_2ig_circuit_resolved)[:,1:]
+    resp = s.responses(b15_2ig_circuit_resolved)[:,1:]
+    return (tests, resp)
+
+@pytest.fixture(scope='session')
+def b15_4ig_sa_rf_test_resp(mydir, b15_4ig_circuit_resolved):
+    from kyupy import stil
+    s = stil.load(mydir / 'b15_4ig.sa_rf.stil.gz')
+    tests = s.tests(b15_4ig_circuit_resolved)[:,1:]
+    resp = s.responses(b15_4ig_circuit_resolved)[:,1:]
+    return (tests, resp)
diff --git a/tests/gates.sdf b/tests/gates.sdf
index 1652ff0..529a06b 100644
--- a/tests/gates.sdf
+++ b/tests/gates.sdf
@@ -7,22 +7,49 @@
 (TEMPERATURE 25.00:25.00:25.00)
 (TIMESCALE 1ns)
 (CELL
-  (CELLTYPE "NAND2X1")
+  (CELLTYPE "NAND2_X1")
   (INSTANCE nandgate)
   (DELAY
     (ABSOLUTE
-    (IOPATH IN1 QN (0.099:0.103:0.103) (0.122:0.127:0.127))
-    (IOPATH IN2 QN (0.083:0.086:0.086) (0.100:0.104:0.104))
+    (IOPATH A1 ZN (0.099:0.103:0.103) (0.122:0.127:0.127))
+    (IOPATH A2 ZN (0.083:0.086:0.086) (0.100:0.104:0.104))
     )
   )
 )
 (CELL
-  (CELLTYPE "AND2X1")
+  (CELLTYPE "AND2_X1")
   (INSTANCE andgate)
   (DELAY
     (ABSOLUTE
-    (IOPATH IN1 Q (0.367:0.378:0.378) (0.351:0.377:0.377))
-    (IOPATH IN2 Q (0.366:0.375:0.375) (0.359:0.370:0.370))
+    (IOPATH A1 ZN (0.367:0.378:0.378) (0.351:0.377:0.377))
+    (IOPATH A2 ZN (0.366:0.375:0.375) (0.359:0.370:0.370))
+    )
+  )
+)
+(CELL
+  (CELLTYPE "OAI21_X1")
+  (INSTANCE oai21gate)
+  (DELAY 
+    (ABSOLUTE
+    (IOPATH B1 ZN (0.000:0.025:0.025) (0.000:0.013:0.013))
+    (IOPATH B2 ZN (0.000:0.030:0.030) (0.000:0.016:0.016))
+    ( COND (B1 == 1'b0) && (B2 == 1'b1) (IOPATH A ZN (0.000:0.018:0.018)))
+    (COND (B1 == 1'b1) && (B2 == 1'b0) (IOPATH A ZN (0.000:0.018:0.018) (0.000:0.016:0.016)))
+    (COND (B1 == 1'b1) && (B2 == 1'b1) (IOPATH A ZN (0.000:0.019:0.019) (0.000:0.014:0.014)))
+    )
+  )
+)
+(CELL
+  (CELLTYPE "MUX2_X1")
+  (INSTANCE mux2gate)
+  (DELAY 
+    (ABSOLUTE
+    (COND (B == 1'b0) && (S == 1'b0) (IOPATH A Z (0.000:0.037:0.037) (0.000:0.058:0.058)))
+    (COND (B == 1'b1) && (S == 1'b0) (IOPATH A Z (0.000:0.037:0.037) (0.000:0.058:0.058)))
+    (COND (A == 1'b0) && (S == 1'b1) (IOPATH B Z (0.000:0.035:0.035) (0.000:0.056:0.056)))
+    (COND (A == 1'b1) && (S == 1'b1) (IOPATH B Z (0.000:0.035:0.035) (0.000:0.056:0.056)))
+    (COND (A == 1'b0) && (B == 1'b1) (IOPATH S Z (0.000:0.047:0.047) (0.000:0.073:0.073)))
+    (COND (A == 1'b1) && (B == 1'b0) (IOPATH S Z (0.000:0.072:0.072) (0.000:0.064:0.064)))
     )
   )
 )
diff --git a/tests/gates.v b/tests/gates.v
index 2fa07cd..0925bf6 100644
--- a/tests/gates.v
+++ b/tests/gates.v
@@ -1,11 +1,15 @@
-module gates (a, b, o0, o1 );
+module gates (a, b, c, o0, o1, o2, o3 );
 input a;
 input b;
+input c;
 output o0;
 output o1;
+output o2;
+output o3;
 
-AND2X1 andgate (.IN1 ( a ) , .IN2 ( b ) , .Q ( o0 ) ) ;
-NAND2X1 nandgate (.IN1 ( a ) , .IN2 ( b ) , .QN ( o1 ) ) ;
-
+AND2_X1 andgate (.A1 ( a ) , .A2 ( b ) , .ZN ( o0 ) ) ;
+NAND2_X1 nandgate (.A1 ( a ) , .A2 ( b ) , .ZN ( o1 ) ) ;
+OAI21_X1 oai21gate (.B1(a), .B2(b), .A(c), .ZN(o2) ) ;
+MUX2_X1 mux2gate (.A(a), .B(b), .S(c), .Z(o3)) ;
 
 endmodule
\ No newline at end of file
diff --git a/tests/test_circuit.py b/tests/test_circuit.py
index 5aa3074..d4edcd3 100644
--- a/tests/test_circuit.py
+++ b/tests/test_circuit.py
@@ -1,9 +1,30 @@
 import pickle
 
-from kyupy.circuit import Circuit, Node, Line
+from kyupy.circuit import GrowingList, Circuit, Node, Line
 from kyupy import verilog, bench
 from kyupy.techlib import SAED32
 
+def test_growing_list():
+    gl = GrowingList()
+    assert gl.free_idx == 0
+    gl[0] = 1
+    assert gl.free_idx == 1
+    gl[2] = 1
+    assert gl.free_idx == 1
+    gl[0] = None
+    assert gl.free_idx == 0
+    gl[0] = 1
+    assert gl.free_idx == 1
+    gl[1] = 1
+    assert gl.free_idx == 3
+    gl.append(1)
+    assert gl.free_idx == 4
+    gl[2] = None
+    assert gl.free_idx == 2
+    gl[2] = 1
+    gl[1] = None
+    assert gl.free_idx == 1
+
 def test_lines():
     c = Circuit()
     n1 = Node(c, 'n1')
diff --git a/tests/test_logic_sim.py b/tests/test_logic_sim.py
index 5849310..85a11ff 100644
--- a/tests/test_logic_sim.py
+++ b/tests/test_logic_sim.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from kyupy.logic_sim import LogicSim
+from kyupy.logic_sim import LogicSim, LogicSim6V
 from kyupy import bench, logic, sim
 from kyupy.logic import mvarray, bparray, bp_to_mv, mv_to_bp
 
@@ -94,6 +94,30 @@ def test_4v():
         '--0XX', '--X1X', '--XXX', '--XXX'))
 
 
+def test_6v():
+    c = bench.parse('input(x, y) output(a, o, n, xo, no) a=AND2(x,y) o=OR2(x,y) n=INV1(x) xo=XOR2(x,y) no=NOR2(x,y)')
+    s = LogicSim6V(c, 36)
+    assert s.s_len == 7
+    mva = mvarray(
+        '0000101', '0101110', '0R0R1RF', '0F0F1FR', '0P0P1PN', '0N0N1NP',
+        '1001010', '1111000', '1RR10F0', '1FF10R0', '1PP10N0', '1NN10P0',
+        'R00RFRF', 'R1R1FF0', 'RRRRFPF', 'RFPNFNP', 'RPPRFRF', 'RNRNFFP',
+        'F00FRFR', 'F1F1RR0', 'FRPNRNP', 'FFFFRPR', 'FPPFRFR', 'FNFNRRP',
+        'P00PNPN', 'P1P1NN0', 'PRPRNRF', 'PFPFNFR', 'PPPPNPN', 'PNPNNNP',
+        'N00NPNP', 'N1N1PP0', 'NRRNPFP', 'NFFNPRP', 'NPPNPNP', 'NNNNPPP')
+    tests = np.copy(mva)
+    tests[2:] = logic.ZERO
+    s.s[0] = tests
+    s.s_to_c()
+    s.c_prop()
+    s.c_to_s()
+    resp = s.s[1].copy()
+
+    exp_resp = np.copy(mva)
+    exp_resp[:2] = logic.ZERO
+    np.testing.assert_allclose(resp, exp_resp)
+
+
 def test_8v():
     c = bench.parse('input(x, y) output(a, o, n, xo) a=and(x,y) o=or(x,y) n=not(x) xo=xor(x,y)')
     s = LogicSim(c, 64, m=8)
@@ -173,3 +197,64 @@ def test_b01(mydir):
     s.c_prop()
     s.c_to_s()
     bp_to_mv(s.s[1])
+
+
+def sim_and_compare(c, test_resp, m=8):
+    tests, resp = test_resp
+    lsim = LogicSim(c, m=m, sims=tests.shape[1])
+    lsim.s[0] = logic.mv_to_bp(tests)
+    lsim.s_to_c()
+    lsim.c_prop()
+    lsim.c_to_s()
+    resp_sim = logic.bp_to_mv(lsim.s[1])[:,:tests.shape[1]]
+    idxs, pats = np.nonzero(((resp == logic.ONE) & (resp_sim != logic.ONE)) | ((resp == logic.ZERO) & (resp_sim != logic.ZERO)))
+    for i, (idx, pat) in enumerate(zip(idxs, pats)):
+        if i >= 10:
+            print(f'...')
+            break
+        print(f'mismatch pattern:{pat} ppio:{idx} exp:{logic.mv_str(resp[idx,pat])} act:{logic.mv_str(resp_sim[idx,pat])}')
+    assert len(idxs) == 0
+
+def sim_and_compare_6v(c, test_resp):
+    tests, resp = test_resp
+    lsim = LogicSim6V(c, sims=tests.shape[1])
+    lsim.s[0] = tests
+    lsim.s_to_c()
+    lsim.c_prop()
+    lsim.c_to_s()
+    resp_sim = lsim.s[1]
+    idxs, pats = np.nonzero(((resp == logic.ONE) & (resp_sim != logic.ONE)) | ((resp == logic.ZERO) & (resp_sim != logic.ZERO)))
+    for i, (idx, pat) in enumerate(zip(idxs, pats)):
+        if i >= 10:
+            print(f'...')
+            break
+        print(f'mismatch pattern:{pat} ppio:{idx} exp:{logic.mv_str(resp[idx,pat])} act:{logic.mv_str(resp_sim[idx,pat])}')
+    assert len(idxs) == 0
+
+
+def test_b15_2ig_sa_2v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
+    sim_and_compare(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp, m=2)
+
+
+def test_b15_2ig_sa_4v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
+    sim_and_compare(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp, m=4)
+
+
+def test_b15_2ig_sa_6v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
+    sim_and_compare_6v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp)
+
+
+def test_b15_2ig_sa_8v(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp):
+    sim_and_compare(b15_2ig_circuit_resolved, b15_2ig_sa_nf_test_resp, m=8)
+
+
+def test_b15_4ig_sa_2v(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp):
+    sim_and_compare(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp, m=2)
+
+
+def test_b15_4ig_sa_4v(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp):
+    sim_and_compare(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp, m=4)
+
+
+def test_b15_4ig_sa_8v(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp):
+    sim_and_compare(b15_4ig_circuit_resolved, b15_4ig_sa_rf_test_resp, m=8)
diff --git a/tests/test_sdf.py b/tests/test_sdf.py
index e94285b..9ca4627 100644
--- a/tests/test_sdf.py
+++ b/tests/test_sdf.py
@@ -2,7 +2,7 @@ import numpy as np
 
 from kyupy import sdf, verilog, bench
 from kyupy.wave_sim import WaveSim, TMAX, TMIN
-from kyupy.techlib import SAED32, SAED90
+from kyupy.techlib import SAED32, NANGATE45
 
 def test_parse():
     test = '''
@@ -80,9 +80,9 @@ def test_b15(mydir):
 
 
 def test_gates(mydir):
-    c = verilog.load(mydir / 'gates.v', tlib=SAED90)
+    c = verilog.load(mydir / 'gates.v', tlib=NANGATE45)
     df = sdf.load(mydir / 'gates.sdf')
-    lt = df.iopaths(c, tlib=SAED90)[1]
+    lt = df.iopaths(c, tlib=NANGATE45)[1]
     nand_a = c.cells['nandgate'].ins[0]
     nand_b = c.cells['nandgate'].ins[1]
     and_a = c.cells['andgate'].ins[0]
diff --git a/tests/test_verilog.py b/tests/test_verilog.py
index 87bbe73..816c12b 100644
--- a/tests/test_verilog.py
+++ b/tests/test_verilog.py
@@ -1,5 +1,5 @@
 from kyupy import verilog
-from kyupy.techlib import SAED90, SAED32
+from kyupy.techlib import SAED90, SAED32, NANGATE45
 
 def test_b01(mydir):
     with open(mydir / 'b01.v', 'r') as f:
@@ -26,12 +26,12 @@ def test_b15(mydir):
 
 
 def test_gates(mydir):
-    c = verilog.load(mydir / 'gates.v', tlib=SAED90)
-    assert len(c.nodes) == 10
-    assert len(c.lines) == 10
+    c = verilog.load(mydir / 'gates.v', tlib=NANGATE45)
+    assert len(c.nodes) == 18
+    assert len(c.lines) == 21
     stats = c.stats
-    assert stats['input'] == 2
-    assert stats['output'] == 2
+    assert stats['input'] == 3
+    assert stats['output'] == 4
     assert stats['__seq__'] == 0
 
 
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
index 9a09b32..2510c28 100644
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -5,22 +5,56 @@ from kyupy.logic_sim import LogicSim
 from kyupy import logic, bench, sim
 from kyupy.logic import mvarray
 
+def test_xnor2_delays():
+    op = (sim.XNOR2, 2, 0, 1, 3, 3, -1, 0, 0)
+    #op = (0b0111, 4, 0, 1)
+    c = np.full((4*16, 1), TMAX, dtype=np.float32)  # 4 waveforms of capacity 16
+    c_locs = np.zeros((4,), dtype='int')
+    c_caps = np.zeros((4,), dtype='int')
+    ebuf = np.zeros((4, 1, 2), dtype=np.int32)
+
+    for i in range(4): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
+    delays = np.zeros((1, 4, 2, 2))
+    delays[0, 0, 0, 0] = 0.031  # A rise -> Z rise
+    delays[0, 0, 0, 1] = 0.027  # A rise -> Z fall
+    delays[0, 0, 1, 0] = 0.033  # A fall -> Z rise
+    delays[0, 0, 1, 1] = 0.037  # A fall -> Z fall
+    delays[0, 1, 0, 0] = 0.032  # B rise -> Z rise
+    delays[0, 1, 0, 1] = 0.030  # B rise -> Z fall
+    delays[0, 1, 1, 0] = 0.038  # B fall -> Z rise
+    delays[0, 1, 1, 1] = 0.036  # B fall -> Z fall
+
+    simctl_int = np.asarray([0], dtype=np.int32)
+
+    def wave_assert(inputs, output):
+        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
+        wave_eval_cpu(op, c, c_locs, c_caps, ebuf, 0, delays, simctl_int, 0, 0)
+        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[2,i], v)
+
+    wave_assert([[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX])      # XNOR(1,1) => 1
+    wave_assert([[TMAX,TMAX],[TMIN,TMAX]], [TMAX])           # XNOR(0,1) => 0
+    # using Afall/Zfall for pulse length, bug: was using Arise/Zfall
+    #wave_assert([[0.07, 0.10, TMAX], [0.0, TMAX]], [TMIN, 0.03, 0.101, 0.137, TMAX])
+    wave_assert([[0.07, 0.10, TMAX], [0.0, TMAX]], [TMIN, 0.03, TMAX])
+    wave_assert([[0.06, 0.10, TMAX], [0.0, TMAX]], [TMIN, 0.03, 0.091, 0.137, TMAX])
+
 def test_nand_delays():
     op = (sim.NAND4, 4, 0, 1, 2, 3, -1, 0, 0)
     #op = (0b0111, 4, 0, 1)
-    c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
+    c = np.full((5*16, 1), TMAX, dtype=np.float32)  # 5 waveforms of capacity 16
     c_locs = np.zeros((5,), dtype='int')
     c_caps = np.zeros((5,), dtype='int')
+    ebuf = np.zeros((4, 1, 2), dtype=np.int32)
 
     for i in range(5): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
 
     # SDF specifies IOPATH delays with respect to output polarity
     # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
     delays = np.zeros((1, 5, 2, 2))
-    delays[0, 0, 0, 0] = 0.1  # A -> Z rise delay
-    delays[0, 0, 0, 1] = 0.2  # A -> Z fall delay
-    delays[0, 0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
-    delays[0, 0, 1, 1] = 0.2  # A -> Z positive pulse limit
+    delays[0, 0, 0, 0] = 0.1  # A rise -> Z rise
+    delays[0, 0, 0, 1] = 0.2  # A rise -> Z fall
+    delays[0, 0, 1, 0] = 0.1  # A fall -> Z rise
+    delays[0, 0, 1, 1] = 0.2  # A fall -> Z fall
     delays[0, 1, :, 0] = 0.3  # as above for B -> Z
     delays[0, 1, :, 1] = 0.4
     delays[0, 2, :, 0] = 0.5  # as above for C -> Z
@@ -32,7 +66,7 @@ def test_nand_delays():
 
     def wave_assert(inputs, output):
         for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval_cpu(op, c, c_locs, c_caps, 0, delays, simctl_int)
+        wave_eval_cpu(op, c, c_locs, c_caps, ebuf, 0, delays, simctl_int, 0, 0)
         for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
 
     wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@@ -145,7 +179,7 @@ def compare_to_logic_sim(wsim: WaveSim):
     lsim.s_to_c()
     lsim.c_prop()
     lsim.c_to_s()
-    exp = logic.bp_to_mv(lsim.s[1])
+    exp = logic.bp_to_mv(lsim.s[1])[:,:tests.shape[-1]]
 
     resp[resp == logic.PPULSE] = logic.ZERO
     resp[resp == logic.NPULSE] = logic.ONE
@@ -156,13 +190,13 @@ def compare_to_logic_sim(wsim: WaveSim):
     np.testing.assert_allclose(resp, exp)
 
 
-def test_b15(b15_2ig_circuit, b15_2ig_delays):
-    compare_to_logic_sim(WaveSim(b15_2ig_circuit, b15_2ig_delays, 8))
+def test_b15(b15_2ig_circuit_resolved, b15_2ig_delays):
+    compare_to_logic_sim(WaveSim(b15_2ig_circuit_resolved, b15_2ig_delays, 8))
 
 
-def test_b15_strip_forks(b15_2ig_circuit, b15_2ig_delays):
-    compare_to_logic_sim(WaveSim(b15_2ig_circuit, b15_2ig_delays, 8, strip_forks=True))
+def test_b15_strip_forks(b15_2ig_circuit_resolved, b15_2ig_delays):
+    compare_to_logic_sim(WaveSim(b15_2ig_circuit_resolved, b15_2ig_delays, 8, strip_forks=True))
 
 
-def test_b15_cuda(b15_2ig_circuit, b15_2ig_delays):
-    compare_to_logic_sim(WaveSimCuda(b15_2ig_circuit, b15_2ig_delays, 8, strip_forks=True))
+def test_b15_cuda(b15_2ig_circuit_resolved, b15_2ig_delays):
+    compare_to_logic_sim(WaveSimCuda(b15_2ig_circuit_resolved, b15_2ig_delays, 8, strip_forks=True))