From 6520ee23ef276dce74584c57f32b8afa0c15f1ee Mon Sep 17 00:00:00 2001 From: stefan Date: Tue, 14 Mar 2023 17:40:50 +0900 Subject: [PATCH] cleanup and new intro notebook --- Demo.ipynb | 1354 ------------------------- README.rst | 12 +- docs/conf.py | 4 +- examples/Introduction.ipynb | 1873 +++++++++++++++++++++++++++++++++++ src/kyupy/bench.py | 2 +- src/kyupy/circuit.py | 23 +- src/kyupy/logic.py | 6 + src/kyupy/logic_sim.py | 135 +-- src/kyupy/wave_sim.py | 58 +- src/kyupy/wave_sim_old.py | 961 ------------------ 10 files changed, 1926 insertions(+), 2502 deletions(-) delete mode 100644 Demo.ipynb create mode 100644 examples/Introduction.ipynb delete mode 100644 src/kyupy/wave_sim_old.py diff --git a/Demo.ipynb b/Demo.ipynb deleted file mode 100644 index dde96bc..0000000 --- a/Demo.ipynb +++ /dev/null @@ -1,1354 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Loading and Exploring Gate-Level Circuits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Example of parsing the bench data format to make simple gate-level circuits." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from kyupy import bench\n", - "\n", - "# load a file\n", - "b01 = bench.load('tests/b01.bench')\n", - "\n", - "# ... or specify the circuit as string \n", - "mycircuit = bench.parse('input(a,b) output(o1,o2,o3) x=buf(a) o1=not(x) o2=buf(x) o3=buf(x)')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Circuits are objects of the class `Circuit`." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "b01" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mycircuit" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Circuits are containers for two types of elements: nodes and lines.\n", - "* A `Node` is a named entity in a circuit (e.g. a gate, a standard cell, a named signal, or a fan-out point) that has connections to other nodes.\n", - "* A `Line` is a directional 1:1 connection between two Nodes.\n", - "\n", - "Use the `dump()` method to get a string representation of all nodes and their connections." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None(0,1,2,3,4)\n", - "0:__fork__\"a\" >1\n", - "1:__fork__\"b\" \n", - "2:__fork__\"o1\" <2 \n", - "3:__fork__\"o2\" <4 \n", - "4:__fork__\"o3\" <6 \n", - "5:buf\"x\" <1 >0\n", - "6:__fork__\"x\" <0 >3 >5 >7\n", - "7:not\"o1\" <3 >2\n", - "8:buf\"o2\" <5 >4\n", - "9:buf\"o3\" <7 >6\n" - ] - } - ], - "source": [ - "print(mycircuit.dump())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first line of the dump starts with the circuit name (\"None\" for `mycircuit`), followed by the node-IDs of all the ports (inputs and outputs) of the circuit.\n", - "\n", - "Each of the following lines describes one node.\n", - "Each node in the circuit has a unique ID, a type, a name, and line-connections. 
This information is given on each line in that order.\n", - "\n", - "A line in the circuit has a unique ID, a driver node and a receiver node. The connections in the dump show the direction (\">\" for output, \"<\" for input) and the line-ID. For example in `mycircuit`: Node-0 has one output connected to Line-1, and this Line-1 is connected to the input of Node-5.\n", - "\n", - "The `io_nodes` is the list of nodes forming the ports (inputs and outputs):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0:__fork__\"a\" >1,\n", - " 1:__fork__\"b\" ,\n", - " 2:__fork__\"o1\" <2 ,\n", - " 3:__fork__\"o2\" <4 ,\n", - " 4:__fork__\"o3\" <6 ]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mycircuit.io_nodes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Nodes\n", - "\n", - "There are two types of nodes: __forks__ and __cells__.\n", - "\n", - "Forks have the special type `__fork__` while cells can be of various types (`buf`, `not`, `and`, `nor`, etc.).\n", - "Forks are used to label signals with names and to connect a one cell to multiple other cells (fan-out).\n", - "The names among all forks and among all cells within a circuit are unique.\n", - "Thus, a fork and a cell are allowed to share the same name.\n", - "\n", - "Nodes in circuits can be accessed by ID or by name." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7:not\"o1\" <3 >2" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mycircuit.nodes[7]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6:__fork__\"x\" <0 >3 >5 >7" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mycircuit.forks['x']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5:buf\"x\" <1 >0" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mycircuit.cells['x']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nodes have an `index` (the node ID), a `kind` (the type), a `name`, as well as `ins` (input pins) and `outs` (output pins)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(6, '__fork__', 'x', [0], [3, 5, 7])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n = mycircuit.nodes[6]\n", - "n.index, n.kind, n.name, n.ins, n.outs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The inputs and outputs of a node are lists containing `Line` objects." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "kyupy.circuit.Line" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(n.ins[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lines\n", - "\n", - "A line is a directional connection between one driving node (`driver`) and one reading node (`reader`).\n", - "\n", - "A line also knows to which node pins it is connected to: `driver_pin`, `reader_pin`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(5, 6:__fork__\"x\" <0 >3 >5 >7, 8:buf\"o2\" <5 >4, 1, 0)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "l = mycircuit.nodes[6].outs[1]\n", - "l.index, l.driver, l.reader, l.driver_pin, l.reader_pin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Analysis Examples\n", - "### Cell type statistics" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "defaultdict(, {'DFF': 5, 'AND': 1, 'NAND': 28, 'OR': 1, 'NOT': 10})\n" - ] - } - ], - "source": [ - "from collections import defaultdict\n", - "\n", - "counts = defaultdict(int)\n", - "\n", - "for n in b01.cells.values():\n", - " counts[n.kind] += 1\n", - "\n", - "print(counts)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tracing a scan chain" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from kyupy import verilog\n", - "\n", - "b14 = verilog.load('tests/b14.v.gz')\n", - "b14" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chain length 287\n", - "output test_so000\n", - "NBUFFX8_RVT HFSBUF_36_76\n", - "SDFFARX1_RVT wr_reg\n", - "INVX4_RVT HFSINV_691_254\n", - "INVX0_RVT HFSINV_2682_255\n", - "SDFFARX1_RVT state_reg\n", - "NBUFFX2_RVT ZBUF_55_inst_860\n", - "SDFFARX1_RVT reg3_reg_28_\n", - "SDFFARX1_RVT reg3_reg_27_\n", - "SDFFARX1_RVT reg3_reg_26_\n", - "...\n", - "NBUFFX2_RVT ZBUF_1656_inst_2160\n", - "SDFFARX1_RVT IR_reg_3_\n", - "NBUFFX2_RVT ZBUF_85_inst_865\n", - "SDFFARX1_RVT IR_reg_2_\n", - "SDFFARX1_RVT IR_reg_1_\n", - "SDFFARX1_RVT IR_reg_0_\n", - "NBUFFX2_RVT ZBUF_17_inst_905\n", - "NBUFFX4_RVT ZBUF_275_inst_906\n", - "SDFFARX1_RVT B_reg\n", - "input test_si000\n" - ] - } - ], - "source": [ - "chain = []\n", - "cell = b14.cells['test_so000']\n", - "chain.append(cell)\n", - "while len(cell.ins) > 0:\n", - " cell = cell.ins[2 if 'SDFF' in cell.kind else 0].driver\n", - " if '__fork__' not in cell.kind:\n", - " chain.append(cell)\n", - " \n", - "print('chain length', len(chain))\n", - "for c in chain[:10]:\n", - " print(c.kind, c.name)\n", - "print('...')\n", - "for c in chain[-10:]:\n", - " print(c.kind, c.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Determining Logic Depth of Nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - 
"execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from kyupy import verilog\n", - "\n", - "b14 = verilog.load('tests/b14.v.gz')\n", - "b14" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate logic level (logic depth, distance from inputs or scan flip-flops) for each node in the circuit.\n", - "Inputs and flip-flops themselves are level 0, **cells** driven by just inputs and flip-flops are level 1, and so on.\n", - "**Fork** nodes have the same level as their driver, because they do not increase the logic depth." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum logic depth: 112\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "levels = np.zeros(len(b14.nodes), dtype='uint16') # store level for each node.\n", - "\n", - "for cell in b14.topological_order():\n", - " if 'DFF' in cell.kind or 'input' == cell.kind:\n", - " levels[cell] = 0\n", - " elif '__fork__' == cell.kind:\n", - " levels[cell] = levels[cell.ins[0].driver] # forks only have exactly one driver\n", - " else:\n", - " levels[cell] = max([levels[line.driver] for line in cell.ins]) + 1\n", - " \n", - "print(f'Maximum logic depth: {np.max(levels)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "List nodes with the highest depth and which nodes they are driving." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "depth: 112 node: __fork__ n2692 driving: SDFFARX1_RVT reg1_reg_29_ \n", - "depth: 112 node: NAND2X0_RVT U465 driving: __fork__ n2692 \n", - "depth: 112 node: NAND2X0_RVT U562 driving: __fork__ n2724 \n", - "depth: 112 node: __fork__ n2724 driving: SDFFARX1_RVT reg0_reg_29_ \n", - "depth: 112 node: __fork__ n2608 driving: SDFFARX1_RVT B_reg \n", - "depth: 112 node: NAND2X0_RVT U170 driving: __fork__ n2608 \n", - "depth: 111 node: NAND2X0_RVT U5550 driving: __fork__ n2693 \n", - "depth: 111 node: __fork__ n2660 driving: SDFFARX1_RVT reg2_reg_29_ \n", - "depth: 111 node: AND2X2_RVT U5560 driving: __fork__ n2660 \n", - "depth: 111 node: __fork__ n2725 driving: SDFFARX1_RVT reg0_reg_28_ \n", - "depth: 111 node: __fork__ n2693 driving: SDFFARX1_RVT reg1_reg_28_ \n", - "depth: 111 node: __fork__ n362 driving: NAND2X0_RVT U170 \n", - "depth: 111 node: NAND2X0_RVT U173 driving: __fork__ n362 \n", - "depth: 111 node: __fork__ n600 driving: NAND2X0_RVT U562 \n", - "depth: 111 node: NAND2X0_RVT U563 driving: __fork__ n600 \n", - "depth: 111 node: NAND2X0_RVT U565 driving: __fork__ n2725 \n", - "depth: 111 node: NAND2X0_RVT U466 driving: __fork__ n535 \n", - "depth: 111 node: __fork__ n535 driving: NAND2X0_RVT U465 \n", - "depth: 110 node: __fork__ n4691 driving: AND2X2_RVT U5560 \n", - "depth: 110 node: NAND2X0_RVT U5736 driving: __fork__ n790 \n" - ] - } - ], - "source": [ - "nodes_by_depth = np.argsort(levels)[::-1]\n", - "\n", - "for n_idx in nodes_by_depth[:20]:\n", - " n = b14.nodes[n_idx]\n", - " readers = ', '.join([f'{l.reader.kind:12s} {l.reader.name:14s}' for l in n.outs])\n", - " print(f'depth: {levels[n_idx]} node: {n.kind:12s} {n.name:6s} driving: {readers}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Working With Test Data and Logic Simulation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load a stuck-at 
fault test pattern set and expected fault-free responses from a STIL file." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "from kyupy import verilog, stil\n", - "from kyupy.logic import MVArray, BPArray\n", - "from kyupy.logic_sim import LogicSim\n", - "\n", - "b14 = verilog.load('tests/b14.v.gz')\n", - "s = stil.load('tests/b14.stuck.stil.gz')\n", - "stuck_tests = s.tests(b14)\n", - "stuck_responses = s.responses(b14)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Tests and responses are instances of `MVArray`. Its `length` is the number of test vectors stored, its `width` is the number of values in a vector. By default, the stil parser returns 8-valued test vectors (`m=8`)." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stuck_tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The internal storage (an `ndarray` of `uint8`) is accessible via `data`. The first axis is the width, and the last axis goes along the test set." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(306, 1081)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stuck_tests.data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The subscript accessor returns a string representation of the given test vector number. Possible values are '0', '1', '-', 'X', 'R', 'F', 'P', and 'N'." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'P0--------------------11011111011001100111010101011101----------------------------------00-10111011010110011101110010111010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010100111010001010010000011100'" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stuck_tests[1]" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stuck_responses[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The order of values in the vectors correspond to the circuit's interface followed by the scan flip-flops as they appear in `b14.cells`.\n", - "The test data can be used directly in the simulators as they use the same ordering convention.\n", - "\n", - "The logic simulator uses bit-parallel storage of logic values, but our loaded test data uses one `uint8` per logic value.\n", - "To convert the storage layout, we instanciate a `BPArray` for the input stimuli.\n", - "The storage layout is more compact, but individual values cannot be easily accessed anymore." 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stuck_tests_bp = BPArray(stuck_tests)\n", - "stuck_tests_bp" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(306, 3, 136)" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stuck_tests_bp.data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following code performs a 8-valued logic simulation and stores the results in a new instance of `BPArray`.\n", - "The packed array is unpacked into an `MVArray` for value access." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "responses_bp = BPArray((stuck_tests_bp.width, len(stuck_tests_bp)))\n", - "simulator = LogicSim(b14, sims=len(stuck_tests_bp))\n", - "simulator.assign(stuck_tests_bp)\n", - "simulator.propagate()\n", - "simulator.capture(responses_bp)\n", - "responses = MVArray(responses_bp)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101'" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "responses[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compare simulation results to expected fault-free responses loaded from STIL. The first test fails, because it is a flush test while simulation implicitly assumes a standard test with a capture clock." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mismatch for test pattern 0\n", - "1080 of 1081 responses matched with simulator\n" - ] - } - ], - "source": [ - "matches = 0\n", - "for i in range(len(responses)):\n", - " if responses[i] == stuck_responses[i]:\n", - " matches += 1\n", - " else:\n", - " print(f'mismatch for test pattern {i}')\n", - "print(f'{matches} of {len(responses)} responses matched with simulator')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Transition faults require test vector pairs for testing. These pairs are generated by `tests_loc`, assuming a launch-on-capture scheme (two functional clock cycles after scan-in)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "s = stil.load('tests/b14.transition.stil.gz')\n", - "trans_tests = s.tests_loc(b14)\n", - "trans_responses = s.responses(b14)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trans_tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Possible values in the string representation are: '0', '1', '-', 'X', 'R' (rising transition), 'F' (falling transition), 'P' (positive pulse(s), 010), 'N' (negative pulse(s), 101)." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'00--------------------RRRRRRFRRRRRRRRRRRFFRFRRRRRRRRRR----------------------------------00-00000001110100011111011010000000000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111R1111111111111111111111110001100100000110100000111010101110RFF00F000F0F00F00000FF01F'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trans_tests[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We validate these patterns with an 8-valued logic simulation" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "trans_tests_bp = BPArray(trans_tests)\n", - "responses_bp = BPArray((trans_tests_bp.width, len(trans_tests_bp)))\n", - "simulator = LogicSim(b14, sims=len(trans_tests_bp))\n", - "simulator.assign(trans_tests_bp)\n", - "simulator.propagate()\n", - "simulator.capture(responses_bp)\n", - "responses = MVArray(responses_bp)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'--F00000F00F0F000F00FF--------------------------------01110101011100000101100000100110R0--0RRRRRRRNNNRNRPRNNNNNRFFRFRRRRRRR000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000NNNNNNNNNNNNNNNNNNNNNNNNNNNNP0011001000001101000001110101011101RRRRRRRRRRRRRRRRRRRRP01R'" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "responses[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The responses loaded from STIL only contain the final logic values. Use simple character replacements before comparing these. First test is again a flush test." 
- ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mismatch for test pattern 0\n", - "1391 of 1392 responses matched with simulator\n" - ] - } - ], - "source": [ - "matches = 0\n", - "for i in range(len(responses)):\n", - " if trans_responses[i] == responses[i].replace('P','0').replace('N','1').replace('R','1').replace('F','0'):\n", - " matches += 1\n", - " else:\n", - " print(f'mismatch for test pattern {i}')\n", - "print(f'{matches} of {len(responses)} responses matched with simulator')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Working With Delay Information and Timing Simulation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Delay data for gates and interconnect can be loaded from SDF files. In kyupy's timing simulators, delays are associated with the lines between nodes, not with the nodes themselves. Each line in the circuit has a rising delay, a falling delay, a negative pulse threshold, and a positive pulse threshold. " - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "from kyupy import sdf\n", - "\n", - "df = sdf.load('tests/b14.sdf.gz')\n", - "lt = df.annotation(b14, dataset=0, interconnect=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The returned delay information is an `ndarray` with a set of delay values for each line in the circuit." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(46891, 2, 2)" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lt.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Number of non-0 values loaded:" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "120628" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(lt != 0).sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The available timing simulators are `WaveSim` and `WaveSimCuda`.\n", - "They work similarly to `LogicSim` in that they evaluate all cells in topological order.\n", - "Instead of propagating a logic value, however, they propagate waveforms.\n", - "\n", - "`WaveSim` uses the numba just-in-time compiler for acceleration on CPU.\n", - "It falls back to pure python if numba is not available. `WaveSimCuda` uses numba for GPU acceleration.\n", - "If no CUDA card is available, it will fall back to pure python (not jit-compiled for CPU!).\n", - "Pure python is too slow for most purposes.\n", - "\n", - "Both simulators operate data-parallel.\n", - "The following instanciates a new engine for 32 independent timing simulations and each signal line in the circuit can carry at most 16 transitions. All simulators share the same circuit and the same line delay specification." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "from kyupy.wave_sim import WaveSimCuda, TMAX\n", - "import numpy as np\n", - "\n", - "wsim = WaveSimCuda(b14, lt, sims=32, wavecaps=16)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are various memories allocated, with waveforms usually being the largest. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Waveforms : 93908.5 kiB\n", - "State Allocation Table : 1113.4 kiB\n", - "Circuit Timing : 1484.5 kiB\n", - "Circuit Netlist : 732.7 kiB\n", - "Capture Data : 267.8 kiB\n", - "Test Stimuli Data : 3.6 kiB\n" - ] - } - ], - "source": [ - "def print_mem(name, arr):\n", - " print(f'{name}: {arr.size * arr.itemsize / 1024:.1f} kiB')\n", - " \n", - "print_mem('Waveforms ', wsim.state)\n", - "print_mem('State Allocation Table ', wsim.sat)\n", - "print_mem('Circuit Timing ', wsim.timing)\n", - "print_mem('Circuit Netlist ', wsim.ops)\n", - "print_mem('Capture Data ', wsim.cdata)\n", - "print_mem('Test Stimuli Data ', wsim.tdata)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a typical simulation loop where the number of patterns is larger than the number of simulators available.\n", - "We simulate `trans_tests_bp`.\n", - "The timing simulator accepts 8-valued `BPArray`s, but it will return response (capture) data in a different format." - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "sims = 128 # len(trans_tests_bp) # Feel free to simulate all tests if CUDA is set up correctly.\n", - "\n", - "cdata = np.zeros((len(wsim.interface), sims, 7)) # space to store all capture data\n", - "\n", - "for offset in range(0, sims, wsim.sims):\n", - " wsim.assign(trans_tests_bp, offset=offset)\n", - " wsim.propagate(sims=sims-offset)\n", - " wsim.capture(time=2.5, cdata=cdata, offset=offset) # capture at time 2.5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The capture data contains for each PI, PO, and scan flip-flop (axis 0), and each test (axis 1) seven values:\n", - "\n", - "0. Probability of capturing a 1 at the given capture time (same as next value, if no standard deviation given).\n", - "1. A capture value decided by random sampling according to above probability.\n", - "2. The final value (assume a very late capture time).\n", - "3. True, if there was a premature capture (capture error), i.e. final value is different from captured value.\n", - "4. Earliest arrival time. The time at which the output transitioned from its initial value.\n", - "5. Latest stabilization time. The time at which the output transitioned to its final value.\n", - "6. Overflow indicator. If non-zero, some signals in the input cone of this output had more transitions than specified in `wavecaps`. Some transitions have been discarded, the final values in the waveforms are still valid." - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(306, 128, 7)" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdata.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For validating against known logic values, take `cdata[...,1]`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mismatch for test pattern 0\n", - "127 of 128 responses matched with simulator\n" - ] - } - ], - "source": [ - "matches = 0\n", - "\n", - "for i in range(cdata.shape[1]):\n", - " response = ''.join('1' if x > 0.5 else '0' for x in cdata[..., i, 1])\n", - " if trans_responses[i].replace('-','0') == response:\n", - " matches += 1\n", - " else:\n", - " print(f'mismatch for test pattern {i}')\n", - "print(f'{matches} of {cdata.shape[1]} responses matched with simulator')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The circuit delay is the maximum among all latest stabilization times:" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.17240047454834" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdata[...,5].max()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check for overflows. If too many of them occur, increase `wavecaps` during engine instanciation:" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.0" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdata[...,6].sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check for capture failures:" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.0" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdata[...,3].sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# CUDA Support Notes\n", - "\n", - "Try this code to check if CUDA is set up correctly.\n", - "\n", - "If there is an error related to `nvvm`, you probably need to set up some environment variables:\n", - "```\n", - "%env LD_LIBRARY_PATH=/usr/local/cuda/lib64\n", - "%env CUDA_HOME=/usr/local/cuda\n", - "```\n", - "If problems persist, refer to documentations for numba and cuda. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 2 CUDA devices\n", - "id 0 b'NVIDIA GeForce RTX 3090' [SUPPORTED]\n", - " compute capability: 8.6\n", - " pci device id: 0\n", - " pci bus id: 3\n", - "id 1 b'NVIDIA TITAN V' [SUPPORTED]\n", - " compute capability: 7.0\n", - " pci device id: 0\n", - " pci bus id: 2\n", - "Summary:\n", - "\t2/2 devices are supported\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from numba import cuda\n", - "\n", - "cuda.detect()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/README.rst b/README.rst index 739e7b8..0957c8a 100644 --- a/README.rst +++ b/README.rst @@ -16,13 +16,17 @@ Getting Started --------------- KyuPy is available in `PyPI `_. -It requires Python 3.6 or newer, `lark-parser `_, and `numpy`_. +It requires Python 3.8 or newer, `lark-parser `_, and `numpy`_. Although optional, `numba`_ should be installed for best performance. -GPU/CUDA support in numba may `require some additional setup `_. +GPU/CUDA support in numba may `require some additional setup `_. If numba is not available, KyuPy will automatically fall back to slow, pure Python execution. -The Jupyter Notebook `Demo.ipynb `_ contains some useful examples to get familiar with the API. +The Jupyter Notebook `Introduction.ipynb `_ contains some useful examples to get familiar with the API. + + +Development +----------- To work with the latest pre-release source code, clone the `KyuPy GitHub repository `_. -Run ``pip3 install --user -e .`` within your local checkout to make the package available in your Python environment. +Run ``pip install -e .`` within your local checkout to make the package available in your Python environment. The source code comes with tests that can be run with ``pytest``. 
diff --git a/docs/conf.py b/docs/conf.py index 540783b..e50d1bf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,11 +20,11 @@ sys.path.insert(0, os.path.abspath('../src')) # -- Project information ----------------------------------------------------- project = 'KyuPy' -copyright = '2020-2021, Stefan Holst' +copyright = '2020-2023, Stefan Holst' author = 'Stefan Holst' # The full version, including alpha/beta/rc tags -release = '0.0.3' +release = '0.0.4' # -- General configuration --------------------------------------------------- diff --git a/examples/Introduction.ipynb b/examples/Introduction.ipynb new file mode 100644 index 0000000..cdbbcab --- /dev/null +++ b/examples/Introduction.ipynb @@ -0,0 +1,1873 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# KyuPy Introduction\n", + "\n", + "Working with KyuPy's basic data structures.\n", + "\n", + "## Gate-Level Circuits\n", + "\n", + "KyuPy has parsers for:\n", + "\n", + "* The [ISCAS'89 Benchmark Format](https://www.researchgate.net/profile/Franc-Brglez/publication/224723140_Combination_profiles_of_sequential_benchmark_circuits) \".bench\"\n", + "* Non-hierarchical gate-level verilog\n", + "\n", + "Files can be loaded using `load(file)`, strings can be parsed using `parse(text)`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# 0000000.000 W Numba unavailable. Falling back to pure Python.\n" + ] + } + ], + "source": [ + "from kyupy import bench, verilog\n", + "\n", + "# load a file\n", + "b14 = verilog.load('../tests/b14.v.gz')\n", + "\n", + "# ... or specify the circuit as string \n", + "adder = bench.parse('''\n", + "INPUT(a, b)\n", + "OUTPUT(s)\n", + "cin = DFF(cout)\n", + "axb = XOR(a, b)\n", + "s = XOR(axb, cin)\n", + "aab = AND(a, b)\n", + "axbacin = AND(axb, cin)\n", + "cout = OR(aab, axbacin)\n", + "''', name='adder')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "They return KyuPy's intermediate prepresentation of the circuit graph (objects of class `Circuit`):" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{name: \"b14\", cells: 15873, forks: 15842, lines: 46891, io_nodes: 91}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b14" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{name: \"adder\", cells: 6, forks: 8, lines: 17, io_nodes: 3}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apparently, circuits contain `cells`, `forks`, `lines`, and `io_nodes`.\n", + "\n", + "### Cells and Forks\n", + "\n", + "Let's explore cells and forks for the adder circuit.\n", + "\n", + "There are dictionary-mappings from names to these objects:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'cin': 4:DFF\"cin\" <1 >0,\n", + " 'axb': 6:XOR\"axb\" <3 <4 >2,\n", + " 's': 8:XOR\"s\" <6 <7 >5,\n", + " 'aab': 9:AND\"aab\" <9 <10 >8,\n", + " 'axbacin': 11:AND\"axbacin\" <12 <13 >11,\n", + " 'cout': 13:OR\"cout\" <15 <16 >14}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "adder.cells" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': 0:__fork__\"a\" >3 >9,\n", + " 'b': 1:__fork__\"b\" >4 >10,\n", + " 's': 2:__fork__\"s\" <5 ,\n", + " 'cout': 3:__fork__\"cout\" <14 >1,\n", + " 'cin': 5:__fork__\"cin\" <0 >7 >13,\n", + " 'axb': 7:__fork__\"axb\" <2 >6 >12,\n", + " 'aab': 10:__fork__\"aab\" <8 >15,\n", + " 'axbacin': 12:__fork__\"axbacin\" <11 >16}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.forks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(For bench-files, the names of gates equal the names of the signals they produce. In verilog files, the names can be different.)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6:XOR\"axb\" <3 <4 >2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.cells['axb']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7:__fork__\"axb\" <2 >6 >12" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.forks['axb']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cells and forks are instances of class `Node`, which represent *things* that are connected to one or more other *things* in the circuit.\n", + "\n", + "* A **cell** represents a gate or a standard cell.\n", + "* A **fork** represents a named signal or a fan-out point (connecting the output of one cell to multiple other cells or forks).\n", + "\n", + "`Node`-objects have an `index`, a `kind`, and a `name`." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6, 'XOR', 'axb')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.cells['axb'].index, adder.cells['axb'].kind, adder.cells['axb'].name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Forks* are `Node`-objects of the special kind `__fork__`.\n", + "\n", + "*Cells* are `Node`-objects of any other kind. A kind is just a string and can be anything.\n", + "\n", + "The namespaces of *forks* and *cells* are separate:\n", + "* A *cell* and a *fork* **can** have the same name.\n", + "* Two *cells* or two *forks* **cannot** have the same name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(7, '__fork__', 'axb')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.forks['axb'].index, adder.forks['axb'].kind, adder.forks['axb'].name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `index` of a *node* in a circuit is unique and consecutive.\n", + "\n", + "Also *forks* and *cells* have all separate indices.\n", + "\n", + "Nodes can be accessed by their index using the `nodes` list:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0:__fork__\"a\" >3 >9,\n", + " 1:__fork__\"b\" >4 >10,\n", + " 2:__fork__\"s\" <5 ,\n", + " 3:__fork__\"cout\" <14 >1,\n", + " 4:DFF\"cin\" <1 >0,\n", + " 5:__fork__\"cin\" <0 >7 >13,\n", + " 6:XOR\"axb\" <3 <4 >2,\n", + " 7:__fork__\"axb\" <2 >6 >12,\n", + " 8:XOR\"s\" <6 <7 >5,\n", + " 9:AND\"aab\" <9 <10 >8,\n", + " 10:__fork__\"aab\" <8 >15,\n", + " 11:AND\"axbacin\" <12 <13 >11,\n", + " 12:__fork__\"axbacin\" <11 >16,\n", + " 13:OR\"cout\" <15 <16 >14]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6:XOR\"axb\" <3 <4 >2, 7:__fork__\"axb\" <2 >6 >12)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.nodes[6], adder.nodes[7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lines\n", + "\n", + "A `Line` is a directional 1:1 connection between two Nodes.\n", + "\n", + "A line has a circuit-unique and consecutive `index` just like nodes.\n", + "\n", + "Line and node indices are different!\n", + "\n", + "There is a `lines` list. 
If a line is printed, it just outputs its index:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.lines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A line one `driver`-node and one `reader`-node:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6:XOR\"axb\" <3 <4 >2, 7:__fork__\"axb\" <2 >6 >12)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.lines[2].driver, adder.lines[2].reader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nodes show their connections to the lines with direction (\"<\" for input, \">\" for output) and the line index.\n", + "\n", + "In the example above, line 2 connects the output of cell \"axb\" to the input of fork \"axb\".\n", + "\n", + "The input connections and output connections of a node are ordered lists of lines called `ins` and `outs`:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([3, 4], [2])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.cells['axb'].ins, adder.cells['axb'].outs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A line also stores its positions in the connection lists in `driver_pin` and `reader_pin`:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 0, 1)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.lines[2].driver_pin, adder.lines[2].reader_pin, adder.lines[4].reader_pin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### IO_Nodes\n", + "\n", + "Any node in the circuit can be designated as a primary input or primary output by adding it to the `io_nodes` list:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0:__fork__\"a\" >3 >9, 1:__fork__\"b\" >4 >10, 2:__fork__\"s\" <5 ]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.io_nodes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is common that io_nodes either have only output connections (in a role as primary-input) or only input connections (in a role as primary-output).\n", + "\n", + "Inputs and outputs appear in the order they were defined in the loaded file. Inputs and outputs are often interspersed.\n", + "\n", + "A related list is `s_nodes`. It contains the io_nodes at the beginning and adds all sequential elements (flip-flops, latches)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0:__fork__\"a\" >3 >9,\n", + " 1:__fork__\"b\" >4 >10,\n", + " 2:__fork__\"s\" <5 ,\n", + " 4:DFF\"cin\" <1 >0]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.s_nodes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic Circuit Navigation\n", + "\n", + "A circuit can be traversed easily using the properties of `Circuit`, `Node`, and `Line`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6:XOR\"axb\" <3 <4 >2" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.io_nodes[0].outs[0].reader" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6:XOR\"axb\" <3 <4 >2\n", + "9:AND\"aab\" <9 <10 >8\n" + ] + } + ], + "source": [ + "for line in adder.io_nodes[0].outs:\n", + " print(line.reader)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cout'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.cells['cin'].ins[0].driver.name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's continue with `b14` loaded before. It has 91 io_nodes:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "({name: \"b14\", cells: 15873, forks: 15842, lines: 46891, io_nodes: 91},\n", + " [31587:input\"clock\" >15805,\n", + " 31589:input\"reset\" >15806,\n", + " 31591:output\"addr[19]\" <46836 ,\n", + " 31592:output\"addr[18]\" <46837 ,\n", + " 31593:output\"addr[17]\" <46838 ,\n", + " 31594:output\"addr[16]\" <46839 ,\n", + " 31595:output\"addr[15]\" <46840 ,\n", + " 31596:output\"addr[14]\" <46841 ,\n", + " 31597:output\"addr[13]\" <46842 ,\n", + " 31598:output\"addr[12]\" <46843 ,\n", + " 31599:output\"addr[11]\" <46844 ,\n", + " 31600:output\"addr[10]\" <46845 ,\n", + " 31601:output\"addr[9]\" <46846 ,\n", + " 31602:output\"addr[8]\" <46847 ,\n", + " 31603:output\"addr[7]\" <46848 ,\n", + " 31604:output\"addr[6]\" <46849 ,\n", + " 31605:output\"addr[5]\" <46850 ,\n", + " 31606:output\"addr[4]\" <46851 ,\n", + " 31607:output\"addr[3]\" <46852 ,\n", + " 31608:output\"addr[2]\" <46853 ])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b14, b14.io_nodes[:20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and even more sequential nodes:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "306" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(b14.s_nodes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `io_locs(prefix)` and `s_locs(prefix)` methods return the locations of signals, busses and registers in `io_nodes` and `s_nodes`:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 23, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "b14.io_locs('reset')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b14.io_locs('addr')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1130:SDFFARX1_RVT\"IR_reg_0_\" <16917 <16919 <16918 <16920 <16921 >566 >567,\n", + " 1202:SDFFARX1_RVT\"IR_reg_1_\" <17052 <17054 <17053 <17055 <17056 >611 >612,\n", + " 1124:SDFFARX1_RVT\"IR_reg_2_\" <16907 <16909 <16908 <16910 <16911 >562 >563,\n", + " 1127:SDFFARX1_RVT\"IR_reg_3_\" <16912 <16914 <16913 <16915 <16916 >564 >565,\n", + " 1199:SDFFARX1_RVT\"IR_reg_4_\" <17047 <17049 <17048 <17050 <17051 >609 >610,\n", + " 1196:SDFFARX1_RVT\"IR_reg_5_\" <17042 <17044 <17043 <17045 <17046 >607 >608,\n", + " 1193:SDFFARX1_RVT\"IR_reg_6_\" <17037 <17039 <17038 <17040 <17041 >605 >606,\n", + " 1190:SDFFARX1_RVT\"IR_reg_7_\" <17032 <17034 <17033 <17035 <17036 >603 >604,\n", + " 1187:SDFFARX1_RVT\"IR_reg_8_\" <17027 <17029 <17028 <17030 <17031 >601 >602,\n", + " 1184:SDFFARX1_RVT\"IR_reg_9_\" <17022 <17024 <17023 <17025 <17026 >599 >600,\n", + " 1181:SDFFARX1_RVT\"IR_reg_10_\" <17017 <17019 <17018 <17020 <17021 >597 >598,\n", + " 1178:SDFFARX1_RVT\"IR_reg_11_\" <17012 <17014 <17013 <17015 <17016 >595 >596,\n", + " 1175:SDFFARX1_RVT\"IR_reg_12_\" <17007 <17009 <17008 <17010 <17011 >593 >594,\n", + " 1172:SDFFARX1_RVT\"IR_reg_13_\" <17002 <17004 <17003 <17005 <17006 >591 >592,\n", + " 1169:SDFFARX1_RVT\"IR_reg_14_\" <16997 <16999 <16998 <17000 <17001 >589 >590,\n", + " 1166:SDFFARX1_RVT\"IR_reg_15_\" <16992 <16994 <16993 <16995 <16996 >587 >588,\n", + " 1163:SDFFARX1_RVT\"IR_reg_16_\" <16987 <16989 <16988 <16990 <16991 >585 >586,\n", + " 1133:SDFFARX1_RVT\"IR_reg_17_\" <16922 <16924 <16923 <16925 <16926 >568 >569,\n", + " 1136:SDFFARX1_RVT\"IR_reg_18_\" <16927 <16929 <16928 <16930 <16931 >570,\n", + " 1138:SDFFARX1_RVT\"IR_reg_19_\" <16932 <16934 <16933 <16935 <16936 >571 >572,\n", + " 1141:SDFFARX1_RVT\"IR_reg_20_\" <16937 <16939 <16938 <16940 <16941 >573,\n", + " 1143:SDFFARX1_RVT\"IR_reg_21_\" <16942 <16944 <16943 <16945 <16946 >574 >575,\n", + " 1146:SDFFARX1_RVT\"IR_reg_22_\" <16947 <16949 <16948 <16950 <16951 >576 >577,\n", + " 1149:SDFFARX1_RVT\"IR_reg_23_\" <16952 <16954 <16953 <16955 <16956 >578,\n", + " 1151:SDFFARX1_RVT\"IR_reg_24_\" <16957 <16959 <16958 <16960 <16961 >579,\n", + " 1153:SDFFARX1_RVT\"IR_reg_25_\" <16962 <16964 <16963 <16965 <16966 >580,\n", + " 1155:SDFFARX1_RVT\"IR_reg_26_\" <16967 <16969 <16968 <16970 <16971 >581,\n", + " 1157:SDFFARX1_RVT\"IR_reg_27_\" <16972 <16974 <16973 <16975 <16976 >582,\n", + " 1159:SDFFARX1_RVT\"IR_reg_28_\" <16977 <16979 <16978 <16980 <16981 >583,\n", + " 1161:SDFFARX1_RVT\"IR_reg_29_\" <16982 <16984 <16983 <16985 <16986 >584,\n", + " 1122:SDFFARX1_RVT\"IR_reg_30_\" <16902 <16904 <16903 <16905 <16906 >561,\n", + " 1120:SDFFARX1_RVT\"IR_reg_31_\" <16897 <16899 <16898 <16900 <16901 >560]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[b14.s_nodes[i] for i in b14.s_locs('IR_reg')]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example: Tracing out a scan chain.**\n", + "\n", + "We start at the output of the scan 
chain \"test_so000\", then go backwards through the circuit.\n", + "\n", + "When we encounter a scan cell \"SDFF\", we continue with the \"SI\" pin, which has index 2." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "length (with forks): 573\n", + "length (without forks): 287\n", + "length only SDFF: 215\n", + "output\"test_so000\" __fork__\"test_so000\" NBUFFX8_RVT\"HFSBUF_36_76\" __fork__\"aps_rename_215_\" SDFFARX1_RVT\"wr_reg\" __fork__\"HFSNET_169\" INVX4_RVT\"HFSINV_691_254\" __fork__\"HFSNET_170\" INVX0_RVT\"HFSINV_2682_255\" __fork__\"state\" ... __fork__\"IR[0]\" SDFFARX1_RVT\"IR_reg_0_\" __fork__\"ZBUF_17_16\" NBUFFX2_RVT\"ZBUF_17_inst_905\" __fork__\"ZBUF_275_16\" NBUFFX4_RVT\"ZBUF_275_inst_906\" __fork__\"B\" SDFFARX1_RVT\"B_reg\" __fork__\"test_si000\" input\"test_si000\"\n" + ] + } + ], + "source": [ + "chain = [cell := b14.cells['test_so000']]\n", + "while len(cell.ins) > 0:\n", + " chain.append(cell := cell.ins[2 if cell.kind.startswith('SDFF') else 0].driver)\n", + " \n", + "print(f'length (with forks): {len(chain)}')\n", + "print(f'length (without forks): {len(list(filter(lambda n: n.kind != \"__fork__\", chain)))}')\n", + "print(f'length only SDFF: {len(list(filter(lambda n: n.kind.startswith(\"SDFF\"), chain)))}')\n", + "\n", + "names = [f'{c.kind}\"{c.name}\"' for c in chain]\n", + "print(' '.join(names[:10]) + ' ... ' + ' '.join(names[-10:]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a generator for **traversing the circuit in topological order**.\n", + "\n", + "The following loop prints all nodes:\n", + "* starting with primary inputs (nodes that don't have any input connections) and sequential elements,\n", + "* and continuing with nodes who's inputs are connected only to already printed nodes." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0:__fork__\"a\" >3 >9\n", + "1:__fork__\"b\" >4 >10\n", + "4:DFF\"cin\" <1 >0\n", + "6:XOR\"axb\" <3 <4 >2\n", + "9:AND\"aab\" <9 <10 >8\n", + "5:__fork__\"cin\" <0 >7 >13\n", + "7:__fork__\"axb\" <2 >6 >12\n", + "10:__fork__\"aab\" <8 >15\n", + "8:XOR\"s\" <6 <7 >5\n", + "11:AND\"axbacin\" <12 <13 >11\n", + "2:__fork__\"s\" <5 \n", + "12:__fork__\"axbacin\" <11 >16\n", + "13:OR\"cout\" <15 <16 >14\n", + "3:__fork__\"cout\" <14 >1\n" + ] + } + ], + "source": [ + "for n in adder.topological_order():\n", + " print(n)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example: Determining logic level (distance from inputs or sequential elements) of nodes.**\n", + "\n", + "Inputs and flip-flops themselves are level 0, *cells* driven by just inputs and flip-flops are level 1, and so on.\n", + "*Fork* nodes have the same level as their driver, because they do not increase the logic depth." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum logic depth: 112\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "levels = np.zeros(len(b14.nodes), dtype=np.uint32) # store level for each node.\n", + "\n", + "for n in b14.topological_order():\n", + " if 'DFF' in n.kind or len(n.ins) == 0:\n", + " levels[n] = 0\n", + " elif n.kind == '__fork__':\n", + " levels[n] = levels[n.ins[0].driver] # forks only have exactly one driver\n", + " else:\n", + " levels[n] = max([levels[line.driver] for line in n.ins]) + 1\n", + " \n", + "print(f'Maximum logic depth: {np.max(levels)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List nodes with the highest depth and which nodes they are driving." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "depth: 112 node: __fork__ n2692 driving: SDFFARX1_RVT reg1_reg_29_ \n", + "depth: 112 node: NAND2X0_RVT U465 driving: __fork__ n2692 \n", + "depth: 112 node: NAND2X0_RVT U562 driving: __fork__ n2724 \n", + "depth: 112 node: __fork__ n2724 driving: SDFFARX1_RVT reg0_reg_29_ \n", + "depth: 112 node: __fork__ n2608 driving: SDFFARX1_RVT B_reg \n", + "depth: 112 node: NAND2X0_RVT U170 driving: __fork__ n2608 \n", + "depth: 111 node: NAND2X0_RVT U5550 driving: __fork__ n2693 \n", + "depth: 111 node: __fork__ n2660 driving: SDFFARX1_RVT reg2_reg_29_ \n", + "depth: 111 node: AND2X2_RVT U5560 driving: __fork__ n2660 \n", + "depth: 111 node: __fork__ n2725 driving: SDFFARX1_RVT reg0_reg_28_ \n", + "depth: 111 node: __fork__ n2693 driving: SDFFARX1_RVT reg1_reg_28_ \n", + "depth: 111 node: __fork__ n362 driving: NAND2X0_RVT U170 \n", + "depth: 111 node: NAND2X0_RVT U173 driving: __fork__ n362 \n", + "depth: 111 node: __fork__ n600 driving: NAND2X0_RVT U562 \n", + "depth: 111 node: NAND2X0_RVT U563 driving: __fork__ n600 \n", + "depth: 111 node: NAND2X0_RVT U565 driving: __fork__ n2725 \n", + "depth: 111 node: NAND2X0_RVT U466 driving: __fork__ n535 \n", + "depth: 111 node: __fork__ n535 driving: NAND2X0_RVT U465 \n", + "depth: 110 node: __fork__ n4691 driving: AND2X2_RVT U5560 \n", + "depth: 110 node: NAND2X0_RVT U5736 driving: __fork__ n790 \n" + ] + } + ], + "source": [ + "nodes_by_depth = np.argsort(levels)[::-1]\n", + "\n", + "for n_idx in nodes_by_depth[:20]:\n", + " n = b14.nodes[n_idx]\n", + " readers = ', '.join([f'{l.reader.kind:12s} {l.reader.name:14s}' for l in n.outs])\n", + " print(f'depth: {levels[n_idx]} node: {n.kind:12s} {n.name:6s} driving: {readers}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working With Logic Values\n", + "\n", + "Sequential states of circuits, signals, and test patterns contain logic values.\n", + "\n", + "KyuPy provides some useful tools to deal with 2-valued, 4-valued, and 8-valued logic data.\n", + "\n", + "All logic values are stored in numpy arrays of dtype `np.uint8`.\n", + "\n", + "There are two storage formats:\n", + "* `mv` (for \"multi-valued\"): Each logic value is stored as uint8\n", + "* `bp` (for \"bit-parallel\"): Groups of 8 logic values are stored as three uint8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `mv` Arrays\n", + "\n", + "Suppose we want to simulate the adder circuit with 2 inputs, 1 output and 1 flip-flop." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0:__fork__\"a\" >3 >9,\n", + " 1:__fork__\"b\" >4 >10,\n", + " 2:__fork__\"s\" <5 ,\n", + " 4:DFF\"cin\" <1 >0]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adder.s_nodes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can construct a set of vectors using the `mvarray` helper function.\n", + "\n", + "Each vector has 2 elements, one for each io_node and sequential element.\n", + "\n", + "This would be an exhaustive vector set (the output in `s_nodes` remains unassigned (\"-\")):" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 3, 0, 3, 0, 3, 0, 3],\n", + " [0, 0, 3, 3, 0, 0, 3, 3],\n", + " [2, 2, 2, 2, 2, 2, 2, 2],\n", + " [0, 0, 0, 0, 3, 3, 3, 3]], dtype=uint8)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from kyupy.logic import mvarray\n", + "\n", + "inputs = mvarray('00-0', '10-0', '01-0', '11-0', '00-1', '10-1', '01-1', '11-1')\n", + "inputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The numeric values in this array are defined in `kyupy.logic`.\n", + "\n", + "The **last** axis is always the number of vectors. It may be unintuitive at first, but it is more convenient for data-parallel simulations.\n", + "\n", + "The **second-to-last** axis corresponds to `s_nodes`. I.e., the first row is for input 'a', the second row for input 'b', and so on." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 8)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a string representation of a vector set. Possible values are '0', '1', '-', 'X', 'R', 'F', 'P', and 'N'." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "00-0\n", + "10-0\n", + "01-0\n", + "11-0\n", + "00-1\n", + "10-1\n", + "01-1\n", + "11-1\n" + ] + } + ], + "source": [ + "from kyupy.logic import mv_str\n", + "\n", + "print(mv_str(inputs))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load a stuck-at fault test pattern set and expected fault-free responses from a STIL file. It contains 1081 test vectors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "from kyupy import stil\n", + "\n", + "s = stil.load('../tests/b14.stuck.stil.gz')\n", + "stuck_tests = s.tests(b14)\n", + "stuck_responses = s.responses(b14)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "306" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(b14.s_nodes)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(306, 1081)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stuck_tests.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(306, 1081)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stuck_responses.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "00--------------------00101011101101111011100010101100----------------------------------00-11110011001100110001100110011000100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001001100110011001100111000\n", + "P0--------------------11011111011001100111010101011101----------------------------------00-10111011010110011101110010111010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010100111010001010010000011100\n", + "P0--------------------00001101000101011100111111111111----------------------------------00-10000001000000010100000000000000110110110111111010101000101100101110001111101001110110100000110101001000100101000101010101001000011110110111111111000001111000010000101100010000100010100100011111101010001101000100011\n", + "P0--------------------11011111111110101011001111101111----------------------------------01-10000000000000000000000000000000001010011100110011010111011100010001110110011100101000011010101011111111111111111111111111111111011000011101010110111110010101100101010011100010001000010101010011010111100010100110111\n", + "P0--------------------00011111111110101001011001011111----------------------------------01-10000000001001010000000000000000111101001010000110000000010011010001100000010110001101100011010111000011011011010001000000000011111111111101000101011111111110000100101010000001010101011100000101001011010101010110001\n" + ] + } + ], + "source": [ + "print(mv_str(stuck_tests[:,:5]))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--11001100110011001100--------------------------------0011001100110011001100110011001110--011110011001100110001100110011000100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001001100110011001100111000\n", + 
"--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101\n", + "--01000101100010101111--------------------------------1000100101000100001000110100001010--001000001111111101000000000000000110110110111111010101000101100101110001111101001110110100000110101001000100101000101010101001000011001110111111111000001111000010000101100010000100010100100011111000111110100111000010\n", + "--11001010001111010110--------------------------------1010101000010001000111001010100101--110000000000000000000000000000000001010011100110011010111011100010001110110011100101000011010101000000000000000000000000000000000011100011101010110111110010100100101010011100010001000010101010011010111100010100110000\n", + "--11010101010110100101--------------------------------0000111010101010000001010100100000--001000000001110101100000000000000111101001010000110000000010011010001100000010110001101100011010111000011011011010001000000000011111000000011000101011111111110000100101010000001010101011100001000110000001011000111000\n" + ] + } + ], + "source": [ + "print(mv_str(stuck_responses[:,:5]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The order of values in the vectors correspond to the circuit's `s_nodes`.\n", + "The test data can be used directly in the simulators as they use the same ordering convention.\n", + "\n", + "`stuck_tests` has values for all primary inputs and scan flip-flops, `stuck_responses` contains the expected values for all primary outputs and scan flip-flops.\n", + "\n", + "Since this is a static test, only '0' and '1' are used with the exception of the clock input, which has a positive pulse 'P'.\n", + "\n", + "A transition fault test is a dynamic test that also contains 'R' for rising transition and 'F' for falling transition:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "s = stil.load('../tests/b14.transition.stil.gz')\n", + "transition_tests = s.tests_loc(b14)\n", + "transition_responses = s.responses(b14)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XX--------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX----------------------------------XX-11110011001100110001100110011000100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001001100110011001100111000\n", + "00--------------------RRRRRRFRRRRRRRRRRRFFRFRRRRRRRRRR----------------------------------00-00000001110100011111011010000000000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111R1111111111111111111111110001100100000110100000111010101110RFF00F000F0F00F00000FF01F\n", + "00--------------------11R111110R0RR0R1110R01R1R001FRRR----------------------------------0F-RR0R00000000RR11R0RRR000R0R000R0010100010001011000111001000010001010111010101010100000000100001011100100001100011110110100000010011000011111100010111100010010111110100011100100011010000010111F11F0F01RRR110F0R01R011R\n", + 
"00--------------------RRFRRFR100FR10R010F10FR1111F111R----------------------------------0R-F01R0F01F100R01FF1F10R101FR1RR1R01001110010101110110000000100101111101001110100010011111111110100010001000100110100001111110011001001011011010111111111111111F10110101001010001001000001011001R0RF0R0R1FRR0RFR1RRRR101R\n", + "00--------------------FFR1RRF11FFRRR1R0FR100FF1R0FRFFF----------------------------------01-111FRRFF0RRRRRF01FFFRFRFFRRRFFRR11110100111100111000011001000111111000100011010000010000011111011111111110110111100100110001001111111000111111111000011101111000010010101000000101010101110000R1R1RRF001R1R1R1R10FF001R\n" + ] + } + ], + "source": [ + "print(mv_str(transition_tests[:,:5]))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--11001100110011001100--------------------------------0011001100110011001100110011001110--011110011001100110001100110011000100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001001100110011001100111000\n", + "--00000000000000000000--------------------------------0111010101110000010110000010011010--011111111111110111111100101111111000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111111111111111111111111111100011001000001101000001110101011101111111111111111111110011\n", + "--11010001111110000110--------------------------------1101000001011000100111000101111110--000010111111100000100011101011100010100010001011000111001000010001010111010101010100000000100001011100100001100011110110100000010011100011111100010111100010010111110100011100100011010000010110011000011111100010111110\n", + "--11111101011011010010--------------------------------1001101000001001000101001010110110--000110001010010100101011010111111101101001011001110000000000001001111010011101000100111111111101000100010001001101000011111100110010110110110101111111111111110101101010010100010010000010110010010010110110101111111010\n", + "--00011111111100011111--------------------------------0000111010101010000001010100100010--000011011100000110111010110001100000100110000110011111001101111001110001000110100000100000111110111111111101101111001001100010011111001001111111110000111011110000100101010000001010101011100001100101100011010110110010\n" + ] + } + ], + "source": [ + "print(mv_str(transition_responses[:,:5]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `bp` Arrays\n", + "\n", + "The logic simulator uses bit-parallel storage of logic values, but our loaded test data uses one `uint8` per logic value.\n", + "\n", + "Use `mv_to_bp` to convert mv data to the bit-parallel storage layout.\n", + "Bit-parallel storage is more compact, but individual values cannot be easily accessed anymore." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "from kyupy.logic import mv_to_bp, bp_to_mv\n", + "\n", + "stuck_tests_bp = mv_to_bp(stuck_tests)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(306, 3, 136)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stuck_tests_bp.data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instead of 1081 bytes per s_node, bit-parallel storage only uses 3*136=408 bytes.\n", + "\n", + "The reverse operation is `bp_to_mv`. Note that the number of vectors may be rounded up to the next multiple of 8:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(306, 1088)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bp_to_mv(stuck_tests_bp).shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Logic Simulation\n", + "\n", + "The following code performs a 8-valued logic simulation on all 1081 vectors for one clock cycle." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from kyupy.logic_sim import LogicSim\n", + "\n", + "sim = LogicSim(b14, sims=stuck_tests.shape[-1]) # 1081 simulations in parallel\n", + "sim.s[0] = stuck_tests_bp\n", + "sim.s_to_c()\n", + "sim.c_prop()\n", + "sim.c_to_s()\n", + "sim_responses = bp_to_mv(sim.s[1])[...,:stuck_tests.shape[-1]] # trim from 1088 -> 1081" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--11001100110011001100--------------------------------0011001100110011001100110011001110--000110110110111010111011100010100100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001100110011001100110011001\n", + "--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101\n", + "--01000101100010101111--------------------------------1000100101000100001000110100001010--001000001111111101000000000000000110110110111111010101000101100101110001111101001110110100000110101001000100101000101010101001000011001110111111111000001111000010000101100010000100010100100011111000111110100111000010\n", + "--11001010001111010110--------------------------------1010101000010001000111001010100101--110000000000000000000000000000000001010011100110011010111011100010001110110011100101000011010101000000000000000000000000000000000011100011101010110111110010100100101010011100010001000010101010011010111100010100110000\n", + "--11010101010110100101--------------------------------0000111010101010000001010100100000--001000000001110101100000000000000111101001010000110000000010011010001100000010110001101100011010111000011011011010001000000000011111000000011000101011111111110000100101010000001010101011100001000110000001011000111000\n" + ] + } + ], + "source": [ + "print(mv_str(sim_responses[:,:5]))" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Compare simulation results to expected fault-free responses loaded from STIL.\n", + "\n", + "The first test fails, because it is a flush test while simulation implicitly assumes a standard test with a capture clock.\n", + "\n", + "The remaining 1080 responses are identical." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1080" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.sum(np.min(sim_responses == stuck_responses, axis=0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Same simulation for the transition-fault test set:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "sim = LogicSim(b14, sims=transition_tests.shape[-1]) # 1392 simulations in parallel\n", + "sim.s[0] = mv_to_bp(transition_tests)\n", + "sim.s_to_c()\n", + "sim.c_prop()\n", + "sim.c_to_s()\n", + "sim_responses = bp_to_mv(sim.s[1])[...,:transition_tests.shape[-1]] # trim to 1392" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--11001100110011001100--------------------------------0011001100110011001100110011001110--0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001100110011001100110011001\n", + "--F00000F00F0F000F00FF--------------------------------01110101011100000101100000100110R0--0RRRRRRRNNNRNRPRNNNNNRFFRFRRRRRRR000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000NNNNNNNNNNNNNNNNNNNNNNNNNNNNP0011001000001101000001110101011101RRRRRRRRRRRRRRRRRRRRP01R\n", + "--R10R0F011RRR10F0F11F--------------------------------1101000001011000100111000101111110--0FFPNPRRRRRRRFFFFFRFFFRRRFRFRRRFPPNPNPPPNPPPNPNNPPPNNNPPNPPPPNPPPNPNPNNNPNPNPNPNPNPPPPPPPPNPPPPNPNNNPPNPPPPNNPPPNNNNPNNPNPPPPPPNPPNNRPPPNNNNNNPPPNPNNNNPPPNPPNPNNNNNPNPPPNNNPPNPPPNNPNP0P00N0NNFPNNPPPPNNNNNNPPPNPNNRNNF\n", + "--RRRR1RFR0RRF1R0R0FR0--------------------------------10011010000010010001010010101101RF--FPPNNPPPNPNPPNPNPPNPNPNNPNPNNNNNNRFRRFNFPRFRNPFNNRFFPPPPPPPFPPNPFNNNNPNPPNNNPNPPPNPPNNNNNNNNNNPNPPPNPPPNPPPNPPNNPNPPPPNNNNNNPPNNPPNPRNPNNPNNPNPNNNNNNNNNNNNNNNPNPNNPNPNPPNPNPPPNPPNPPPPPNPNNPPNFPNPPNPNNPNNPNPNNNNNNNPNF\n", + "--FF01R1R1R1R100FRR1R1--------------------------------00001110101010100000010101001000R0--0FFFRNFRRRFFFFFRRFRRRFRFRRFFFRRFFFFFNPFRRFFFFRRFFNRRRRFFRRFRRRNFFNNNPPPNPPPNNPNPPPPPNPPPPPNNNNNPNNNNNNNNNNPNNPNNNNPPNPPNNPPPNPPNNNNNFFRPPNNNNNNNNNPPPPNNNPNNNNPPPPNPPNPN0NP0PPPPN0N0NPN0NNNPPPPNNFFNFRRPFFNNFNFNNPRRPPNF\n" + ] + } + ], + "source": [ + "print(mv_str(sim_responses[:,:5]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The simulator responses contain 'R' for rising transition, 'F' for falling transition, 'P' for possible positive pulse(s) (010) and 'N' for possible negative pulse(s) (101).\n", + "\n", + "We need to map each of these cases to the final logic values before we can compare:" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"--11001100110011001100--------------------------------0011001100110011001100110011001110--0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110011001100110110011001100110011001100110011001100110011001100110011001\n", + "--00000000000000000000--------------------------------0111010101110000010110000010011010--011111111111110111111100101111111000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111111111111111111111111111100011001000001101000001110101011101111111111111111111110011\n", + "--11010001111110000110--------------------------------1101000001011000100111000101111110--000010111111100000100011101011100010100010001011000111001000010001010111010101010100000000100001011100100001100011110110100000010011100011111100010111100010010111110100011100100011010000010110011000011111100010111110\n", + "--11111101011011010010--------------------------------1001101000001001000101001010110110--000110001010010100101011010111111101101001011001110000000000001001111010011101000100111111111101000100010001001101000011111100110010110110110101111111111111110101101010010100010010000010110010010010110110101111111010\n", + "--00011111111100011111--------------------------------0000111010101010000001010100100010--000011011100000110111010110001100000100110000110011111001101111001110001000110100000100000111110111111111101101111001001100010011111001001111111110000111011110000100101010000001010101011100001100101100011010110110010\n" + ] + } + ], + "source": [ + "sim_responses_final = np.choose(sim_responses, mvarray('0X-10101')) # '0X-1PRFN'\n", + "print(mv_str(sim_responses_final[:,:5]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, first test is a flush test, so we expect 1391 matches." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1391" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.sum(np.min(sim_responses_final == transition_responses, axis=0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working With Delay Information and Timing Simulation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Delay data for gates and interconnect can be loaded from SDF files. In kyupy's timing simulators, delays are associated with the lines between nodes, not with the nodes themselves. Each line in the circuit has a rising delay, a falling delay, a negative pulse threshold, and a positive pulse threshold. " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "from kyupy import sdf\n", + "\n", + "df = sdf.load('../tests/b14.sdf.gz')\n", + "lt = df.annotation(b14, dataset=0, interconnect=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The returned delay information is an `ndarray` with a set of delay values for each line in the circuit." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(46891, 2, 2)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lt.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Number of non-0 values loaded:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "120628" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(lt != 0).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The available timing simulators are `WaveSim` and `WaveSimCuda`.\n", + "They work similarly to `LogicSim` in that they evaluate all cells in topological order.\n", + "Instead of propagating a logic value, however, they propagate waveforms.\n", + "\n", + "`WaveSim` uses the numba just-in-time compiler for acceleration on CPU.\n", + "It falls back to pure python if numba is not available. `WaveSimCuda` uses numba for GPU acceleration.\n", + "If no CUDA card is available, it will fall back to pure python (not jit-compiled for CPU!).\n", + "Pure python is too slow for most purposes.\n", + "\n", + "Both simulators operate data-parallel.\n", + "The following instanciates a new engine for 32 independent timing simulations and each signal line in the circuit can carry at most 16 transitions. All simulators share the same circuit and the same line delay specification." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "from kyupy.wave_sim import WaveSimCuda, TMAX\n", + "import numpy as np\n", + "\n", + "wsim = WaveSimCuda(b14, lt, sims=32, c_caps=16)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are various memories allocated, with waveforms usually being the largest. " + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waveforms : 93908.5 kiB\n", + "State Allocation Table : 1113.4 kiB\n", + "Circuit Timing : 1484.5 kiB\n", + "Circuit Netlist : 1099.0 kiB\n", + "Sequential State : 420.8 kiB\n" + ] + } + ], + "source": [ + "def print_mem(name, arr):\n", + " print(f'{name}: {arr.nbytes / 1024:.1f} kiB')\n", + " \n", + "print_mem('Waveforms ', wsim.c)\n", + "print_mem('State Allocation Table ', wsim.vat)\n", + "print_mem('Circuit Timing ', wsim.timing)\n", + "print_mem('Circuit Netlist ', wsim.ops)\n", + "print_mem('Sequential State ', wsim.s)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a typical simulation loop where the number of patterns is larger than the number of simulators available.\n", + "We simulate `trans_tests_bp`.\n", + "The timing simulator accepts 8-valued `BPArray`s, but it will return response (capture) data in a different format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "sims = 128 # trans_tests.shape[-1] # Feel free to simulate all tests if CUDA is set up correctly.\n", + "\n", + "cdata = np.zeros((len(wsim.interface), sims, 7)) # space to store all capture data\n", + "\n", + "for offset in range(0, sims, wsim.sims):\n", + " wsim.assign(trans_tests_bp, offset=offset)\n", + " wsim.propagate(sims=sims-offset)\n", + " wsim.capture(time=2.5, cdata=cdata, offset=offset) # capture at time 2.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The capture data contains for each PI, PO, and scan flip-flop (axis 0), and each test (axis 1) seven values:\n", + "\n", + "0. Probability of capturing a 1 at the given capture time (same as next value, if no standard deviation given).\n", + "1. A capture value decided by random sampling according to above probability.\n", + "2. The final value (assume a very late capture time).\n", + "3. True, if there was a premature capture (capture error), i.e. final value is different from captured value.\n", + "4. Earliest arrival time. The time at which the output transitioned from its initial value.\n", + "5. Latest stabilization time. The time at which the output transitioned to its final value.\n", + "6. Overflow indicator. If non-zero, some signals in the input cone of this output had more transitions than specified in `wavecaps`. Some transitions have been discarded, the final values in the waveforms are still valid." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(306, 128, 7)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdata.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For validating against known logic values, take `cdata[...,1]`." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mismatch for test pattern 0\n", + "127 of 128 responses matched with simulator\n" + ] + } + ], + "source": [ + "matches = 0\n", + "\n", + "for i in range(cdata.shape[1]):\n", + " response = ''.join('1' if x > 0.5 else '0' for x in cdata[..., i, 1])\n", + " if trans_responses[i].replace('-','0') == response:\n", + " matches += 1\n", + " else:\n", + " print(f'mismatch for test pattern {i}')\n", + "print(f'{matches} of {cdata.shape[1]} responses matched with simulator')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The circuit delay is the maximum among all latest stabilization times:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.17240047454834" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdata[...,5].max()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check for overflows. 
If too many of them occur, increase `wavecaps` during engine instanciation:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdata[...,6].sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check for capture failures:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdata[...,3].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/kyupy/bench.py b/src/kyupy/bench.py index fb1ba1d..df8bea9 100644 --- a/src/kyupy/bench.py +++ b/src/kyupy/bench.py @@ -21,7 +21,7 @@ class BenchTransformer(Transformer): def start(self, _): return self.c - def parameters(self, args): return [self.c.get_or_add_fork(name) for name in args] + def parameters(self, args): return [self.c.get_or_add_fork(str(name)) for name in args] def interface(self, args): self.c.io_nodes.extend(args[0]) diff --git a/src/kyupy/circuit.py b/src/kyupy/circuit.py index 29413f2..8802cec 100644 --- a/src/kyupy/circuit.py +++ b/src/kyupy/circuit.py @@ -286,16 +286,8 @@ class Circuit: def __eq__(self, other): return self.nodes == other.nodes and self.lines == other.lines and self.io_nodes == other.io_nodes - def dump(self): - """Returns a string representation of the circuit and all its nodes. - """ - header = f'{self.name}({",".join([str(n.index) for n in self.io_nodes])})\n' - return header + '\n'.join([str(n) for n in self.nodes]) - def __repr__(self): - name = f' {self.name}' if self.name else '' - return f'' + return f'{{name: "{self.name}", cells: {len(self.cells)}, forks: {len(self.forks)}, lines: {len(self.lines)}, io_nodes: {len(self.io_nodes)}}}' @property def stats(self): @@ -303,18 +295,15 @@ class Circuit: stats['__node__'] = len(self.nodes) stats['__cell__'] = len(self.cells) stats['__fork__'] = len(self.forks) - stats['__port__'] = len(self.io_nodes) + stats['__io__'] = len(self.io_nodes) stats['__line__'] = len(self.lines) for n in self.cells.values(): stats[n.kind] += 1 - if 'dff' in n.kind.lower(): - stats['__dff__'] += 1 - elif 'latch' in n.kind.lower(): - stats['__latch__'] += 1 - elif 'put' not in n.kind.lower(): # no input or output - stats['__comb__'] += 1 + if 'dff' in n.kind.lower(): stats['__dff__'] += 1 + elif 'latch' in n.kind.lower(): stats['__latch__'] += 1 + elif 'put' not in n.kind.lower(): stats['__comb__'] += 1 # no input or output stats['__seq__'] = stats['__dff__'] + stats['__latch__'] - return stats + return dict(stats) def topological_order(self): """Generator function to iterate over all nodes in topological order. 
diff --git a/src/kyupy/logic.py b/src/kyupy/logic.py index 0d1ccdb..7baa503 100644 --- a/src/kyupy/logic.py +++ b/src/kyupy/logic.py @@ -121,6 +121,12 @@ def mvarray(*a): return mva[..., 0, :] +def mv_str(mva, delim='\n'): + sa = np.choose(mva, np.array([*'0X-1PRFN'], dtype=np.unicode_)) + if mva.ndim == 1: return ''.join(sa) + return delim.join([''.join(c) for c in sa.swapaxes(-1,-2)]) + + def mv_to_bp(mva): if mva.ndim == 1: mva = mva[..., np.newaxis] return np.packbits(unpackbits(mva)[...,:3], axis=-2, bitorder='little').swapaxes(-1,-2) diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py index a830e95..c1f9d45 100644 --- a/src/kyupy/logic_sim.py +++ b/src/kyupy/logic_sim.py @@ -51,40 +51,6 @@ class LogicSim(SimOps): self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) - #dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()] - #latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()] - #self.interface = list(circuit.io_nodes) + dffs + latches - - #self.width = len(self.interface) - #"""The number of bits in the circuit state (number of ports + number of state-elements).""" - - #self.state = np.zeros((len(circuit.lines), mdim, nbytes), dtype='uint8') - #self.state_epoch = np.zeros(len(circuit.nodes), dtype='int8') - 1 - #self.tmp = np.zeros((5, mdim, nbytes), dtype='uint8') - #self.zero = np.zeros((mdim, nbytes), dtype='uint8') - #self.epoch = 0 - - #self.latch_dict = dict((n.index, i) for i, n in enumerate(latches)) - #self.latch_state = np.zeros((len(latches), mdim, nbytes), dtype='uint8') - - # known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith('_fct')] - # self.node_fct = [] - # for n in circuit.nodes: - # t = n.kind.lower().replace('__fork__', 'fork') - # t = t.replace('nbuff', 'fork') - # t = t.replace('input', 'fork') - # t = t.replace('output', 'fork') - # t = t.replace('__const0__', 'const0') - # t = t.replace('__const1__', 'const1') - # t = t.replace('tieh', 'const1') - # t = t.replace('ibuff', 'not') - # t = t.replace('inv', 'not') - - # fcts = [f for n, f in known_fct if t.startswith(n)] - # if len(fcts) < 1: - # raise ValueError(f'Unknown node kind {n.kind}') - # self.node_fct.append(fcts[0]) - def __repr__(self): return f'' @@ -96,22 +62,6 @@ class LogicSim(SimOps): :returns: The given stimuli object. """ self.c[self.pippi_c_locs] = self.s[0, self.pippi_s_locs, :self.mdim] - # for node, stim in zip(self.interface, stimuli.data if hasattr(stimuli, 'data') else stimuli): - # if len(node.outs) == 0: continue - # if node.index in self.latch_dict: - # self.latch_state[self.latch_dict[node.index]] = stim - # else: - # outputs = [self.state[line] if line else self.tmp[3] for line in node.outs] - # self.node_fct[node]([stim], outputs) - # for line in node.outs: - # if line is not None: self.state_epoch[line.reader] = self.epoch - # for n in self.circuit.nodes: - # if n.kind in ('__const1__', '__const0__'): - # outputs = [self.state[line] if line else self.tmp[3] for line in n.outs] - # self.node_fct[n]([], outputs) - # for line in n.outs: - # if line is not None: self.state_epoch[line.reader] = self.epoch - # return stimuli def c_to_s(self): #, responses, ff_transitions=False): """Capture the current values at the primary outputs and in the state-elements (flip-flops). 
@@ -129,25 +79,6 @@ class LogicSim(SimOps): if self.mdim == 1: self.s[1, self.poppo_s_locs, 1:2] = self.c[self.poppo_c_locs] - # for node, resp in zip(self.interface, responses.data if hasattr(responses, 'data') else responses): - # if len(node.ins) == 0: continue - # if node.index in self.latch_dict: - # resp[...] = self.state[node.outs[0]] - # else: - # resp[...] = self.state[node.ins[0]] - # if not ff_transitions: continue - # # outs of DFFs contain the previously assigned value (previous state) - # if self.m > 2 and 'dff' in node.kind.lower() and len(node.outs) > 0: - # if node.outs[0] is None: - # resp[1, :] = ~self.state[node.outs[1], 0, :] # assume QN is connected, take inverse of that. - # else: - # resp[1, :] = self.state[node.outs[0], 0, :] - # if self.m > 4: - # resp[..., 2, :] = resp[..., 0, :] ^ resp[..., 1, :] - # # FIXME: We don't handle X or - correctly. - - # return responses - def c_prop(self, sims=None, inject_cb=None): """Propagate the input values towards the outputs (Perform all logic operations in topological order). @@ -201,19 +132,9 @@ class LogicSim(SimOps): elif op == SimPrim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0]) else: print(f'unknown SimPrim {op}') if inject_cb is not None: inject_cb(o0, self.s[o0]) - # for node in self.circuit.topological_order(): - # if self.state_epoch[node] != self.epoch: continue - # inputs = [self.state[line] if line else self.zero for line in node.ins] - # outputs = [self.state[line] if line else self.tmp[3] for line in node.outs] - # if node.index in self.latch_dict: - # inputs.append(self.latch_state[self.latch_dict[node.index]]) - # self.node_fct[node](inputs, outputs) - # for line in node.outs: - # if inject_cb is not None: inject_cb(line, self.state[line]) - # self.state_epoch[line.reader] = self.epoch - # self.epoch = (self.epoch + 1) % 128 def s_ppo_to_ppi(self): + # TODO: handle latches correctly if self.m == 2: self.s[0, self.ppio_s_locs, 0] = self.s[1, self.ppio_s_locs, 0] else: @@ -236,60 +157,6 @@ class LogicSim(SimOps): self.c_to_s() self.s_ppo_to_ppi() - # def fork_fct(self, inputs, outputs): - # for o in outputs: o[...] = inputs[0] - - # def const0_fct(self, _, outputs): - # for o in outputs: o[...] = 0 - - # def const1_fct(self, _, outputs): - # for o in outputs: - # o[...] 
= 0 - # logic.bp_not(o, o) - - # def not_fct(self, inputs, outputs): - # logic.bp_not(outputs[0], inputs[0]) - - # def and_fct(self, inputs, outputs): - # logic.bp_and(outputs[0], *inputs) - - # def or_fct(self, inputs, outputs): - # logic.bp_or(outputs[0], *inputs) - - # def xor_fct(self, inputs, outputs): - # logic.bp_xor(outputs[0], *inputs) - - # def sdff_fct(self, inputs, outputs): - # logic.bp_buf(outputs[0], inputs[0]) - # if len(outputs) > 1: - # logic.bp_not(outputs[1], inputs[0]) - - # def dff_fct(self, inputs, outputs): - # logic.bp_buf(outputs[0], inputs[0]) - # if len(outputs) > 1: - # logic.bp_not(outputs[1], inputs[0]) - - # def latch_fct(self, inputs, outputs): - # logic.bp_latch(outputs[0], inputs[0], inputs[1], inputs[2]) - # if len(outputs) > 1: - # logic.bp_not(outputs[1], inputs[0]) - - # def nand_fct(self, inputs, outputs): - # logic.bp_and(outputs[0], *inputs) - # logic.bp_not(outputs[0], outputs[0]) - - # def nor_fct(self, inputs, outputs): - # logic.bp_or(outputs[0], *inputs) - # logic.bp_not(outputs[0], outputs[0]) - - # def xnor_fct(self, inputs, outputs): - # logic.bp_xor(outputs[0], *inputs) - # logic.bp_not(outputs[0], outputs[0]) - - # def aoi21_fct(self, inputs, outputs): - # logic.bp_and(self.tmp[0], inputs[0], inputs[1]) - # logic.bp_or(outputs[0], self.tmp[0], inputs[2]) - # logic.bp_not(outputs[0], outputs[0]) @numba.njit def _prop_cpu(ops, vat, c): diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py index a941eac..af1f02b 100644 --- a/src/kyupy/wave_sim.py +++ b/src/kyupy/wave_sim.py @@ -52,7 +52,7 @@ class WaveSim(SimOps): assert c_caps > 0 and c_caps % 4 == 0 super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks) self.sims = sims - + self.c_len *= 4 self.vat[...,0:2] *= 4 @@ -81,7 +81,7 @@ class WaveSim(SimOps): transitions than specified in ``c_caps``. Some transitions have been discarded, the final values in the waveforms are still valid. 
""" - + self.params = np.zeros((sims, 4), dtype=np.float32) self.params[...,0] = 1.0 @@ -203,7 +203,7 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): if int(param[1]) == c_idx: c += param[2+z_cur] d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0] if int(param[1]) == d_idx: d += param[2+z_cur] - + previous_t = TMIN current_t = min(a, b, c, d) @@ -220,8 +220,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): a += param[2+(z_val^1)] thresh += param[2+z_val] inputs ^= 1 - next_t = a - + next_t = a + elif b == current_t: b_cur += 1 b = cbuf[b_mem + b_cur, st_idx] @@ -232,7 +232,7 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): thresh += param[2+z_val] inputs ^= 2 next_t = b - + elif c == current_t: c_cur += 1 c = cbuf[c_mem + c_cur, st_idx] @@ -242,8 +242,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): c += param[2+(z_val^1)] thresh += param[2+z_val] inputs ^= 4 - next_t = c - + next_t = c + else: d_cur += 1 d = cbuf[d_mem + d_cur, st_idx] @@ -253,8 +253,8 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): d += param[2+(z_val^1)] thresh += param[2+z_val] inputs ^= 8 - next_t = d - + next_t = d + if (z_cur & 1) != ((lut >> inputs) & 1): # we generate a toggle in z_mem, if: # ( it is the first toggle in z_mem OR @@ -272,12 +272,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): else: z_cur -= 1 previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN - + current_t = min(a, b, c, d) # generate overflow flag or propagate from input cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) - + @numba.njit def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed): @@ -350,7 +350,7 @@ class WaveSimCuda(WaveSim): self.vat = cuda.to_device(self.vat) self.timing = cuda.to_device(self.timing) self.params = cuda.to_device(self.params) - + self._block_dim = (32, 16) def s_to_c(self): @@ -361,7 +361,7 @@ class WaveSimCuda(WaveSim): gx = math.ceil(x / self._block_dim[0]) gy = math.ceil(y / self._block_dim[1]) return gx, gy - + def c_prop(self, sims=None, sd=0.0, seed=1): sims = min(sims or self.sims, self.sims) for op_start, op_stop in zip(self.level_starts, self.level_stops): @@ -369,12 +369,12 @@ class WaveSimCuda(WaveSim): wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0), sims, self.timing, self.params, sd, seed) cuda.synchronize() - + def c_to_s(self, time=TMAX, sd=0.0, seed=1): grid_dim = self._grid_dim(self.sims, self.s_len) wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset, time, sd * math.sqrt(2), seed) - + def s_ppo_to_ppi(self, time=0.0): grid_dim = self._grid_dim(self.sims, self.s_len) ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset) @@ -436,7 +436,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim d_idx = ops[op_idx, 5] param = param[st_idx] - + # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>> overflows = int(0) @@ -451,7 +451,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim a_cur = int(0) b_cur = int(0) c_cur = int(0) - d_cur = int(0) + d_cur = int(0) z_cur = lut & 1 if z_cur == 1: cbuf[z_mem, st_idx] = TMIN @@ -464,7 +464,7 @@ def wave_eval_gpu(ops, op_start, 
op_stop, cbuf, vat, st_start, st_stop, line_tim if int(param[1]) == c_idx: c += param[2+z_cur] d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0] if int(param[1]) == d_idx: d += param[2+z_cur] - + previous_t = TMIN current_t = min(a, b, c, d) @@ -481,8 +481,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim a += param[2+(z_val^1)] thresh += param[2+z_val] inputs ^= 1 - next_t = a - + next_t = a + elif b == current_t: b_cur += 1 b = cbuf[b_mem + b_cur, st_idx] @@ -493,7 +493,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim thresh += param[2+z_val] inputs ^= 2 next_t = b - + elif c == current_t: c_cur += 1 c = cbuf[c_mem + c_cur, st_idx] @@ -503,8 +503,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim c += param[2+(z_val^1)] thresh += param[2+z_val] inputs ^= 4 - next_t = c - + next_t = c + else: d_cur += 1 d = cbuf[d_mem + d_cur, st_idx] @@ -514,8 +514,8 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim d += param[2+(z_val^1)] thresh += param[2+z_val] inputs ^= 8 - next_t = d - + next_t = d + if (z_cur & 1) != ((lut >> inputs) & 1): # we generate a toggle in z_mem, if: # ( it is the first toggle in z_mem OR @@ -533,7 +533,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim else: z_cur -= 1 previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN - + current_t = min(a, b, c, d) # generate overflow flag or propagate from input @@ -603,10 +603,10 @@ def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset): x, y = cuda.grid(2) if y >= s.shape[0]: return if x >= s.shape[1]: return - + if vat[ppi_offset + y, 0] < 0: return if vat[ppo_offset + y, 0] < 0: return - + s[y, x, 0] = s[y, x, 2] s[y, x, 1] = time s[y, x, 2] = s[y, x, 8] diff --git a/src/kyupy/wave_sim_old.py b/src/kyupy/wave_sim_old.py deleted file mode 100644 index fdfdd1c..0000000 --- a/src/kyupy/wave_sim_old.py +++ /dev/null @@ -1,961 +0,0 @@ -"""High-throughput combinational logic timing simulators. - -These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`. -They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs. -Instead of propagating logic values, these simulators propagate signal histories (waveforms). -They are designed to run many simulations in parallel and while their latencies are quite high, they can achieve -high throughput. - -The simulators are not event-based and are not capable of simulating sequential circuits directly. - -Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the derived class -:py:class:`WaveSimCuda` runs on the GPU. -""" - -import math -from bisect import bisect, insort_left - -import numpy as np - -from . 
import numba, cuda, hr_bytes - - -TMAX = np.float32(2 ** 127) -"""A large 32-bit floating point value used to mark the end of a waveform.""" -TMAX_OVL = np.float32(1.1 * 2 ** 127) -"""A large 32-bit floating point value used to mark the end of a waveform that -may be incomplete due to an overflow.""" -TMIN = np.float32(-2 ** 127) -"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1.""" - - -class Heap: - def __init__(self): - self.chunks = dict() # map start location to chunk size - self.released = list() # chunks that were released - self.current_size = 0 - self.max_size = 0 - - def alloc(self, size): - for idx, loc in enumerate(self.released): - if self.chunks[loc] == size: - del self.released[idx] - return loc - if self.chunks[loc] > size: # split chunk - chunksize = self.chunks[loc] - self.chunks[loc] = size - self.chunks[loc + size] = chunksize - size - self.released[idx] = loc + size # move released pointer: loc -> loc+size - return loc - # no previously released chunk; make new one - loc = self.current_size - self.chunks[loc] = size - self.current_size += size - self.max_size = max(self.max_size, self.current_size) - return loc - - def free(self, loc): - size = self.chunks[loc] - if loc + size == self.current_size: # end of managed area, remove chunk - del self.chunks[loc] - self.current_size -= size - # check and remove prev chunk if free - if len(self.released) > 0: - prev = self.released[-1] - if prev + self.chunks[prev] == self.current_size: - chunksize = self.chunks[prev] - del self.chunks[prev] - del self.released[-1] - self.current_size -= chunksize - return - released_idx = bisect(self.released, loc) - if released_idx < len(self.released) and loc + size == self.released[released_idx]: # next chunk is free, merge - chunksize = size + self.chunks[loc + size] - del self.chunks[loc + size] - self.chunks[loc] = chunksize - size = self.chunks[loc] - self.released[released_idx] = loc - else: - insort_left(self.released, loc) # put in a new release - if released_idx > 0: # check if previous chunk is free - prev = self.released[released_idx - 1] - if prev + self.chunks[prev] == loc: # previous chunk is adjacent to freed one, merge - chunksize = size + self.chunks[prev] - del self.chunks[loc] - self.chunks[prev] = chunksize - del self.released[released_idx] - - def __repr__(self): - r = [] - for loc in sorted(self.chunks.keys()): - size = self.chunks[loc] - released_idx = bisect(self.released, loc) - is_released = released_idx > 0 and len(self.released) > 0 and self.released[released_idx - 1] == loc - r.append(f'{loc:5d}: {"free" if is_released else "used"} {size}') - return "\n".join(r) - - -class WaveSim: - """A waveform-based combinational logic timing simulator running on CPU. - - :param circuit: The circuit to simulate. - :param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details) - :param sims: The number of parallel simulations. - :param wavecaps: The number of floats available in each waveform. Waveforms are encoding the signal switching - history by storing transition times. The waveform capacity roughly corresponds to the number of transitions - that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are - generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow - flag is set. If an integer is given, all waveforms are set to that same capacity. 
With an array of length - ``len(circuit.lines)`` the capacity can be controlled for each intermediate waveform individually. - :param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time - by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes - are ignored. - :param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces - memory footprint, but intermediate signal waveforms become unaccessible after a propagation. - """ - def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True): - self.circuit = circuit - self.sims = sims - self.overflows = 0 - self.interface = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()] - - self.lst_eat_valid = False - - self.cdata = np.zeros((len(self.interface), sims, 7), dtype='float32') - - self.sdata = np.zeros((sims, 4), dtype='float32') - self.sdata[...,0] = 1.0 - - if isinstance(wavecaps, int): - wavecaps = [wavecaps] * len(circuit.lines) - - intf_wavecap = 4 # sufficient for storing only 1 transition. - - # indices for state allocation table (sat) - self.zero_idx = len(circuit.lines) - self.tmp_idx = self.zero_idx + 1 - self.ppi_offset = self.tmp_idx + 1 - self.ppo_offset = self.ppi_offset + len(self.interface) - self.sat_length = self.ppo_offset + len(self.interface) - - # translate circuit structure into self.ops - ops = [] - interface_dict = dict((n, i) for i, n in enumerate(self.interface)) - for n in circuit.topological_order(): - if n in interface_dict: - inp_idx = self.ppi_offset + interface_dict[n] - if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI - ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx)) - if 'dff' in n.kind.lower(): # second output of DFF is inverted - if len(n.outs) > 1 and n.outs[1] is not None: - ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx)) - else: # if not DFF, no output is inverted. 
- for o_line in n.outs[1:]: - if o_line is not None: - ops.append((0b1010, o_line.index, inp_idx, self.zero_idx)) - else: # regular node, not PI/PPI or PO/PPO - o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx - i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx - i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx - kind = n.kind.lower() - if kind == '__fork__': - if not strip_forks: - for o_line in n.outs: - if o_line is not None: - ops.append((0b1010, o_line.index, i0_idx, i1_idx)) - elif kind.startswith('nand'): - ops.append((0b0111, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('nor'): - ops.append((0b0001, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('and'): - ops.append((0b1000, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('or'): - ops.append((0b1110, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('xor'): - ops.append((0b0110, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('xnor'): - ops.append((0b1001, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('not') or kind.startswith('inv') or kind.startswith('ibuf'): - ops.append((0b0101, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('buf') or kind.startswith('nbuf'): - ops.append((0b1010, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('__const1__') or kind.startswith('tieh'): - ops.append((0b0101, o0_idx, i0_idx, i1_idx)) - elif kind.startswith('__const0__') or kind.startswith('tiel'): - ops.append((0b1010, o0_idx, i0_idx, i1_idx)) - else: - print('unknown gate type', kind) - self.ops = np.asarray(ops, dtype='int32') - - # create a map from fanout lines to stem lines for fork stripping - stems = np.zeros(self.sat_length, dtype='int32') - 1 # default to -1: 'no fanout line' - if strip_forks: - for f in circuit.forks.values(): - prev_line = f.ins[0] - while prev_line.driver.kind == '__fork__': - prev_line = prev_line.driver.ins[0] - stem_idx = prev_line.index - for ol in f.outs: - stems[ol] = stem_idx - - # calculate level (distance from PI/PPI) and reference count for each line - levels = np.zeros(self.sat_length, dtype='int32') - ref_count = np.zeros(self.sat_length, dtype='int32') - level_starts = [0] - current_level = 1 - for i, op in enumerate(self.ops): - # if we fork-strip, always take the stems for determining fan-in level - i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2] - i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3] - if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level: - current_level += 1 - level_starts.append(i) - levels[op[1]] = current_level # set level of the output line - ref_count[i0_idx] += 1 - ref_count[i1_idx] += 1 - self.level_starts = np.asarray(level_starts, dtype='int32') - self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32') - - # state allocation table. 
maps line and interface indices to self.state memory locations - self.sat = np.zeros((self.sat_length, 3), dtype='int') - self.sat[:, 0] = -1 - - h = Heap() - - # allocate and keep memory for special fields - self.sat[self.zero_idx] = h.alloc(intf_wavecap), intf_wavecap, 0 - self.sat[self.tmp_idx] = h.alloc(intf_wavecap), intf_wavecap, 0 - ref_count[self.zero_idx] += 1 - ref_count[self.tmp_idx] += 1 - - # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later) - for i, n in enumerate(self.interface): - if len(n.outs) > 0: - self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0 - ref_count[self.ppi_offset + i] += 1 - if len(n.ins) > 0: - i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0] - ref_count[i0_idx] += 1 - - # allocate memory for the rest of the circuit - for op_start, op_stop in zip(self.level_starts, self.level_stops): - free_list = [] - for op in self.ops[op_start:op_stop]: - # if we fork-strip, always take the stems - i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2] - i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3] - ref_count[i0_idx] -= 1 - ref_count[i1_idx] -= 1 - if ref_count[i0_idx] <= 0: free_list.append(self.sat[i0_idx, 0]) - if ref_count[i1_idx] <= 0: free_list.append(self.sat[i1_idx, 0]) - o_idx = op[1] - cap = wavecaps[o_idx] - self.sat[o_idx] = h.alloc(cap), cap, 0 - if not keep_waveforms: - for loc in free_list: - h.free(loc) - - # copy memory location and capacity from stems to fanout lines - for lidx, stem in enumerate(stems): - if stem >= 0: # if at a fanout line - self.sat[lidx] = self.sat[stem] - - # copy memory location to PO/PPO area - for i, n in enumerate(self.interface): - if len(n.ins) > 0: - self.sat[self.ppo_offset + i] = self.sat[n.ins[0]] - - # pad timing - self.timing = np.zeros((self.sat_length, 2, 2)) - self.timing[:len(timing)] = timing - - # allocate self.state - self.state = np.zeros((h.max_size, sims), dtype='float32') + TMAX - - m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8') - m0 = ~m1 - self.mask = np.rollaxis(np.vstack((m0, m1)), 1) - - def __repr__(self): - total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.cdata.nbytes - return f'' - - def get_line_delay(self, line, polarity): - """Returns the current delay of the given ``line`` and ``polarity`` in the simulation model.""" - return self.timing[line, 0, polarity] - - def set_line_delay(self, line, polarity, delay): - """Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model.""" - self.timing[line, 0, polarity] = delay - - def assign(self, vectors, time=0.0, offset=0): - """Assigns new values to the primary inputs and state-elements. - - :param vectors: The values to assign preferably in 8-valued logic. The values are converted to - appropriate waveforms with or one transition (``RISE``, ``FALL``) no transitions - (``ZERO``, ``ONE``, and others). - :type vectors: :py:class:`~kyupy.logic.BPArray` - :param time: The transition time of the generated waveforms. - :param offset: The offset into the vector set. The vector assigned to the first simulator is - ``vectors[offset]``. 
- """ - nvectors = min(len(vectors) - offset, self.sims) - for i in range(len(self.interface)): - ppi_loc = self.sat[self.ppi_offset + i, 0] - if ppi_loc < 0: continue - for p in range(nvectors): - vector = p + offset - a = vectors.data[i, :, vector // 8] - m = self.mask[vector % 8] - toggle = 0 - if len(a) <= 2: - if a[0] & m[1]: - self.state[ppi_loc, p] = TMIN - toggle += 1 - else: - if a[1] & m[1]: - self.state[ppi_loc, p] = TMIN - toggle += 1 - if (a[2] & m[1]) and ((a[0] & m[1]) != (a[1] & m[1])): - self.state[ppi_loc + toggle, p] = time - toggle += 1 - self.state[ppi_loc + toggle, p] = TMAX - - def propagate(self, sims=None, sd=0.0, seed=1): - """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs. - - :param sims: Number of parallel simulations to execute. If None, all available simulations are performed. - :param sd: Standard deviation for injection of random delay variation. Active, if value is positive. - :param seed: Random seed for delay variations. - """ - sims = min(sims or self.sims, self.sims) - for op_start, op_stop in zip(self.level_starts, self.level_stops): - self.overflows += level_eval(self.ops, op_start, op_stop, self.state, self.sat, 0, sims, - self.timing, self.sdata, sd, seed) - self.lst_eat_valid = False - - def wave(self, line, vector): - # """Returns the desired waveform from the simulation state. Only valid, if simulator was - # instantiated with ``keep_waveforms=True``.""" - if line < 0: - return [TMAX] - mem, wcap, _ = self.sat[line] - if mem < 0: - return [TMAX] - return self.state[mem:mem + wcap, vector] - - def wave_ppi(self, i, vector): - return self.wave(self.ppi_offset + i, vector) - - def wave_ppo(self, o, vector): - return self.wave(self.ppo_offset + o, vector) - - def capture(self, time=TMAX, sd=0.0, seed=1, cdata=None, offset=0): - """Simulates a capture operation at all state-elements and primary outputs. - - The capture analyzes the propagated waveforms at and around the given capture time and returns - various results for each capture operation. - - :param time: The desired capture time. By default, a capture of the settled value is performed. - :param sd: A standard deviation for uncertainty in the actual capture time. - :param seed: The random seed for a capture with uncertainty. - :param cdata: An array to copy capture data into (optional). See the return value for details. - :param offset: An offset into the supplied capture data array. - :return: The capture data as numpy array. - - The 3-dimensional capture data array contains for each interface node (axis 0), - and each test (axis 1), seven values: - - 0. Probability of capturing a 1 at the given capture time (same as next value, if no - standard deviation given). - 1. A capture value decided by random sampling according to above probability and given seed. - 2. The final value (assume a very late capture time). - 3. True, if there was a premature capture (capture error), i.e. final value is different - from captured value. - 4. Earliest arrival time. The time at which the output transitioned from its initial value. - 5. Latest stabilization time. The time at which the output transitioned to its final value. - 6. Overflow indicator. If non-zero, some signals in the input cone of this output had more - transitions than specified in ``wavecaps``. Some transitions have been discarded, the - final values in the waveforms are still valid. 
- """ - for i, node in enumerate(self.interface): - if len(node.ins) == 0: continue - for p in range(self.sims): - self.cdata[i, p] = self.capture_wave(self.ppo_offset + i, p, time, sd, seed) - if cdata is not None: - assert offset < cdata.shape[1] - cap_dim = min(cdata.shape[1] - offset, self.sims) - cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim] - self.lst_eat_valid = True - return self.cdata - - def reassign(self, time=0.0): - """Re-assigns the last capture to the appropriate pseudo-primary inputs. Generates a new set of - waveforms at the PPIs that start with the previous final value of that PPI, and transitions at the - given time to the value captured in a previous simulation. :py:func:`~WaveSim.capture` must be called - prior to this function. The final value of each PPI is taken from the randomly sampled concrete logic - values in the capture data. - - :param time: The transition time at the inputs (usually 0.0). - """ - for i in range(len(self.interface)): - ppi_loc = self.sat[self.ppi_offset + i, 0] - ppo_loc = self.sat[self.ppo_offset + i, 0] - if ppi_loc < 0 or ppo_loc < 0: continue - for sidx in range(self.sims): - ival = self.val(self.ppi_offset + i, sidx, TMAX) > 0.5 - oval = self.cdata[i, sidx, 1] > 0.5 - toggle = 0 - if ival: - self.state[ppi_loc, sidx] = TMIN - toggle += 1 - if ival != oval: - self.state[ppi_loc + toggle, sidx] = time - toggle += 1 - self.state[ppi_loc + toggle, sidx] = TMAX - - def eat(self, line, vector): - eat = TMAX - for t in self.wave(line, vector): - if t >= TMAX: break - if t <= TMIN: continue - eat = min(eat, t) - return eat - - def lst(self, line, vector): - lst = TMIN - for t in self.wave(line, vector): - if t >= TMAX: break - if t <= TMIN: continue - lst = max(lst, t) - return lst - - def lst_ppo(self, o, vector): - if not self.lst_eat_valid: - self.capture() - return self.cdata[o, vector, 5] - - def toggles(self, line, vector): - tog = 0 - for t in self.wave(line, vector): - if t >= TMAX: break - if t <= TMIN: continue - tog += 1 - return tog - - def _vals(self, idx, vector, times, sd=0.0): - s_sqrt2 = sd * math.sqrt(2) - m = 0.5 - accs = [0.0] * len(times) - values = [0] * len(times) - for t in self.wave(idx, vector): - if t >= TMAX: break - for idx, time in enumerate(times): - if t < time: - values[idx] = values[idx] ^ 1 - m = -m - if t <= TMIN: continue - if s_sqrt2 > 0: - for idx, time in enumerate(times): - accs[idx] += m * (1 + math.erf((t - time) / s_sqrt2)) - if (m < 0) and (s_sqrt2 > 0): - for idx, time in enumerate(times): - accs[idx] += 1 - if s_sqrt2 == 0: - return values - return accs - - def vals(self, line, vector, times, sd=0): - return self._vals(line, vector, times, sd) - - def val(self, line, vector, time=TMAX, sd=0): - return self.capture_wave(line, vector, time, sd)[0] - - def vals_ppo(self, o, vector, times, sd=0): - return self._vals(self.ppo_offset + o, vector, times, sd) - - def val_ppo(self, o, vector, time=TMAX, sd=0): - if not self.lst_eat_valid: - self.capture(time, sd) - return self.cdata[o, vector, 0] - - def capture_wave(self, line, vector, time=TMAX, sd=0.0, seed=1): - s_sqrt2 = sd * math.sqrt(2) - m = 0.5 - acc = 0.0 - eat = TMAX - lst = TMIN - tog = 0 - ovl = 0 - val = int(0) - final = int(0) - for t in self.wave(line, vector): - if t >= TMAX: - if t == TMAX_OVL: - ovl = 1 - break - m = -m - final ^= 1 - if t < time: - val ^= 1 - if t <= TMIN: continue - if s_sqrt2 > 0: - acc += m * (1 + math.erf((t - time) / s_sqrt2)) - eat = min(eat, t) - lst = max(lst, t) - tog += 1 - if s_sqrt2 > 0: - if m 
-@numba.njit
-def level_eval(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sdata, sd, seed):
-    overflows = 0
-    for op_idx in range(op_start, op_stop):
-        op = ops[op_idx]
-        for st_idx in range(st_start, st_stop):
-            overflows += wave_eval(op, state, sat, st_idx, line_times, sdata[st_idx], sd, seed)
-    return overflows
-
-
-@numba.njit
-def rand_gauss(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for _ in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@numba.njit
-def wave_eval(op, state, sat, st_idx, line_times, sdata, sd=0.0, seed=0):
-    lut, z_idx, a_idx, b_idx = op
-    overflows = int(0)
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = sat[a_idx, 0]
-    b_mem = sat[b_idx, 0]
-    z_mem, z_cap, _ = sat[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    z_cur = lut & 1
-    if z_cur == 1:
-        state[z_mem, st_idx] = TMIN
-
-    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) * sdata[0]
-    if int(sdata[1]) == a_idx: a += sdata[2+z_cur]
-    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) * sdata[0]
-    if int(sdata[1]) == b_idx: b += sdata[2+z_cur]
-
-    previous_t = TMIN
-
-    current_t = min(a, b)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if b < a:
-            b_cur += 1
-            b = state[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) * sdata[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) * sdata[0]
-            if int(sdata[1]) == b_idx:
-                b += sdata[2+(z_val^1)]
-                thresh += sdata[2+z_val]
-            inputs ^= 2
-            next_t = b
-        else:
-            a_cur += 1
-            a = state[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) * sdata[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) * sdata[0]
-            if int(sdata[1]) == a_idx:
-                a += sdata[2+(z_val^1)]
-                thresh += sdata[2+z_val]
-            inputs ^= 1
-            next_t = a
-
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #     following toggle is earlier OR
-            #     pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    state[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = state[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                if z_cur > 0:
-                    previous_t = state[z_mem + z_cur - 1, st_idx]
-                else:
-                    previous_t = TMIN
-        current_t = min(a, b)
-
-    if overflows > 0:
-        state[z_mem + z_cur, st_idx] = TMAX_OVL
-    else:
-        state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
-
-    return overflows
-
-
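Aside (not part of the patch): the pulse-filtering comment inside wave_eval above can be restated in isolation. This is a simplified stand-alone illustration of the decision, not the kernel itself; `thresh` corresponds to the pulse-width limits stored in line_times[..., 1, ...]:

    # A new output transition at current_t is kept if it is the first one,
    # or the next input event comes even earlier, or the resulting pulse
    # (current_t - previous_t) is wider than the threshold.
    def keep_toggle(z_cur, next_t, current_t, previous_t, thresh):
        return z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh

    assert not keep_toggle(1, 10.0, 5.0, 4.9, 0.2)  # 0.1-wide glitch is filtered out
    assert keep_toggle(1, 10.0, 5.0, 4.0, 0.2)      # 1.0-wide pulse survives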
-class WaveSimCuda(WaveSim):
-    """A GPU-accelerated waveform-based combinational logic timing simulator.
-
-    The API is the same as for :py:class:`WaveSim`.
-    All internal memories are mirrored into GPU memory upon construction.
-    Some operations like access to single waveforms can involve large communication overheads.
-    """
-    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
-        super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
-
-        self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
-
-        self.d_state = cuda.to_device(self.state)
-        self.d_sat = cuda.to_device(self.sat)
-        self.d_ops = cuda.to_device(self.ops)
-        self.d_timing = cuda.to_device(self.timing)
-        self.d_tdata = cuda.to_device(self.tdata)
-        self.d_cdata = cuda.to_device(self.cdata)
-        self.d_sdata = cuda.to_device(self.sdata)
-
-        self._block_dim = (32, 16)
-
-    def __repr__(self):
-        total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.timing.nbytes + \
-                    self.tdata.nbytes + self.cdata.nbytes
-        return f''
-
-    def get_line_delay(self, line, polarity):
-        return self.d_timing[line, 0, polarity]
-
-    def set_line_delay(self, line, polarity, delay):
-        self.d_timing[line, 0, polarity] = delay
-
-    def sdata_to_device(self):
-        cuda.to_device(self.sdata, to=self.d_sdata)
-
-    def assign(self, vectors, time=0.0, offset=0):
-        assert (offset % 8) == 0
-        byte_offset = offset // 8
-        assert byte_offset < vectors.data.shape[-1]
-        pdim = min(vectors.data.shape[-1] - byte_offset, self.tdata.shape[-1])
-
-        self.tdata[..., 0:pdim] = vectors.data[..., byte_offset:pdim + byte_offset]
-        if vectors.m == 2:
-            self.tdata[:, 2, 0:pdim] = 0
-        cuda.to_device(self.tdata, to=self.d_tdata)
-
-        grid_dim = self._grid_dim(self.sims, len(self.interface))
-        assign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset,
-                                                 len(self.interface), self.d_tdata, time)
-
-    def _grid_dim(self, x, y):
-        gx = math.ceil(x / self._block_dim[0])
-        gy = math.ceil(y / self._block_dim[1])
-        return gx, gy
-
-    def propagate(self, sims=None, sd=0.0, seed=1):
-        sims = min(sims or self.sims, self.sims)
-        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.sat, int(0),
-                                                   sims, self.d_timing, self.d_sdata, sd, seed)
-        cuda.synchronize()
-        self.lst_eat_valid = False
-
-    def wave(self, line, vector):
-        if line < 0:
-            return [TMAX]
-        mem, wcap, _ = self.sat[line]
-        if mem < 0:
-            return [TMAX]
-        return self.d_state[mem:mem + wcap, vector]
-
-    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
-        grid_dim = self._grid_dim(self.sims, len(self.interface))
-        capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
-                                                  self.d_cdata, time, sd * math.sqrt(2), seed)
-        self.cdata[...] = self.d_cdata
-        if cdata is not None:
-            assert offset < cdata.shape[1]
-            cap_dim = min(cdata.shape[1] - offset, self.sims)
-            cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
-        self.lst_eat_valid = True
-        return self.cdata
-
-    def reassign(self, time=0.0):
-        grid_dim = self._grid_dim(self.sims, len(self.interface))
-        reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset,
-                                                   self.d_cdata, time)
-        cuda.synchronize()
-
-    def wavecaps(self):
-        gx = math.ceil(len(self.circuit.lines) / 512)
-        wavecaps_kernel[gx, 512](self.d_state, self.d_sat, self.sims)
-        self.sat[...] = self.d_sat
-        return self.sat[..., 2]
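Aside (not part of the patch): since the class docstring above states the API mirrors WaveSim, the same flow from the earlier sketch applies; only the construction changes. The names `b01`, `delays`, and `tests` carry over from that sketch and are assumptions; a CUDA-capable GPU and numba.cuda are required:

    sim = WaveSimCuda(b01, delays, sims=256, wavecaps=16)
    sim.assign(tests)       # copies the packed test data to the GPU, then launches assign_kernel
    sim.propagate()         # one wave_kernel launch per level
    cdata = sim.capture()   # capture_kernel runs on the GPU; results are copied back into sim.cdata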
-
-
-@cuda.jit()
-def wavecaps_kernel(state, sat, sims):
-    idx = cuda.grid(1)
-    if idx >= len(sat): return
-
-    lidx, lcap, _ = sat[idx]
-    if lidx < 0: return
-
-    wcap = 0
-    for sidx in range(sims):
-        for tidx in range(lcap):
-            t = state[lidx + tidx, sidx]
-            if tidx > wcap:
-                wcap = tidx
-            if t >= TMAX: break
-
-    sat[idx, 2] = wcap + 1
-
-
-@cuda.jit()
-def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
-    vector, y = cuda.grid(2)
-    if vector >= state.shape[-1]: return
-    if ppo_offset + y >= len(sat): return
-
-    ppo, _, _ = sat[ppo_offset + y]
-    ppi, ppi_cap, _ = sat[ppi_offset + y]
-    if ppo < 0: return
-    if ppi < 0: return
-
-    ppo_val = int(cdata[y, vector, 1])
-    ppi_val = int(0)
-    for tidx in range(ppi_cap):
-        t = state[ppi + tidx, vector]
-        if t >= TMAX: break
-        ppi_val ^= 1
-
-    # make new waveform at PPI
-    toggle = 0
-    if ppi_val:
-        state[ppi + toggle, vector] = TMIN
-        toggle += 1
-    if ppi_val != ppo_val:
-        state[ppi + toggle, vector] = ppi_time
-        toggle += 1
-    state[ppi + toggle, vector] = TMAX
-
-
-@cuda.jit()
-def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed):
-    x, y = cuda.grid(2)
-    if ppo_offset + y >= len(sat): return
-    line, tdim, _ = sat[ppo_offset + y]
-    if line < 0: return
-    if x >= state.shape[-1]: return
-    vector = x
-    m = 0.5
-    acc = 0.0
-    eat = TMAX
-    lst = TMIN
-    tog = 0
-    ovl = 0
-    val = int(0)
-    final = int(0)
-    for tidx in range(tdim):
-        t = state[line + tidx, vector]
-        if t >= TMAX:
-            if t == TMAX_OVL:
-                ovl = 1
-            break
-        m = -m
-        final ^= 1
-        if t < time:
-            val ^= 1
-        if t <= TMIN: continue
-        if s_sqrt2 > 0:
-            acc += m * (1 + math.erf((t - time) / s_sqrt2))
-        eat = min(eat, t)
-        lst = max(lst, t)
-        tog += 1
-    if s_sqrt2 > 0:
-        if m < 0:
-            acc += 1
-        if acc >= 0.99:
-            val = 1
-        elif acc > 0.01:
-            seed = (seed << 4) + (vector << 20) + (y << 1)
-            seed = int(0xDEECE66D) * seed + 0xB
-            seed = int(0xDEECE66D) * seed + 0xB
-            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
-            val = rnd < acc
-        else:
-            val = 0
-    else:
-        acc = val
-
-    cdata[y, vector, 0] = acc
-    cdata[y, vector, 1] = val
-    cdata[y, vector, 2] = final
-    cdata[y, vector, 3] = (val != final)
-    cdata[y, vector, 4] = eat
-    cdata[y, vector, 5] = lst
-    cdata[y, vector, 6] = ovl
-
-
-@cuda.jit()
-def assign_kernel(state, sat, ppi_offset, intf_len, tdata, time):
-    x, y = cuda.grid(2)
-    if y >= intf_len: return
-    line = sat[ppi_offset + y, 0]
-    if line < 0: return
-    sdim = state.shape[-1]
-    if x >= sdim: return
-    vector = x
-    a0 = tdata[y, 0, vector // 8]
-    a1 = tdata[y, 1, vector // 8]
-    a2 = tdata[y, 2, vector // 8]
-    m = np.uint8(1 << (7 - (vector % 8)))
-    toggle = 0
-    if a1 & m:
-        state[line + toggle, x] = TMIN
-        toggle += 1
-    if (a2 & m) and ((a0 & m) != (a1 & m)):
-        state[line + toggle, x] = time
-        toggle += 1
-    state[line + toggle, x] = TMAX
-
-
-@cuda.jit(device=True)
-def rand_gauss_dev(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for _ in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@cuda.jit()
-def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sdata, sd, seed):
-    x, y = cuda.grid(2)
-    st_idx = st_start + x
-    op_idx = op_start + y
-    if st_idx >= st_stop: return
-    if op_idx >= op_stop: return
-    lut = ops[op_idx, 0]
-    z_idx = ops[op_idx, 1]
-    a_idx = ops[op_idx, 2]
-    b_idx = ops[op_idx, 3]
-    overflows = int(0)
-    sdata = sdata[st_idx]
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = sat[a_idx, 0]
-    b_mem = sat[b_idx, 0]
-    z_mem, z_cap, _ = sat[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    z_cur = lut & 1
-    if z_cur == 1:
-        state[z_mem, st_idx] = TMIN
-
-    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_dev(_seed ^ a_mem ^ z_cur, sd) * sdata[0]
-    if int(sdata[1]) == a_idx: a += sdata[2+z_cur]
-    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_dev(_seed ^ b_mem ^ z_cur, sd) * sdata[0]
-    if int(sdata[1]) == b_idx: b += sdata[2+z_cur]
-
-    previous_t = TMIN
-
-    current_t = min(a, b)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if b < a:
-            b_cur += 1
-            b = state[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ b_mem ^ z_val ^ 1, sd) * sdata[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss_dev(_seed ^ b_mem ^ z_val, sd) * sdata[0]
-            if int(sdata[1]) == b_idx:
-                b += sdata[2+(z_val^1)]
-                thresh += sdata[2+z_val]
-            inputs ^= 2
-            next_t = b
-        else:
-            a_cur += 1
-            a = state[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ a_mem ^ z_val ^ 1, sd) * sdata[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss_dev(_seed ^ a_mem ^ z_val, sd) * sdata[0]
-            if int(sdata[1]) == a_idx:
-                a += sdata[2+(z_val^1)]
-                thresh += sdata[2+z_val]
-            inputs ^= 1
-            next_t = a
-
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #     following toggle is earlier OR
-            #     pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    state[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = state[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                if z_cur > 0:
-                    previous_t = state[z_mem + z_cur - 1, st_idx]
-                else:
-                    previous_t = TMIN
-        current_t = min(a, b)
-
-    if overflows > 0:
-        state[z_mem + z_cur, st_idx] = TMAX_OVL
-    else:
-        state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input