From 64e1de396fd3166d82464fe045a406415927460a Mon Sep 17 00:00:00 2001
From: Stefan Holst <mail@s-holst.de>
Date: Mon, 21 Dec 2020 20:44:23 +0900
Subject: [PATCH 1/2] New m-valued logic arrays, documentation, 0.0.2

- MVArray for multi-valued logic
- BPArray for bit-parallel storage layout
- Started documenting with Sphinx
- Migrated simulators to new BPArray
---
 .gitignore                        |   5 +
 UsageExamples.ipynb => Demo.ipynb | 205 +++++++++------
 README.md                         |  56 -----
 README.rst                        |  28 +++
 docs/Makefile                     |  20 ++
 docs/conf.py                      |  64 +++++
 docs/datastructures.rst           |  29 +++
 docs/index.rst                    |  12 +
 docs/parsers.rst                  |  42 ++++
 docs/simulators.rst               |  20 ++
 setup.py                          |   7 +-
 src/kyupy/__init__.py             |  28 ++-
 src/kyupy/bench.py                |  41 ++-
 src/kyupy/bittools.py             |  23 --
 src/kyupy/circuit.py              | 215 +++++++++++-----
 src/kyupy/logic.py                | 402 ++++++++++++++++++++++++++++++
 src/kyupy/logic_sim.py            | 286 ++++-----------------
 src/kyupy/packed_vectors.py       | 299 ----------------------
 src/kyupy/sdf.py                  |  66 +++--
 src/kyupy/stil.py                 | 167 ++++++-------
 src/kyupy/verilog.py              |  37 +--
 src/kyupy/wave_sim.py             | 347 +++++++++++++++++++++++++-
 src/kyupy/wave_sim_cuda.py        | 317 -----------------------
 tests/test_bench.py               |   2 +-
 tests/test_circuit.py             |  45 ++++
 tests/test_logic.py               | 214 ++++++++++++++++
 tests/test_logic_sim.py           | 201 +++++----------
 tests/test_packed_vectors.py      |  88 -------
 tests/test_sdf.py                 |   6 +-
 tests/test_stil.py                |   2 +-
 tests/test_verilog.py             |   3 +-
 tests/test_wave_sim.py            |  46 ++--
 32 files changed, 1847 insertions(+), 1476 deletions(-)
 rename UsageExamples.ipynb => Demo.ipynb (88%)
 delete mode 100644 README.md
 create mode 100644 README.rst
 create mode 100644 docs/Makefile
 create mode 100644 docs/conf.py
 create mode 100644 docs/datastructures.rst
 create mode 100644 docs/index.rst
 create mode 100644 docs/parsers.rst
 create mode 100644 docs/simulators.rst
 delete mode 100644 src/kyupy/bittools.py
 create mode 100644 src/kyupy/logic.py
 delete mode 100644 src/kyupy/packed_vectors.py
 delete mode 100644 src/kyupy/wave_sim_cuda.py
 create mode 100644 tests/test_logic.py
 delete mode 100644 tests/test_packed_vectors.py
diff --git a/.gitignore b/.gitignore
index c563798..1293051 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,8 @@
 **/.pytest_cache
 **/.DS_Store
 **/*.pyc
+docs/_build
+build
+dist
+.idea
+src/kyupy.egg-info
diff --git a/UsageExamples.ipynb b/Demo.ipynb
similarity index 88%
rename from UsageExamples.ipynb
rename to Demo.ipynb
index 0f17115..288f1bd 100644
--- a/UsageExamples.ipynb
+++ b/Demo.ipynb
@@ -22,8 +22,8 @@
    "source": [
     "from kyupy import bench\n",
     "\n",
-    "# parse a file\n",
-    "b01 = bench.parse('tests/b01.bench')\n",
+    "# load a file\n",
+    "b01 = bench.load('tests/b01.bench')\n",
     "\n",
     "# ... or specify the circuit as string \n",
     "mycircuit = bench.parse('input(a,b) output(o1,o2,o3) x=buf(a) o1=not(x) o2=buf(x) o3=buf(x)')"
@@ -44,7 +44,7 @@
     {
      "data": {
       "text/plain": [
-       "<Circuit 'tests/b01' with 92 nodes, 130 lines, 4 ports>"
+       "<Circuit 'tests/b01.bench' with 92 nodes, 130 lines, 4 ports>"
       ]
      },
      "execution_count": 2,
@@ -373,7 +373,7 @@
    "source": [
     "from kyupy import verilog\n",
     "\n",
-    "b14 = verilog.parse('tests/b14.v.gz')\n",
+    "b14 = verilog.load('tests/b14.v.gz')\n",
     "b14"
    ]
   },
@@ -456,7 +456,7 @@
    "source": [
     "from kyupy import verilog\n",
     "\n",
-    "b14 = verilog.parse('tests/b14.v.gz')\n",
+    "b14 = verilog.load('tests/b14.v.gz')\n",
     "b14"
    ]
   },
@@ -567,11 +567,11 @@
    "outputs": [],
    "source": [
     "from kyupy import verilog, stil\n",
+    "from kyupy.logic import MVArray, BPArray\n",
     "from kyupy.logic_sim import LogicSim\n",
-    "from kyupy.packed_vectors import PackedVectors\n",
     "\n",
-    "b14 = verilog.parse('tests/b14.v.gz')\n",
-    "s = stil.parse('tests/b14.stuck.stil.gz')\n",
+    "b14 = verilog.load('tests/b14.v.gz')\n",
+    "s = stil.load('tests/b14.stuck.stil.gz')\n",
     "stuck_tests = s.tests(b14)\n",
     "stuck_responses = s.responses(b14)"
    ]
@@ -580,7 +580,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Tests and responses are instances of `PackedVectors`. Its length is the number of test vectors stored (`nvectors`), its `width` is the number of values in a vector, and its `vdim` is the number of bits used for storing one value. By default, the stil parser returns 4-valued test vectors (`vdim=2`)."
+    "Tests and responses are instances of `MVArray`. Its `length` is the number of test vectors stored, its `width` is the number of values in a vector. By default, the stil parser returns 8-valued test vectors (`m=8`)."
    ]
   },
   {
@@ -591,7 +591,7 @@
     {
      "data": {
       "text/plain": [
-       "<PackedVectors nvectors=1081, width=306, vdim=2>"
+       "<MVArray length=1081 width=306 m=8 nbytes=330786>"
       ]
      },
      "execution_count": 19,
@@ -607,7 +607,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The data is stored in a bit-parallel fashion. This internal storage (an `ndarray` of `uint8`) is accessible via `bits`. The first axis is the width, the second axis is `vdim`, the last axis goes along the test set. This last axis is about `nvectors / 8` in length. "
+    "The internal storage (an `ndarray` of `uint8`) is accessible via `data`. The first axis is the width, and the last axis goes along the test set."
    ]
   },
   {
@@ -618,7 +618,7 @@
     {
      "data": {
       "text/plain": [
-       "(306, 2, 136)"
+       "(306, 1081)"
       ]
      },
      "execution_count": 20,
@@ -627,14 +627,14 @@
     }
    ],
    "source": [
-    "stuck_tests.bits.shape"
+    "stuck_tests.data.shape"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The subscript accessor returns a string representation of the given test vector number. Possible values are '0', '1', '-', and 'X'."
+    "The subscript accessor returns a string representation of the given test vector number. Possible values are '0', '1', '-', 'X', 'R', 'F', 'P', and 'N'."
    ]
   },
   {
@@ -645,7 +645,7 @@
     {
      "data": {
       "text/plain": [
-       "'-0--------------------11011111011001100111010101011101----------------------------------00-10111011010110011101110010111010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010100111010001010010000011100'"
+       "'P0--------------------11011111011001100111010101011101----------------------------------00-10111011010110011101110010111010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010100111010001010010000011100'"
       ]
      },
      "execution_count": 21,
@@ -681,25 +681,80 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The order of values in the vectors correspond to the circuit's interface followed by the scan flip-flops as they appear in `b14.cells`. The test data can be used directly in the simulators as they use the same ordering convention. The following code performs a 4-valued logic simulation and stores the results in a new instance of `PackedVectors`."
+    "The order of values in the vectors corresponds to the circuit's interface followed by the scan flip-flops as they appear in `b14.cells`.\n",
+    "The test data can be used directly in the simulators as they use the same ordering convention.\n",
+    "\n",
+    "The logic simulator uses bit-parallel storage of logic values, but our loaded test data uses one `uint8` per logic value.\n",
+    "To convert the storage layout, we instantiate a `BPArray` for the input stimuli.\n",
+    "The storage layout is more compact, but individual values cannot be easily accessed anymore."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 23,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<BPArray length=1081 width=306 m=8 bytes=124848>"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stuck_tests_bp = BPArray(stuck_tests)\n",
+    "stuck_tests_bp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(306, 3, 136)"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stuck_tests_bp.data.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following code performs an 8-valued logic simulation and stores the results in a new instance of `BPArray`.\n",
+    "The packed array is unpacked into an `MVArray` for value access."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "responses = PackedVectors(len(stuck_tests), stuck_tests.width, 2)\n",
-    "simulator = LogicSim(b14, len(responses), 2)\n",
-    "simulator.assign(stuck_tests)\n",
+    "responses_bp = BPArray((stuck_tests_bp.width, len(stuck_tests_bp)))\n",
+    "simulator = LogicSim(b14, sims=len(stuck_tests_bp))\n",
+    "simulator.assign(stuck_tests_bp)\n",
     "simulator.propagate()\n",
-    "simulator.capture(responses)"
+    "simulator.capture(responses_bp)\n",
+    "responses = MVArray(responses_bp)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -708,7 +763,7 @@
        "'--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101'"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -726,7 +781,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [
     {
@@ -752,39 +807,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Transition faults require test vector pairs for testing. These pairs are generated by `tests8v`, assuming a launch-on-capture scheme (two functional clock cycles after scan-in)."
+    "Transition faults require test vector pairs for testing. These pairs are generated by `tests_loc`, assuming a launch-on-capture scheme (two functional clock cycles after scan-in)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
-    "s = stil.parse('tests/b14.transition.stil.gz')\n",
-    "trans_tests = s.tests8v(b14)\n",
+    "s = stil.load('tests/b14.transition.stil.gz')\n",
+    "trans_tests = s.tests_loc(b14)\n",
     "trans_responses = s.responses(b14)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The returned test data is now 8-valued (`vdim=3`)"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<PackedVectors nvectors=1392, width=306, vdim=3>"
+       "<MVArray length=1392 width=306 m=8 nbytes=425952>"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -802,16 +850,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'-0--------------------RRRRRRFRRRRRRRRRRRFFRFRRRRRRRRRR----------------------------------00-00000001110100011111011010000000000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111R1111111111111111111111110001100100000110100000111010101110RFF00F000F0F00F00000FF01F'"
+       "'00--------------------RRRRRRFRRRRRRRRRRRFFRFRRRRRRRRRR----------------------------------00-00000001110100011111011010000000000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111R1111111111111111111111110001100100000110100000111010101110RFF00F000F0F00F00000FF01F'"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -829,20 +877,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
-    "responses = PackedVectors(len(trans_tests), trans_tests.width, 3)\n",
-    "simulator = LogicSim(b14, len(responses), 3)\n",
-    "simulator.assign(trans_tests)\n",
+    "trans_tests_bp = BPArray(trans_tests)\n",
+    "responses_bp = BPArray((trans_tests_bp.width, len(trans_tests_bp)))\n",
+    "simulator = LogicSim(b14, sims=len(trans_tests_bp))\n",
+    "simulator.assign(trans_tests_bp)\n",
     "simulator.propagate()\n",
-    "simulator.capture(responses)"
+    "simulator.capture(responses_bp)\n",
+    "responses = MVArray(responses_bp)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -851,7 +901,7 @@
        "'--F00000F00F0F000F00FF--------------------------------01110101011100000101100000100110R0--0RRRRRRRNNNRNRPRNNNNNRFFRFRRRRRRR000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000NNNNNNNNNNNNNNNNNNNNNNNNNNNNP0011001000001101000001110101011101RRRRRRRRRRRRRRRRRRRRP01R'"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -869,7 +919,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -907,14 +957,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
     "from kyupy import sdf\n",
     "from kyupy.saed import pin_index\n",
     "\n",
-    "df = sdf.parse('tests/b14.sdf.gz')\n",
+    "df = sdf.load('tests/b14.sdf.gz')\n",
     "lt = df.annotation(b14, pin_index, dataset=0, interconnect=False)"
    ]
   },
@@ -927,7 +977,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [
     {
@@ -936,7 +986,7 @@
        "(46891, 2, 2)"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 35,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -954,7 +1004,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [
     {
@@ -963,7 +1013,7 @@
        "119676"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -991,11 +1041,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from kyupy.wave_sim_cuda import WaveSimCuda, TMAX\n",
+    "from kyupy.wave_sim import WaveSimCuda, TMAX\n",
     "import numpy as np\n",
     "\n",
     "wsim = WaveSimCuda(b14, lt, sims=32, wavecaps=16)"
@@ -1010,7 +1060,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
@@ -1043,23 +1093,23 @@
    "metadata": {},
    "source": [
     "This is a typical simulation loop where the number of patterns is larger than the number of simulators available.\n",
-    "We simulate `trans_tests`.\n",
-    "The timing simulator accepts 4-valued and 8-valued `PackedVectors`, but it will return response (capture) data in a different format."
+    "We simulate `trans_tests_bp`.\n",
+    "The timing simulator accepts 8-valued `BPArray`s, but it will return response (capture) data in a different format."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
-    "nvectors = 128  # len(trans_tests)  # Feel free to simulate all tests if CUDA is set up correctly.\n",
+    "sims = 128  # len(trans_tests_bp)  # Feel free to simulate all tests if CUDA is set up correctly.\n",
     "\n",
-    "cdata = np.zeros((len(wsim.interface), nvectors, 7))  # space to store all capture data\n",
+    "cdata = np.zeros((len(wsim.interface), sims, 7))  # space to store all capture data\n",
     "\n",
-    "for offset in range(0, nvectors, wsim.sims):\n",
-    "    wsim.assign(trans_tests, offset=offset)\n",
-    "    wsim.propagate(sims=nvectors-offset)\n",
+    "for offset in range(0, sims, wsim.sims):\n",
+    "    wsim.assign(trans_tests_bp, offset=offset)\n",
+    "    wsim.propagate(sims=sims-offset)\n",
     "    wsim.capture(time=2.5, cdata=cdata, offset=offset)  # capture at time 2.5"
    ]
   },
@@ -1079,7 +1129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [
     {
@@ -1088,7 +1138,7 @@
        "(306, 128, 7)"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 40,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1106,7 +1156,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
@@ -1139,7 +1189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [
     {
@@ -1148,7 +1198,7 @@
        "2.0610005855560303"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1166,7 +1216,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
@@ -1175,7 +1225,7 @@
        "0.0"
       ]
      },
-     "execution_count": 41,
+     "execution_count": 43,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1193,7 +1243,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [
     {
@@ -1202,7 +1252,7 @@
        "0.0"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 44,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1222,15 +1272,14 @@
     "If there is an error related to `nvvm`, you probably need to set up some environment variables:\n",
     "```\n",
     "%env LD_LIBRARY_PATH=/usr/local/cuda/lib64\n",
-    "%env NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so\n",
-    "%env NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice\n",
+    "%env CUDA_HOME=/usr/local/cuda\n",
     "```\n",
     "If problems persist, refer to documentations for numba and cuda. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
@@ -1252,7 +1301,7 @@
        "True"
       ]
      },
-     "execution_count": 43,
+     "execution_count": 45,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/README.md b/README.md
deleted file mode 100644
index 2f11bf1..0000000
--- a/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-KyuPy - Processing VLSI Circuits With Ease
-==========================================
-
-KyuPy is a python package for high-performance processing and analysis of
-non-hierarchical VLSI designs. Its purpose is to provide a rapid prototyping
-platform to aid and accelerate research in the fields of VLSI test, diagnosis
-and reliability. KyuPy is freely available under the MIT license.
-
-
-Main Features
--------------
-
-* Partial [lark](https://github.com/lark-parser/lark) parsers for common files used with synthesized designs:
-  bench, gate-level verilog, standard delay format (SDF), standard test interface language (STIL)
-* Bit-parallel gate-level 2-, 4-, and 8-valued logic simulation
-* GPU-accelerated high-throughput gate-level timing simulation
-* High-performance through the use of [numpy](https://numpy.org) and [numba](https://numba.pydata.org)
-
-
-Getting Started
----------------
-
-KyuPy requires Python 3.6 or newer.
-Install the latest release by running:
-```commandline
-pip3 install --user kyupy
-```
-For best performance, ensure you have [numba](https://pypi.org/project/numba) installed:
-```commandline
-pip3 install --user numba
-```
-GPU/CUDA support may [require some additional setup](https://numba.pydata.org/numba-doc/latest/cuda/index.html).
-If CUDA or numba is not available, KyuPy will automatically fall back to slow, pure python execution.
-
-The Jupyter Notebook [UsageExamples.ipynb](https://github.com/s-holst/kyupy/blob/main/UsageExamples.ipynb) on GitHub
-contains some useful examples to get familiar with the API.
-
-
-Development
------------
-
-To contribute to KyuPy or simply explore the source code, clone the KyuPy [repository](https://github.com/s-holst/kyupy) on GitHub.
-Within your local checkout, run:
-```commandline
-pip3 install --user -e .
-```
-to make the kyupy package available in your python environment.
-The source code comes with tests that can be run with:
-```
-pytest
-```
-
-KyuPy depends on the following packages:
-* [lark-parser](https://pypi.org/project/lark-parser)
-* [numpy](https://pypi.org/project/numpy)
-* [numba](https://pypi.org/project/numba) (optional, required only for GPU/CUDA support)
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..739e7b8
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,28 @@
+KyuPy - Pythonic Processing of VLSI Circuits
+============================================
+
+KyuPy is a Python package for processing and analysis of non-hierarchical gate-level VLSI designs.
+It contains fundamental building blocks for research software in the fields of VLSI test, diagnosis and reliability:
+
+* Efficient data structures for gate-level circuits and related design data.
+* Partial `lark <https://github.com/lark-parser/lark>`_ parsers for common design files like
+  bench, gate-level verilog, standard delay format (SDF), standard test interface language (STIL).
+* Bit-parallel gate-level 2-, 4-, and 8-valued logic simulation.
+* GPU-accelerated high-throughput gate-level timing simulation.
+* High performance through the use of `numpy <https://numpy.org>`_ and `numba <https://numba.pydata.org>`_.
+
+
+Getting Started
+---------------
+
+KyuPy is available in `PyPI <https://pypi.org/project/kyupy>`_.
+It requires Python 3.6 or newer, `lark-parser <https://pypi.org/project/lark-parser>`_, and `numpy`_.
+Although optional, `numba`_ should be installed for best performance.
+GPU/CUDA support in numba may `require some additional setup <https://numba.pydata.org/numba-doc/latest/cuda/index.html>`_.
+If numba is not available, KyuPy will automatically fall back to slow, pure Python execution.
+
+The Jupyter Notebook `Demo.ipynb <https://github.com/s-holst/kyupy/blob/main/Demo.ipynb>`_ contains some useful examples to get familiar with the API.
+
+To work with the latest pre-release source code, clone the `KyuPy GitHub repository <https://github.com/s-holst/kyupy>`_.
+Run ``pip3 install --user -e .`` within your local checkout to make the package available in your Python environment.
+The source code comes with tests that can be run with ``pytest``.
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..cb2e436
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,64 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+import sphinx_rtd_theme
+sys.path.insert(0, os.path.abspath('../src'))
+
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'KyuPy'
+copyright = '2020, Stefan Holst'
+author = 'Stefan Holst'
+
+# The full version, including alpha/beta/rc tags
+release = '0.0.2'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx_rtd_theme',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+autodoc_default_options = {
+    'member-order': 'bysource',
+}
diff --git a/docs/datastructures.rst b/docs/datastructures.rst
new file mode 100644
index 0000000..026ded9
--- /dev/null
+++ b/docs/datastructures.rst
@@ -0,0 +1,29 @@
+Data Structures
+===============
+
+KyuPy provides two types of core data structures, one for gate-level circuits, and a few others for representing and storing logic data and signal values.
+The data structures are designed to work together nicely with numpy arrays.
+For example, all the nodes and connections in the circuit graph have consecutive integer indices that can be used to access ndarrays with associated data.
+Circuit graphs also define an ordering of inputs, outputs and other nodes to easily process test vector data and the like.
+
+Circuit Graph - :mod:`kyupy.circuit`
+------------------------------------
+
+.. automodule:: kyupy.circuit
+
+.. autoclass:: kyupy.circuit.Node
+   :members:
+
+.. autoclass:: kyupy.circuit.Line
+   :members:
+
+.. autoclass:: kyupy.circuit.Circuit
+   :members:
+
+Multi-Valued Logic - :mod:`kyupy.logic`
+---------------------------------------
+
+.. automodule:: kyupy.logic
+   :members:
+
+
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..3caa343
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,12 @@
+.. include:: ../README.rst
+
+API Reference
+-------------
+
+.. toctree::
+   :maxdepth: 2
+
+   datastructures
+   parsers
+   simulators
+
diff --git a/docs/parsers.rst b/docs/parsers.rst
new file mode 100644
index 0000000..5dac7f8
--- /dev/null
+++ b/docs/parsers.rst
@@ -0,0 +1,42 @@
+Parsers
+=======
+
+KyuPy contains simple (and often incomplete) parsers for common file formats.
+These parsers are tailored to the most common use-cases to keep the grammars and the code-base as simple as possible.
+
+Each of the modules exports a function ``parse()`` for parsing a string directly and a function
+``load()`` for loading a file. Files with a '.gz' extension are uncompressed on-the-fly.
+
+
+Verilog - :mod:`kyupy.verilog`
+------------------------------
+
+.. automodule:: kyupy.verilog
+   :members: parse, load
+
+
+Bench Format - :mod:`kyupy.bench`
+---------------------------------
+
+.. automodule:: kyupy.bench
+   :members: parse, load
+
+
+Standard Test Interface Language - :mod:`kyupy.stil`
+----------------------------------------------------
+
+.. automodule:: kyupy.stil
+   :members: parse, load
+
+.. autoclass:: kyupy.stil.StilFile
+   :members:
+
+
+Standard Delay Format - :mod:`kyupy.sdf`
+----------------------------------------
+
+.. automodule:: kyupy.sdf
+   :members: parse, load
+
+.. autoclass:: kyupy.sdf.DelayFile
+   :members:
diff --git a/docs/simulators.rst b/docs/simulators.rst
new file mode 100644
index 0000000..8d5f6b6
--- /dev/null
+++ b/docs/simulators.rst
@@ -0,0 +1,20 @@
+Simulators
+==========
+
+Logic Simulation - :mod:`kyupy.logic_sim`
+-----------------------------------------
+
+.. autoclass:: kyupy.logic_sim.LogicSim
+   :members:
+
+
+Timing Simulation - :mod:`kyupy.wave_sim`
+-----------------------------------------
+
+.. automodule:: kyupy.wave_sim
+
+.. autoclass:: kyupy.wave_sim.WaveSim
+   :members:
+
+.. autoclass:: kyupy.wave_sim.WaveSimCuda
+   :members:
diff --git a/setup.py b/setup.py
index d56de0f..9a0bb1b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,13 @@
 from setuptools import setup, find_packages
 
-with open('README.md', 'r') as f:
+with open('README.rst', 'r') as f:
     long_description = f.read()
 
 setup(
     name='kyupy',
-    version='0.0.1',
+    version='0.0.2',
     description='High-performance processing and analysis of non-hierarchical VLSI designs',
     long_description=long_description,
-    long_description_content_type='text/markdown',
     packages=find_packages(where='src'),
     package_dir={'': 'src'},
     url='https://github.com/s-holst/kyupy',
@@ -16,7 +15,7 @@ setup(
     author_email='mail@s-holst.de',
     python_requires='>=3.6',
     install_requires=[
-        'numpy>=1.15.0',
+        'numpy>=1.17.0',
         'lark-parser>=0.8.0'
     ],
     extras_requires={
diff --git a/src/kyupy/__init__.py b/src/kyupy/__init__.py
index b87bade..d1bb8db 100644
--- a/src/kyupy/__init__.py
+++ b/src/kyupy/__init__.py
@@ -1,10 +1,13 @@
-"""This package provides tools for high-performance processing and validation
-of non-hierarchical VLSI circuits to aid rapid prototyping of research code
-in the fields of VLSI test, diagnosis and reliability.
+"""A package for processing and analysis of non-hierarchical gate-level VLSI designs.
+
+It contains fundamental building blocks for research software in the fields of VLSI test, diagnosis and reliability.
 """
 
 import time
 import importlib.util
+import gzip
+
+import numpy as np
 
 
 class Log:
@@ -97,12 +100,27 @@ if importlib.util.find_spec('numba') is not None:
         list(numba.cuda.gpus)
         from numba import cuda
     except CudaSupportError:
-        log.warn('Cuda unavailable. Falling back to pure python')
+        log.warn('Cuda unavailable. Falling back to pure Python.')
         cuda = MockCuda()
 else:
     numba = MockNumba()
     cuda = MockCuda()
-    log.warn('Numba unavailable. Falling back to pure python')
+    log.warn('Numba unavailable. Falling back to pure Python.')
+
+
+_pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
+
 
+def popcount(a):
+    return np.sum(_pop_count_lut[a])
 
 
+def readtext(file):
+    if hasattr(file, 'read'):
+        return file.read()
+    if str(file).endswith('.gz'):
+        with gzip.open(file, 'rt') as f:
+            return f.read()
+    else:
+        with open(file, 'rt') as f:
+            return f.read()
diff --git a/src/kyupy/bench.py b/src/kyupy/bench.py
index cf0662d..7ec1e1e 100644
--- a/src/kyupy/bench.py
+++ b/src/kyupy/bench.py
@@ -1,5 +1,16 @@
+"""A parser for the ISCAS89 benchmark format.
+
+The ISCAS89 benchmark format (`.bench`-suffix) is a very simple textual description of gate-level netlists.
+Historically it was first used in the
+`ISCAS89 benchmark set <https://people.engr.ncsu.edu/brglez/CBL/benchmarks/ISCAS89/>`_.
+Besides loading these benchmarks, this module is also useful for easily constructing simple circuits:
+``c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')``.
+"""
+
 from lark import Lark, Transformer
+
 from .circuit import Circuit, Node, Line
+from . import readtext
 
 
 class BenchTransformer(Transformer):
@@ -19,10 +30,9 @@ class BenchTransformer(Transformer):
         cell = Node(self.c, str(name), str(cell_type))
         Line(self.c, cell, self.c.get_or_add_fork(str(name)))
         [Line(self.c, d, cell) for d in drivers]
-        
 
-def parse(bench):
-    grammar = r"""
+
+grammar = r"""
     start: (statement)*
     statement: input | output | assignment
     input: ("INPUT" | "input") parameters -> interface
@@ -32,12 +42,23 @@ def parse(bench):
     NAME: /[-_a-z0-9]+/i
     %ignore ( /\r?\n/ | "#" /[^\n]*/ | /[\t\f ]/ )+
     """
-    name = None
-    if '(' not in str(bench):  # No parentheses?: Assuming it is a file name.
-        name = str(bench).replace('.bench', '')
-        with open(bench, 'r') as f:
-            text = f.read()
-    else:
-        text = bench
+
+
+def parse(text, name=None):
+    """Parses the given ``text`` as ISCAS89 bench code.
+
+    :param text: A string with bench code.
+    :param name: The name of the circuit. Circuit names are not included in bench descriptions.
+    :return: A :class:`Circuit` object.
+    """
     return Lark(grammar, parser="lalr", transformer=BenchTransformer(name)).parse(text)
 
+
+def load(file, name=None):
+    """Parses the contents of ``file`` as ISCAS89 bench code.
+
+    :param file: The file to be loaded.
+    :param name: The name of the circuit. If none given, the file name is used as circuit name.
+    :return: A :class:`Circuit` object.
+    """
+    return parse(readtext(file), name=name or str(file))
diff --git a/src/kyupy/bittools.py b/src/kyupy/bittools.py
deleted file mode 100644
index df4c033..0000000
--- a/src/kyupy/bittools.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import numpy as np
-import importlib.util
-if importlib.util.find_spec('numba') is not None:
-    import numba
-else:
-    from . import numba
-    print('Numba unavailable. Falling back to pure python')
-
-
-_pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
-
-
-def popcount(a):
-    return np.sum(_pop_count_lut[a])
-
-
-_bit_in_lut = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
-
-
-@numba.njit
-def bit_in(a, pos):
-    return a[pos >> 3] & _bit_in_lut[pos & 7]
-
diff --git a/src/kyupy/circuit.py b/src/kyupy/circuit.py
index bad9a67..84cc96c 100644
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@@ -1,3 +1,10 @@
+"""Data structures for representing non-hierarchical gate-level circuits.
+
+The class :class:`Circuit` is a container of nodes connected by lines.
+A node is an instance of class :class:`Node`,
+and a line is an instance of class :class:`Line`.
+"""
+
 from collections import deque
 
 
@@ -7,6 +14,9 @@ class GrowingList(list):
             self.extend([None] * (index + 1 - len(self)))
         super().__setitem__(index, value)
 
+    def free_index(self):
+        return next((i for i, x in enumerate(self) if x is None), len(self))
+
 
 class IndexList(list):
     def __delitem__(self, index):
@@ -19,36 +29,51 @@ class IndexList(list):
 
 
 class Node:
-    """A Node is a named entity in a circuit (e.g. a gate, a standard cell,
-    a named signal, or a fan-out point) that has connections to other nodes.
-    Each node contains:
-
-    * `self.index`: a circuit-unique integer index.
-    * `self.kind`: a type describing its function (e.g. 'AND', 'NOR').
-      The type '__fork__' is special. It signifies a named signal
-      or a fan-out in the circuit. Any other type is considered a physical cell.
-    * `self.name`: a name. Names must be unique among all forks and all cells
-      in the circuit. However, a fork (`self.kind=='__fork__'`) and a cell with
-      the same name may coexist.
-    * `self.ins`: a list of input connections (objects of class `Line`)
-    * `self.outs`: a list of output connections (objects of class `Line`).
+    """A node is a named entity in a circuit (e.g. a gate, a standard cell,
+    a named signal, or a fan-out point) that is connected to other nodes via lines.
+
+    The constructor automatically adds the new node to the given circuit.
     """
     def __init__(self, circuit, name, kind='__fork__'):
         if kind == '__fork__':
-            if name in circuit.forks:
-                raise ValueError(f'fork of name {name} already exists.')
+            assert name not in circuit.forks, f'fork of name {name} already in circuit.'
             circuit.forks[name] = self
         else:
-            if name in circuit.cells:
-                raise ValueError(f'cell of name {name} already exists.')
+            assert name not in circuit.cells, f'cell of name {name} already in circuit.'
             circuit.cells[name] = self
-        self.index = len(circuit.nodes)
         circuit.nodes.append(self)
         self.circuit = circuit
+        """The :class:`Circuit` object the node is part of.
+        """
         self.name = name
+        """The name of the node.
+
+        Names must be unique among all forks and all cells in the circuit.
+        However, a fork (:py:attr:`kind` is set to '__fork__') and a cell with the same name may coexist.
+        """
         self.kind = kind
+        """A string describing the type of the node.
+        
+        Common types are the names from a standard cell library or general gate names like 'AND' or 'NOR'.
+        If :py:attr:`kind` is set to '__fork__', it receives special treatment.
+        A `fork` describes a named signal or a fan-out point in the circuit and not a physical `cell` like a gate.
+        In the circuit, the namespaces of forks and cells are kept separate.
+        While :py:attr:`name` must be unique among all forks and all cells, a fork can have the same name as a cell.
+        The :py:attr:`index`, however, is unique among all nodes; a fork cannot have the same index as a cell.
+        """
+        self.index = len(circuit.nodes) - 1
+        """A unique and consecutive integer index of the node within the circuit.
+
+        It can be used to store additional data about the node :code:`n`
+        by allocating an array or list :code:`my_data` of length :code:`len(n.circuit.nodes)` and
+        accessing it by :code:`my_data[n.index]`.
+        """
         self.ins = GrowingList()
+        """A list of input connections (:class:`Line` objects).
+        """
         self.outs = GrowingList()
+        """A list of output connections (:class:`Line` objects).
+        """
 
     def __repr__(self):
         ins = ' '.join([f'<{line.index}' if line is not None else '<None' for line in self.ins])
@@ -56,6 +81,13 @@ class Node:
         return f'{self.index}:{self.kind}"{self.name}" {ins} {outs}'
 
     def remove(self):
+        """Removes the node from its circuit.
+
+        Lines may still reference the removed node.
+        The user must connect such lines to other nodes or remove the lines from the circuit.
+        To keep the indices consecutive, the node with the highest index within the circuit
+        will be assigned the index of the removed node.
+        """
         if self.circuit is not None:
             del self.circuit.nodes[self.index]
             if self.kind == '__fork__':
@@ -66,56 +98,67 @@ class Node:
 
 
 class Line:
-    """A Line is a directional 1:1 connection between two Nodes. It always
-    connects an output of a node (called `driver`) to an input of a node
-    (called `reader`) and has a circuit-unique index (`self.index`).
+    """A line is a directional 1:1 connection between two nodes.
 
-    Furthermore, `self.driver_pin` and `self.reader_pin` are the
-    integer indices of the connected pins of the nodes. They always correspond
-    to the positions of the line in the connection lists of the nodes:
+    It always connects an output of one `driver` node to an input of one `reader` node.
+    If a signal fans out to multiple readers, a '__fork__' node needs to be added.
 
-    * `self.driver.outs[self.driver_pin] == self`
-    * `self.reader.ins[self.reader_pin] == self`
+    The constructor automatically adds the new line to the given circuit and inserts references into the connection
+    lists of connected nodes.
 
-    A Line always connects a single driver to a single reader. If a signal fans out to
-    multiple readers, a '__fork__' Node needs to be added.
+    When adding a line, input and output pins can either be specified explicitly
+    :code:`Line(circuit, (driver, 2), (reader, 0))`, or implicitly :code:`Line(circuit, driver, reader)`.
+    In the implicit case, the line will be connected to the first free pin of the node.
+    Use the explicit case only if connections to specific pins are required.
+    The explicit form may overwrite any previous line references in the connection lists of the nodes.
     """
     def __init__(self, circuit, driver, reader):
-        self.index = len(circuit.lines)
-        circuit.lines.append(self)
-        if type(driver) is Node:
-            self.driver = driver
-            self.driver_pin = len(driver.outs)
-            for pin, line in enumerate(driver.outs):
-                if line is None:
-                    self.driver_pin = pin
-                    break
-        else:
-            self.driver, self.driver_pin = driver
-        if type(reader) is Node:
-            self.reader = reader
-            self.reader_pin = len(reader.ins)
-            for pin, line in enumerate(reader.ins):
-                if line is None:
-                    self.reader_pin = pin
-                    break
-        else:
-            self.reader, self.reader_pin = reader
+        self.circuit = circuit
+        """The :class:`Circuit` object the line is part of.
+        """
+        self.circuit.lines.append(self)
+        self.index = len(self.circuit.lines) - 1
+        """A unique and consecutive integer index of the line within the circuit.
+
+        It can be used to store additional data about the line :code:`l`
+        by allocating an array or list :code:`my_data` of length :code:`len(l.circuit.lines)` and
+        accessing it by :code:`my_data[l.index]`.
+        """
+        if not isinstance(driver, tuple): driver = (driver, driver.outs.free_index())
+        self.driver = driver[0]
+        """The :class:`Node` object that drives this line.
+        """
+        self.driver_pin = driver[1]
+        """The output pin position of the driver node this line is connected to.
+        
+        This is the position in the outs-list of the driver node from which this line is referenced:
+        :code:`self.driver.outs[self.driver_pin] == self`.
+        """
+        if not isinstance(reader, tuple): reader = (reader, reader.ins.free_index())
+        self.reader = reader[0]
+        """The :class:`Node` object that reads this line.
+        """
+        self.reader_pin = reader[1]
+        """The input pin position of the reader node this line is connected to.
+
+        This is the position in the ins-list of the reader node from which this line is referenced:
+        :code:`self.reader.ins[self.reader_pin] == self`.
+        """
         self.driver.outs[self.driver_pin] = self
         self.reader.ins[self.reader_pin] = self
 
     def remove(self):
-        circuit = None
-        if self.driver is not None:
-            self.driver.outs[self.driver_pin] = None
-            circuit = self.driver.circuit
-        if self.reader is not None:
-            self.reader.ins[self.reader_pin] = None
-            circuit = self.reader.circuit
-        if circuit is not None:
-            del circuit.lines[self.index]
+        """Removes the line from its circuit and its referencing nodes.
+
+        To keep the indices consecutive, the line with the highest index within the circuit
+        will be assigned the index of the removed line.
+        """
+        if self.driver is not None: self.driver.outs[self.driver_pin] = None
+        if self.reader is not None: self.reader.ins[self.reader_pin] = None
+        if self.circuit is not None: del self.circuit.lines[self.index]
         self.driver = None
         self.reader = None
+        self.circuit = None
 
     def __repr__(self):
         return f'{self.index}'
@@ -127,27 +170,53 @@ class Line:
 class Circuit:
     """A Circuit is a container for interconnected nodes and lines.
 
-    All contained lines have unique indices, so have all contained nodes.
-    These indices can be used to store additional data about nodes or lines
-    by allocating an array `my_data` of length `len(self.nodes)` and then
-    accessing it by `my_data[n.index]`. The indices may change iff lines or
-    nodes are removed from the circuit.
+    It provides access to lines by index and to nodes by index and by name.
+    Nodes come in two flavors: `cells` and `forks` (see :py:attr:`Node.kind`).
+    The name spaces of cells and forks are kept separate.
 
-    Nodes come in two flavors (cells and forks, see `Node`). The names of
-    these nodes are kept unique within these two flavors.
+    The indices of nodes and lines are kept consecutive and unique.
+    Whenever lines or nodes are removed from the circuit, the indices of some other lines or nodes may change
+    to enforce consecutiveness.
+
+    A subset of nodes can be designated as primary input- or output-ports of the circuit.
+    This is done by adding them to the :py:attr:`interface` list.
     """
     def __init__(self, name=None):
         self.name = name
+        """The name of the circuit.
+        """
         self.nodes = IndexList()
+        """A list of all :class:`Node` objects contained in the circuit.
+        
+        The position of a node in this list equals its index :code:`self.nodes[42].index == 42`.
+        """
         self.lines = IndexList()
+        """A list of all :class:`Line` objects contained in the circuit.
+        
+        The position of a line in this list equals its index :code:`self.lines[42].index == 42`.
+        """
         self.interface = GrowingList()
+        """A list of nodes that are designated as primary input- or output-ports.
+        
+        Port-nodes are contained in :py:attr:`nodes` as well as :py:attr:`interface`.
+        The position of a node in the interface list corresponds to positions of logic values in test vectors.
+        The port direction is not stored explicitly.
+        Usually, nodes in the interface list without any lines in their :py:attr:`Node.ins` list are primary inputs,
+        and nodes without any lines in their :py:attr:`Node.outs` list are regarded as primary outputs.
+        """
         self.cells = {}
+        """A dictionary to access cells by name.
+        """
         self.forks = {}
+        """A dictionary to access forks by name.
+        """
 
     def get_or_add_fork(self, name):
         return self.forks[name] if name in self.forks else Node(self, name)
     
     def copy(self):
+        """Returns a deep copy of the circuit.
+        """
         c = Circuit(self.name)
         for node in self.nodes:
             Node(c, node.name, node.kind)
@@ -164,6 +233,8 @@ class Circuit:
         return c
     
     def dump(self):
+        """Returns a string representation of the circuit and all its nodes.
+        """
         header = f'{self.name}({",".join([str(n.index) for n in self.interface])})\n'
         return header + '\n'.join([str(n) for n in self.nodes])
 
@@ -172,6 +243,11 @@ class Circuit:
         return f'<Circuit{name} with {len(self.nodes)} nodes, {len(self.lines)} lines, {len(self.interface)} ports>'
 
     def topological_order(self):
+        """Generator function to iterate over all nodes in topological order.
+
+        Nodes without input lines and nodes whose :py:attr:`Node.kind` contains the substring 'DFF' are
+        yielded first.
+        """
         visit_count = [0] * len(self.nodes)
         queue = deque(n for n in self.nodes if len(n.ins) == 0 or 'DFF' in n.kind)
         while len(queue) > 0:
@@ -185,12 +261,19 @@ class Circuit:
             yield n
 
     def topological_line_order(self):
+        """Generator function to iterate over all lines in topological order.
+        """
         for n in self.topological_order():
             for line in n.outs:
                 if line is not None:
                     yield line
 
     def reversed_topological_order(self):
+        """Generator function to iterate over all nodes in reversed topological order.
+
+        Nodes without output lines and nodes whose :py:attr:`Node.kind` contains the substring 'DFF' are
+        yielded first.
+        """
         visit_count = [0] * len(self.nodes)
         queue = deque(n for n in self.nodes if len(n.outs) == 0 or 'DFF' in n.kind)
         while len(queue) > 0:
@@ -203,6 +286,10 @@ class Circuit:
             yield n
 
     def fanin(self, origin_nodes):
+        """Generator function to iterate over the fan-in cone of a given list of origin nodes.
+
+        Nodes are yielded in reversed topological order.
+        """
         marks = [False] * len(self.nodes)
         for n in origin_nodes:
             marks[n.index] = True
diff --git a/src/kyupy/logic.py b/src/kyupy/logic.py
new file mode 100644
index 0000000..d30fd55
--- /dev/null
+++ b/src/kyupy/logic.py
@@ -0,0 +1,402 @@
+"""This module contains definitions and data structures for 2-, 4-, and 8-valued logic operations.
+
+8 logic values are defined as integer constants.
+
+* For 2-valued logic: ``ZERO`` and ``ONE``
+* 4-valued logic adds: ``UNASSIGNED`` and ``UNKNOWN``
+* 8-valued logic adds: ``RISE``, ``FALL``, ``PPULSE``, and ``NPULSE``.
+
+The bits in these constants have the following meaning:
+
+  * bit 0: Final/settled binary value of a signal
+  * bit 1: Initial binary value of a signal
+  * bit 2: Activity or transitions are present on a signal
+
+Special meaning is given to values where bits 0 and 1 differ, but bit 2 (activity) is 0.
+These values are interpreted as ``UNKNOWN`` or ``UNASSIGNED`` in 4-valued and 8-valued logic.
+
+In general, 2-valued logic only considers bit 0, 4-valued logic considers bits 0 and 1, and 8-valued logic
+considers all 3 bits.
+The only exception is constant ``ONE=0b11`` which has two bits set for all logics including 2-valued logic.
+"""
+
+import math
+from collections.abc import Iterable
+
+import numpy as np
+
+from . import numba
+
+
+ZERO = 0b000
+"""Integer constant ``0b000`` for logic-0. ``'0'``, ``0``, ``False``, ``'L'``, and ``'l'`` are interpreted as ``ZERO``.
+"""
+UNKNOWN = 0b001
+"""Integer constant ``0b001`` for unknown or conflict. ``'X'`` or any unrecognized value is interpreted as ``UNKNOWN``.
+"""
+UNASSIGNED = 0b010
+"""Integer constant ``0b010`` for unassigned or high-impedance. ``'-'``, ``None``, ``'Z'``, and ``'z'`` are
+interpreted as ``UNASSIGNED``.
+"""
+ONE = 0b011
+"""Integer constant ``0b011`` for logic-1. ``'1'``, ``1``, ``True``, ``'H'``, and ``'h'`` are interpreted as ``ONE``.
+"""
+PPULSE = 0b100
+"""Integer constant ``0b100`` for positive pulse, meaning initial and final values are 0, but there is some activity
+on a signal. ``'P'``, ``'p'``, and ``'^'`` are interpreted as ``PPULSE``.
+"""
+RISE = 0b101
+"""Integer constant ``0b101`` for a rising transition. ``'R'``, ``'r'``, and ``'/'`` are interpreted as ``RISE``.
+"""
+FALL = 0b110
+"""Integer constant ``0b110`` for a falling transition. ``'F'``, ``'f'``, and ``'\\'`` are interpreted as ``FALL``.
+"""
+NPULSE = 0b111
+"""Integer constant ``0b111`` for negative pulse, meaning initial and final values are 1, but there is some activity
+on a signal. ``'N'``, ``'n'``, and ``'v'`` are interpreted as ``NPULSE``.
+"""
+
+
+def interpret(value):
+    if isinstance(value, Iterable) and not (isinstance(value, str) and len(value) == 1):
+        return list(map(interpret, value))
+    if value in [0, '0', False, 'L', 'l']:
+        return ZERO
+    if value in [1, '1', True, 'H', 'h']:
+        return ONE
+    if value in [None, '-', 'Z', 'z']:
+        return UNASSIGNED
+    if value in ['R', 'r', '/']:
+        return RISE
+    if value in ['F', 'f', '\\']:
+        return FALL
+    if value in ['P', 'p', '^']:
+        return PPULSE
+    if value in ['N', 'n', 'v']:
+        return NPULSE
+    return UNKNOWN
+
+
+_bit_in_lut = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
+
+
+@numba.njit
+def bit_in(a, pos):
+    return a[pos >> 3] & _bit_in_lut[pos & 7]
+
+
+def mv_cast(*args, m=8):
+    return [a if isinstance(a, MVArray) else MVArray(a, m=m) for a in args]
+
+
+def mv_getm(*args):
+    return max([a.m for a in args if isinstance(a, MVArray)] + [0]) or 8
+
+
+def _mv_not(m, out, inp):
+    np.bitwise_xor(inp, 0b11, out=out)  # this also exchanges UNASSIGNED <-> UNKNOWN
+    if m > 2:
+        np.putmask(out, (inp == UNKNOWN), UNKNOWN)  # restore UNKNOWN
+
+
+def mv_not(x1, out=None):
+    m = mv_getm(x1)
+    x1 = mv_cast(x1, m=m)[0]
+    out = out or MVArray(x1.data.shape, m=m)
+    _mv_not(m, out.data, x1.data)
+    return out
+
+
+def _mv_or(m, out, *ins):
+    if m > 2:
+        any_unknown = (ins[0] == UNKNOWN) | (ins[0] == UNASSIGNED)
+        for inp in ins[1:]: any_unknown |= (inp == UNKNOWN) | (inp == UNASSIGNED)
+        any_one = (ins[0] == ONE)
+        for inp in ins[1:]: any_one |= (inp == ONE)
+
+        out[...] = ZERO
+        np.putmask(out, any_one, ONE)
+        for inp in ins:
+            np.bitwise_or(out, inp, out=out, where=~any_one)
+        np.putmask(out, (any_unknown & ~any_one), UNKNOWN)
+    else:
+        out[...] = ZERO
+        for inp in ins: np.bitwise_or(out, inp, out=out)
+
+
+def mv_or(x1, x2, out=None):
+    m = mv_getm(x1, x2)
+    x1, x2 = mv_cast(x1, x2, m=m)
+    out = out or MVArray(np.broadcast(x1.data, x2.data).shape, m=m)
+    _mv_or(m, out.data, x1.data, x2.data)
+    return out
+
+
+def _mv_and(m, out, *ins):
+    if m > 2:
+        any_unknown = (ins[0] == UNKNOWN) | (ins[0] == UNASSIGNED)
+        for inp in ins[1:]: any_unknown |= (inp == UNKNOWN) | (inp == UNASSIGNED)
+        any_zero = (ins[0] == ZERO)
+        for inp in ins[1:]: any_zero |= (inp == ZERO)
+
+        out[...] = ONE
+        np.putmask(out, any_zero, ZERO)
+        for inp in ins:
+            np.bitwise_and(out, inp | 0b100, out=out, where=~any_zero)
+            if m > 4: np.bitwise_or(out, inp & 0b100, out=out, where=~any_zero)
+        np.putmask(out, (any_unknown & ~any_zero), UNKNOWN)
+    else:
+        out[...] = ONE
+        for inp in ins: np.bitwise_and(out, inp, out=out)
+
+
+def mv_and(x1, x2, out=None):
+    m = mv_getm(x1, x2)
+    x1, x2 = mv_cast(x1, x2, m=m)
+    out = out or MVArray(np.broadcast(x1.data, x2.data).shape, m=m)
+    _mv_and(m, out.data, x1.data, x2.data)
+    return out
+
+
+def _mv_xor(m, out, *ins):
+    if m > 2:
+        any_unknown = (ins[0] == UNKNOWN) | (ins[0] == UNASSIGNED)
+        for inp in ins[1:]: any_unknown |= (inp == UNKNOWN) | (inp == UNASSIGNED)
+
+        out[...] = ZERO
+        for inp in ins:
+            np.bitwise_xor(out, inp & 0b011, out=out)
+            if m > 4: np.bitwise_or(out, inp & 0b100, out=out)
+        np.putmask(out, any_unknown, UNKNOWN)
+    else:
+        out[...] = ZERO
+        for inp in ins: np.bitwise_xor(out, inp, out=out)
+
+
+def mv_xor(x1, x2, out=None):
+    m = mv_getm(x1, x2)
+    x1, x2 = mv_cast(x1, x2, m=m)
+    out = out or MVArray(np.broadcast(x1.data, x2.data).shape, m=m)
+    _mv_xor(m, out.data, x1.data, x2.data)
+    return out
+
+
+def mv_transition(init, final, out=None):
+    m = mv_getm(init, final)
+    init, final = mv_cast(init, final, m=m)
+    init = init.data
+    final = final.data
+    out = out or MVArray(np.broadcast(init, final).shape, m=8)
+    out.data[...] = (init & 0b010) | (final & 0b001)
+    out.data[...] |= ((out.data << 1) ^ (out.data << 2)) & 0b100
+    unknown = (init == UNKNOWN) | (init == UNASSIGNED) | (final == UNKNOWN) | (final == UNASSIGNED)
+    unassigned = (init == UNASSIGNED) & (final == UNASSIGNED)
+    np.putmask(out.data, unknown, UNKNOWN)
+    np.putmask(out.data, unassigned, UNASSIGNED)
+    return out
+
+
+class MVArray:
+    """An n-dimensional array of m-valued logic values.
+
+    This class wraps a numpy.ndarray of type uint8 and adds support for encoding and
+    interpreting 2-valued, 4-valued, and 8-valued logic values.
+    Each logic value is stored as a uint8, so value manipulations are cheaper than in BPArray.
+
+    An MVArray always has 2 axes:
+
+    * Axis 0 is PI/PO/FF position, the length of this axis is called "width".
+    * Axis 1 is vector/pattern, the length of this axis is called "length".
+
+    """
+
+    def __init__(self, a, m=None):
+        self.m = m or 8
+        assert self.m in [2, 4, 8]
+
+        # Try our best to interpret given a.
+        if isinstance(a, MVArray):
+            self.data = a.data.copy()
+            self.m = m or a.m
+        elif hasattr(a, 'data'):  # assume it is a BPArray. Can't use isinstance() because BPArray isn't declared yet.
+            self.data = np.zeros((a.width, a.length), dtype=np.uint8)
+            self.m = m or a.m
+            for i in range(a.data.shape[-2]):
+                self.data[...] <<= 1
+                self.data[...] |= np.unpackbits(a.data[..., -i-1, :], axis=1)[:, :a.length]
+            if a.data.shape[-2] == 1:
+                self.data *= 3
+        elif isinstance(a, int):
+            self.data = np.full((a, 1), UNASSIGNED, dtype=np.uint8)
+        elif isinstance(a, tuple):
+            self.data = np.full(a, UNASSIGNED, dtype=np.uint8)
+        else:
+            if isinstance(a, str): a = [a]
+            self.data = np.asarray(interpret(a), dtype=np.uint8)
+            self.data = self.data[:, np.newaxis] if self.data.ndim == 1 else np.moveaxis(self.data, -2, -1)
+
+        # Cast data to m-valued logic.
+        if self.m == 2:
+            self.data[...] = ((self.data & 0b001) & ((self.data >> 1) & 0b001) | (self.data == RISE)) * ONE
+        elif self.m == 4:
+            self.data[...] = (self.data & 0b011) & ((self.data != FALL) * ONE) | ((self.data == RISE) * ONE)
+        elif self.m == 8:
+            self.data[...] = self.data & 0b111
+
+        self.length = self.data.shape[-1]
+        self.width = self.data.shape[-2]
+
+    def __repr__(self):
+        return f'<MVArray length={self.length} width={self.width} m={self.m} nbytes={self.data.nbytes}>'
+
+    def __str__(self):
+        return str([self[idx] for idx in range(self.length)])
+
+    def __getitem__(self, vector_idx):
+        chars = ["0", "X", "-", "1", "P", "R", "F", "N"]
+        return ''.join(chars[v] for v in self.data[:, vector_idx])
+
+    def __len__(self):
+        return self.length
+
+
+def bp_buf(out, inp):
+    md = out.shape[-2]
+    assert md == inp.shape[-2]
+    if md > 1:
+        unknown = inp[..., 0, :] ^ inp[..., 1, :]
+        if md > 2: unknown &= ~inp[..., 2, :]
+        out[..., 0, :] = inp[..., 0, :] | unknown
+        out[..., 1, :] = inp[..., 1, :] & ~unknown
+        if md > 2: out[..., 2, :] = inp[..., 2, :] & ~unknown
+    else:
+        out[..., 0, :] = inp[..., 0, :]
+
+
+def bp_not(out, inp):
+    md = out.shape[-2]
+    assert md == inp.shape[-2]
+    if md > 1:
+        unknown = inp[..., 0, :] ^ inp[..., 1, :]
+        if md > 2: unknown &= ~inp[..., 2, :]
+        out[..., 0, :] = ~inp[..., 0, :] | unknown
+        out[..., 1, :] = ~inp[..., 1, :] & ~unknown
+        if md > 2: out[..., 2, :] = inp[..., 2, :] & ~unknown
+    else:
+        out[..., 0, :] = ~inp[..., 0, :]
+
+
+def bp_or(out, *ins):
+    md = out.shape[-2]
+    for inp in ins: assert md == inp.shape[-2]
+    out[...] = 0
+    if md == 1:
+        for inp in ins: out[..., 0, :] |= inp[..., 0, :]
+    elif md == 2:
+        any_unknown = ins[0][..., 0, :] ^ ins[0][..., 1, :]
+        for inp in ins[1:]: any_unknown |= inp[..., 0, :] ^ inp[..., 1, :]
+        any_one = ins[0][..., 0, :] & ins[0][..., 1, :]
+        for inp in ins[1:]: any_one |= inp[..., 0, :] & inp[..., 1, :]
+        for inp in ins:
+            out[..., 0, :] |= inp[..., 0, :] | any_unknown
+            out[..., 1, :] |= inp[..., 1, :] & (~any_unknown | any_one)
+    else:
+        any_unknown = (ins[0][..., 0, :] ^ ins[0][..., 1, :]) & ~ins[0][..., 2, :]
+        for inp in ins[1:]: any_unknown |= (inp[..., 0, :] ^ inp[..., 1, :]) & ~inp[..., 2, :]
+        any_one = ins[0][..., 0, :] & ins[0][..., 1, :] & ~ins[0][..., 2, :]
+        for inp in ins[1:]: any_one |= inp[..., 0, :] & inp[..., 1, :] & ~inp[..., 2, :]
+        for inp in ins:
+            out[..., 0, :] |= inp[..., 0, :] | any_unknown
+            out[..., 1, :] |= inp[..., 1, :] & (~any_unknown | any_one)
+            out[..., 2, :] |= inp[..., 2, :] & (~any_unknown | any_one) & ~any_one
+
+
+def bp_and(out, *ins):
+    md = out.shape[-2]
+    for inp in ins: assert md == inp.shape[-2]
+    out[...] = 0xff
+    if md == 1:
+        for inp in ins: out[..., 0, :] &= inp[..., 0, :]
+    elif md == 2:
+        any_unknown = ins[0][..., 0, :] ^ ins[0][..., 1, :]
+        for inp in ins[1:]: any_unknown |= inp[..., 0, :] ^ inp[..., 1, :]
+        any_zero = ~ins[0][..., 0, :] & ~ins[0][..., 1, :]
+        for inp in ins[1:]: any_zero |= ~inp[..., 0, :] & ~inp[..., 1, :]
+        for inp in ins:
+            out[..., 0, :] &= inp[..., 0, :] | (any_unknown & ~any_zero)
+            out[..., 1, :] &= inp[..., 1, :] & ~any_unknown
+    else:
+        any_unknown = (ins[0][..., 0, :] ^ ins[0][..., 1, :]) & ~ins[0][..., 2, :]
+        for inp in ins[1:]: any_unknown |= (inp[..., 0, :] ^ inp[..., 1, :]) & ~inp[..., 2, :]
+        any_zero = ~ins[0][..., 0, :] & ~ins[0][..., 1, :] & ~ins[0][..., 2, :]
+        for inp in ins[1:]: any_zero |= ~inp[..., 0, :] & ~inp[..., 1, :] & ~inp[..., 2, :]
+        out[..., 2, :] = 0
+        for inp in ins:
+            out[..., 0, :] &= inp[..., 0, :] | (any_unknown & ~any_zero)
+            out[..., 1, :] &= inp[..., 1, :] & ~any_unknown
+            out[..., 2, :] |= inp[..., 2, :] & (~any_unknown | any_zero) & ~any_zero
+
+
+def bp_xor(out, *ins):
+    md = out.shape[-2]
+    for inp in ins: assert md == inp.shape[-2]
+    out[...] = 0
+    if md == 1:
+        for inp in ins: out[..., 0, :] ^= inp[..., 0, :]
+    elif md == 2:
+        any_unknown = ins[0][..., 0, :] ^ ins[0][..., 1, :]
+        for inp in ins[1:]: any_unknown |= inp[..., 0, :] ^ inp[..., 1, :]
+        for inp in ins: out[...] ^= inp
+        out[..., 0, :] |= any_unknown
+        out[..., 1, :] &= ~any_unknown
+    else:
+        any_unknown = (ins[0][..., 0, :] ^ ins[0][..., 1, :]) & ~ins[0][..., 2, :]
+        for inp in ins[1:]: any_unknown |= (inp[..., 0, :] ^ inp[..., 1, :]) & ~inp[..., 2, :]
+        for inp in ins:
+            out[..., 0, :] ^= inp[..., 0, :]
+            out[..., 1, :] ^= inp[..., 1, :]
+            out[..., 2, :] |= inp[..., 2, :]
+        out[..., 0, :] |= any_unknown
+        out[..., 1, :] &= ~any_unknown
+        out[..., 2, :] &= ~any_unknown
+
+
+class BPArray:
+    """An n-dimensional array of m-valued logic values that uses bit-parallel storage.
+
+    The primary use of this format is in aiding efficient bit-parallel logic simulation.
+    The secondary benefit over MVArray is its memory efficiency.
+    Accessing individual values is more expensive than with :py:class:`MVArray`.
+    It is advised to first construct an MVArray, pack it into a :py:class:`BPArray` for simulation and unpack the results
+    back into a :py:class:`MVArray` for value access.
+
+    The values along the last axis (vectors/patterns) are packed into uint8 words.
+    The second-last axis has length ceil(log2(m)) for storing all bits.
+    All other axes stay the same as in MVArray.
+    """
+
+    def __init__(self, a, m=None):
+        if not isinstance(a, MVArray) and not isinstance(a, BPArray):
+            a = MVArray(a, m)
+            self.m = a.m
+        if isinstance(a, MVArray):
+            if m is not None and m != a.m:
+                a = MVArray(a, m)  # cast data
+            self.m = a.m
+            assert self.m in [2, 4, 8]
+            nwords = math.ceil(math.log2(self.m))
+            nbytes = (a.data.shape[-1] - 1) // 8 + 1
+            self.data = np.zeros(a.data.shape[:-1] + (nwords, nbytes), dtype=np.uint8)
+            for i in range(self.data.shape[-2]):
+                self.data[..., i, :] = np.packbits((a.data >> i) & 1, axis=-1)
+        else:  # we have a BPArray
+            self.data = a.data.copy()  # TODO: support conversion to different m
+            self.m = a.m
+        self.length = a.length
+        self.width = a.width
+
+    def __repr__(self):
+        return f'<BPArray length={self.length} width={self.width} m={self.m} bytes={self.data.nbytes}>'
+
+    def __len__(self):
+        return self.length
diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py
index 9f75a5c..cddde47 100644
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -1,21 +1,25 @@
+import math
+
 import numpy as np
-from . import packed_vectors
+
+from . import logic
 
 
 class LogicSim:
-    """A bit-parallel naive combinational logic simulator supporting 1, 4, or 8-valued logics.
+    """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.
     """
-    def __init__(self, circuit, nvectors=1, vdim=1):
+    def __init__(self, circuit, sims=1, m=8):
+        assert m in [2, 4, 8]
+        self.m = m
+        mdim = math.ceil(math.log2(m))
         self.circuit = circuit
-        self.nvectors = nvectors
-        nbytes = (nvectors - 1) // 8 + 1
+        self.sims = sims
+        nbytes = (sims - 1) // 8 + 1
         self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
-        self.state = np.zeros((len(circuit.lines), vdim, nbytes), dtype='uint8')
+        self.state = np.zeros((len(circuit.lines), mdim, nbytes), dtype='uint8')
         self.state_epoch = np.zeros(len(circuit.nodes), dtype='int8') - 1
-        self.tmp = np.zeros((5, vdim, nbytes), dtype='uint8')
-        self.zero = np.zeros((vdim, nbytes), dtype='uint8')
-        if vdim > 1:
-            self.zero[1] = 255
+        self.tmp = np.zeros((5, mdim, nbytes), dtype='uint8')
+        self.zero = np.zeros((mdim, nbytes), dtype='uint8')
         self.epoch = 0
 
         self.fork_vd1 = self.fork_vdx
@@ -45,23 +49,23 @@ class LogicSim:
         self.nbuff_vd3 = self.fork_vd3
         self.xor2_vd3 = self.xor_vd3
         
-        known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith(f'_vd{vdim}')]
+        known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith(f'_vd{mdim}')]
         self.node_fct = []
         for n in circuit.nodes:
             t = n.kind.lower().replace('__fork__', 'fork')
             t = t.replace('__const0__', 'const0')
             t = t.replace('__const1__', 'const1')
             t = t.replace('tieh', 'const1')
-            # t = t.replace('xor', 'or').replace('xnor', 'nor')
             fcts = [f for n, f in known_fct if t.startswith(n)]
             if len(fcts) < 1:
                 raise ValueError(f'Unknown node kind {n.kind}')
             self.node_fct.append(fcts[0])
 
     def assign(self, stimuli):
-        if isinstance(stimuli, packed_vectors.PackedVectors):
-            stimuli = stimuli.bits
-        for (stim, node) in zip(stimuli, self.interface):
+        """Assign stimuli to the primary inputs and state-elements (flip-flops)."""
+        if hasattr(stimuli, 'data'):
+            stimuli = stimuli.data
+        for stim, node in zip(stimuli, self.interface):
             if len(node.outs) == 0: continue
             outputs = [self.state[line.index] if line else self.tmp[3] for line in node.outs]
             self.node_fct[node.index]([stim], outputs)
@@ -78,13 +82,16 @@ class LogicSim:
                         self.state_epoch[line.reader.index] = self.epoch
 
     def capture(self, responses):
-        if isinstance(responses, packed_vectors.PackedVectors):
-            responses = responses.bits
-        for (resp, node) in zip(responses, self.interface):
+        """Capture the current values at the primary outputs and in the state-elements (flip-flops)."""
+        if hasattr(responses, 'data'):  # unwrap array objects that expose `.data` (e.g. logic.BPArray)
+            responses = responses.data
+        for resp, node in zip(responses, self.interface):
             if len(node.ins) == 0: continue
             resp[...] = self.state[node.ins[0].index]
+        # Values are written into `responses` in place; nothing is returned.
 
     def propagate(self):
+        """Propagate the input values towards the outputs (Perform all logic operations in topological order)."""
         for node in self.circuit.topological_order():
             if self.state_epoch[node.index] != self.epoch: continue
             inputs = [self.state[line.index] if line else self.zero for line in node.ins]
@@ -95,8 +102,7 @@ class LogicSim:
                 self.state_epoch[line.reader.index] = self.epoch
         self.epoch = (self.epoch + 1) % 128
 
-    @staticmethod
-    def fork_vdx(inputs, outputs):
+    def fork_vdx(self, inputs, outputs):
         for o in outputs: o[...] = inputs[0]
     
     def const0_vdx(self, _, outputs):
@@ -104,40 +110,34 @@ class LogicSim:
 
     # 2-valued simulation
 
-    @staticmethod
-    def not_vd1(inputs, outputs):
+    def not_vd1(self, inputs, outputs):
         outputs[0][0] = ~inputs[0][0]
 
     def const1_vd1(self, _, outputs):
         for o in outputs: o[...] = self.zero
         self.not_vd1(outputs, outputs)
 
-    @staticmethod
-    def and_vd1(inputs, outputs):
+    def and_vd1(self, inputs, outputs):
         o = outputs[0]
         o[0] = inputs[0][0]
         for i in inputs[1:]: o[0] &= i[0]
 
-    @staticmethod
-    def or_vd1(inputs, outputs):
+    def or_vd1(self, inputs, outputs):
         o = outputs[0]
         o[0] = inputs[0][0]
         for i in inputs[1:]: o[0] |= i[0]
 
-    @staticmethod
-    def xor_vd1(inputs, outputs):
+    def xor_vd1(self, inputs, outputs):
         o = outputs[0]
         o[0] = inputs[0][0]
         for i in inputs[1:]: o[0] ^= i[0]
 
-    @staticmethod
-    def sdff_vd1(inputs, outputs):
+    def sdff_vd1(self, inputs, outputs):
         outputs[0][0] = inputs[0][0]
         if len(outputs) > 1:
             outputs[1][0] = ~inputs[0][0]
 
-    @staticmethod
-    def dff_vd1(inputs, outputs):
+    def dff_vd1(self, inputs, outputs):
         outputs[0][0] = inputs[0][0]
         if len(outputs) > 1:
             outputs[1][0] = ~inputs[0][0]
@@ -155,93 +155,26 @@ class LogicSim:
         self.not_vd1(outputs, outputs)
 
     # 4-valued simulation
-    # sym [0] [1] (value, care)
-    #  0   0   1
-    #  1   1   1
-    #  -   0   0
-    #  X   1   0
-
-    @staticmethod
-    def not_vd2(inputs, outputs):
-        # 4-valued not:
-        # i: 0 1 - X
-        # o: 1 0 X X
-        # o0 1 0 1 1
-        # o1 1 1 0 0
-
-        outputs[0][0] = ~inputs[0][0] | ~inputs[0][1]  # value = 0 or DC
-        outputs[0][1] = inputs[0][1]  # care = C
+
+    def not_vd2(self, inputs, outputs):
+        logic.bp_not(outputs[0], inputs[0])
 
     def and_vd2(self, inputs, outputs):
-        # 4-valued:    o[0]:     o[1]:
-        #    0 1 - X   0 1 - X   0 1 - X
-        # 0  0 0 0 0   0 0 0 0   1 1 1 1
-        # 1  0 1 X X   0 1 1 1   1 1 0 0
-        # -  0 X X X   0 1 1 1   1 0 0 0
-        # X  0 X X X   0 1 1 1   1 0 0 0
-
-        i = inputs[0]
-        any0 = self.tmp[0]
-        anyd = self.tmp[1]
-        any0[0] = ~i[0] & i[1]
-        anyd[0] = ~i[1]
-        for i in inputs[1:]:
-            any0[0] |= ~i[0] & i[1]
-            anyd[0] |= ~i[1]
-        o = outputs[0]
-        o[0] = ~any0[0]  # value = no0
-        o[1] = any0[0] | ~anyd[0]  # care = any0 or noDC
+        logic.bp_and(outputs[0], *inputs)
 
     def or_vd2(self, inputs, outputs):
-        # 4-valued:    o[0]:     o[1]:
-        #    0 1 - X   0 1 - X   0 1 - X
-        # 0  0 1 X X   0 1 1 1   1 1 0 0
-        # 1  1 1 1 1   1 1 1 1   1 1 1 1
-        # -  X 1 X X   1 1 1 1   0 1 0 0
-        # X  X 1 X X   1 1 1 1   0 1 0 0
-
-        i = inputs[0]
-        any1 = self.tmp[0]
-        anyd = self.tmp[1]
-        any1[0] = i[0] & i[1]
-        anyd[0] = ~i[1]
-        for i in inputs[1:]:
-            any1[0] |= i[0] & i[1]
-            anyd[0] |= ~i[1]
-        o = outputs[0]
-        o[0] = any1[0] | anyd[0]  # value = any1 or anyDC
-        o[1] = any1[0] | ~anyd[0]  # care = any1 or noDC
+        logic.bp_or(outputs[0], *inputs)
 
     def xor_vd2(self, inputs, outputs):
-        # 4-valued:    o[0]:     o[1]:
-        #    0 1 - X   0 1 - X   0 1 - X
-        # 0  0 1 X X   0 1 1 1   1 1 0 0
-        # 1  1 0 X X   1 0 1 1   1 1 0 0
-        # -  X X X X   1 1 1 1   0 0 0 0
-        # X  X X X X   1 1 1 1   0 0 0 0
-
-        i = inputs[0]
-        odd1 = self.tmp[0]
-        anyd = self.tmp[1]
-        odd1[0] = i[0] & i[1]
-        anyd[0] = ~i[1]
-        for i in inputs[1:]:
-            odd1[0] ^= i[0] & i[1]
-            anyd[0] |= ~i[1]
-        o = outputs[0]
-        o[0] = odd1[0] | anyd[0]  # value = odd1 or anyDC
-        o[1] = ~anyd[0]  # care = noDC
+        logic.bp_xor(outputs[0], *inputs)
 
     def sdff_vd2(self, inputs, outputs):
         self.dff_vd2(inputs, outputs)
         if len(outputs) > 1:
-            outputs[1][0] = ~inputs[0][0] | ~inputs[0][1]  # value = 0 or DC
-            outputs[1][1] = inputs[0][1]  # care = C
+            logic.bp_not(outputs[1], inputs[0])
 
-    @staticmethod
-    def dff_vd2(inputs, outputs):
-        outputs[0][0] = inputs[0][0] | ~inputs[0][1]  # value = 1 or DC
-        outputs[0][1] = inputs[0][1]  # care = C
+    def dff_vd2(self, inputs, outputs):
+        logic.bp_buf(outputs[0], inputs[0])
 
     def nand_vd2(self, inputs, outputs):
         self.and_vd2(inputs, outputs)
@@ -260,149 +193,26 @@ class LogicSim:
         self.not_vd2(outputs, outputs)
 
     # 8-valued simulation
-    # sym [0] [1] [2] (initial value, ~final value, toggles present?)
-    #  0   0   1   0
-    #  1   1   0   0
-    #  -   0   0   0
-    #  X   1   1   0
-    #  R   0   0   1  _/"
-    #  F   1   1   1  "\_
-    #  P   0   1   1  _/\_
-    #  N   1   0   1  "\/"
 
     def not_vd3(self, inputs, outputs):
-        # 8-valued not:
-        # i: 0 1 - X R F P N
-        # i0 0 1 0 1 0 1 0 1
-        # i1 1 0 0 1 0 1 1 0
-        # i2 0 0 0 0 1 1 1 1
-        # o: 1 0 X X F R N P
-        # o0 1 0 1 1 1 0 1 0
-        # o1 0 1 1 1 1 0 0 1
-        # o2 0 0 0 0 1 1 1 1
-        i = inputs[0]
-        dc = self.tmp[0]
-        dc[0] = ~(i[0] ^ i[1]) & ~i[2]
-        dc = self.tmp[0]
-        outputs[0][0] = ~i[0] | dc[0]  # init.v = ~i0 or DC
-        outputs[0][1] = ~i[1] | dc[0]  # init.v = ~i1 or DC
-        outputs[0][2] = i[2]  # toggles = i2
+        logic.bp_not(outputs[0], inputs[0])
 
     def and_vd3(self, inputs, outputs):
-        # 8-valued:           o[0]:            o[1]:            o[2]:
-        #    0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N
-        # 0  0 0 0 0 0 0 0 0  0 0 0 0 0 0 0 0  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # 1  0 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 0 0 0 1 1 1 1
-        # -  0 X X X X X X X  0 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # X  0 X X X X X X X  0 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # R  0 R X X R R P R  0 0 1 1 0 0 0 0  1 0 1 1 0 0 1 0  0 1 0 0 1 1 1 1
-        # F  0 F X X R F P F  0 1 1 1 0 1 0 1  1 1 1 1 0 1 1 1  0 1 0 0 1 1 1 1
-        # P  0 P X X P P P P  0 0 1 1 0 0 0 0  1 1 1 1 1 1 1 1  0 1 0 0 1 1 1 1
-        # N  0 N X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 1 0 0 1 1 1 1
-        i = inputs[0]
-        anyi0 = self.tmp[0]
-        anyf0 = self.tmp[1]
-        anyd = self.tmp[2]
-        any0 = self.tmp[3]
-        any_t = self.tmp[4]
-        anyd[0] = ~(i[0] ^ i[1]) & ~i[2]
-        anyi0[0] = ~i[0] & ~anyd[0]
-        anyf0[0] = i[1] & ~anyd[0]
-        any_t[0] = i[2]
-        any0[0] = anyi0[0] & anyf0[0] & ~i[2]
-        for i in inputs[1:]:
-            dc = ~(i[0] ^ i[1]) & ~i[2]
-            anyd[0] |= dc
-            anyi0[0] |= ~i[0] & ~dc
-            anyf0[0] |= i[1] & ~dc
-            any_t[0] |= i[2]
-            any0[0] |= ~i[0] & ~dc & i[1] & ~i[2]
-        o = outputs[0]
-        o[0] = (~anyi0[0] | anyd[0]) & ~any0[0]  # initial = no_i0 or DC
-        o[1] = anyf0[0] | anyd[0]  # ~final = ~no_f0 or DC
-        o[2] = any_t[0] & ~(anyd[0] | any0[0])  # toggle = anyT and noDC and no0
+        logic.bp_and(outputs[0], *inputs)
 
     def or_vd3(self, inputs, outputs):
-        # 8-valued:           o[0]:            o[1]:            o[2]:
-        #    0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N
-        # 0  0 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 0 0 0 1 1 1 1
-        # 1  1 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0  0 0 0 0 0 0 0 0
-        # -  X 1 X X X X X X  1 1 1 1 1 1 1 1  1 0 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # X  X 1 X X X X X X  1 1 1 1 1 1 1 1  1 0 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # R  R 1 X X R N R R  0 1 1 1 0 1 0 0  0 0 1 1 0 0 0 0  1 0 0 0 1 1 1 1
-        # F  F 1 X X N F F F  1 1 1 1 1 1 1 1  1 0 1 1 0 1 1 1  1 0 0 0 1 1 1 1
-        # P  P 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  1 0 0 0 1 1 1 1
-        # N  N 1 X X R F N N  1 1 1 1 0 1 1 1  0 0 1 1 0 1 0 0  1 0 0 0 1 1 1 1
-        i = inputs[0]
-        anyi1 = self.tmp[0]
-        anyf1 = self.tmp[1]
-        anyd = self.tmp[2]
-        any1 = self.tmp[3]
-        any_t = self.tmp[4]
-        anyd[0] = ~(i[0] ^ i[1]) & ~i[2]
-        anyi1[0] = i[0] & ~anyd[0]
-        anyf1[0] = ~i[1] & ~anyd[0]
-        any_t[0] = i[2]
-        any1[0] = (anyi1[0] & anyf1[0]) & ~i[2]
-        for i in inputs[1:]:
-            dc = ~(i[0] ^ i[1]) & ~i[2]
-            anyd[0] |= dc
-            anyi1[0] |= i[0] & ~dc
-            anyf1[0] |= ~i[1] & ~dc
-            any_t[0] |= i[2]
-            any1[0] |= i[0] & ~dc & ~i[1] & ~i[2]
-        o = outputs[0]
-        o[0] = anyi1[0] | anyd[0]  # initial = i1 or DC
-        o[1] = (~anyf1[0] | anyd[0]) & ~any1[0]  # ~final = f1 or DC
-        o[2] = any_t[0] & ~(anyd[0] | any1[0])  # toggle = anyT and no(DC or 1)
+        logic.bp_or(outputs[0], *inputs)
 
     def xor_vd3(self, inputs, outputs):
-        # 8-valued:           o[0]:            o[1]:            o[2]:
-        #    0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N
-        # 0  0 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 0 0 0 1 1 1 1
-        # 1  1 0 X X F R N P  1 0 1 1 1 0 1 0  0 1 1 1 1 0 0 1  0 0 0 0 1 1 1 1
-        # -  X X X X X X X X  1 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # X  X X X X X X X X  1 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
-        # R  R F X X P N R F  0 1 1 1 0 1 0 1  0 1 1 1 1 0 0 1  1 1 0 0 1 1 1 1
-        # F  F R X X N P F R  1 0 1 1 1 0 1 0  1 0 1 1 0 1 1 0  1 1 0 0 1 1 1 1
-        # P  P N X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  1 1 0 0 1 1 1 1
-        # N  N P X X F R N P  1 0 1 1 1 0 1 0  0 1 1 1 1 0 0 1  1 1 0 0 1 1 1 1
-        i = inputs[0]
-        odd0 = self.tmp[0]
-        odd1 = self.tmp[1]
-        anyd = self.tmp[2]
-        anyt = self.tmp[3]
-        odd0[0] = i[0]
-        odd1[0] = i[1]
-        anyd[0] = ~(i[0] ^ i[1]) & ~i[2]
-        anyt[0] = i[2]
-        for i in inputs[1:]:
-            odd0[0] ^= i[0]
-            odd1[0] ^= i[1]
-            anyd[0] |= ~(i[0] ^ i[1]) & ~i[2]
-            anyt[0] |= i[2]
-        o = outputs[0]
-        o[0] = odd0[0] | anyd[0]
-        o[1] = ~odd1[0] | anyd[0]
-        o[2] = anyt[0] & ~anyd[0]
-        
+        logic.bp_xor(outputs[0], *inputs)
+
     def sdff_vd3(self, inputs, outputs):
         self.dff_vd3(inputs, outputs)
         if len(outputs) > 1:
-            i = inputs[0]
-            dc = self.tmp[0]
-            dc[0] = ~(i[0] ^ i[1]) & ~i[2]
-            outputs[1][0] = ~i[0] | dc[0]  # value = 1 or DC
-            outputs[1][1] = ~i[1] | dc[0]  # value = 1 or DC
-            outputs[1][2] = i[2]  # toggle = T
+            logic.bp_not(outputs[1], inputs[0])
 
     def dff_vd3(self, inputs, outputs):
-        i = inputs[0]
-        dc = self.tmp[0]
-        dc[0] = ~(i[0] ^ i[1]) & ~i[2]
-        outputs[0][0] = i[0] | dc[0]  # value = 1 or DC
-        outputs[0][1] = i[1] | dc[0]  # value = 1 or DC
-        outputs[0][2] = i[2]  # toggle = T
+        logic.bp_buf(outputs[0], inputs[0])
 
     def nand_vd3(self, inputs, outputs):
         self.and_vd3(inputs, outputs)
diff --git a/src/kyupy/packed_vectors.py b/src/kyupy/packed_vectors.py
deleted file mode 100644
index ffd3dda..0000000
--- a/src/kyupy/packed_vectors.py
+++ /dev/null
@@ -1,299 +0,0 @@
-import numpy as np
-from .bittools import popcount, bit_in
-
-
-class PackedVectors:
-    def __init__(self, nvectors=8, width=1, vdim=1, from_cache=None):
-        if from_cache is not None:
-            self.bits = np.array(from_cache)
-            self.width, self.vdim, nbytes = self.bits.shape
-        else:
-            self.bits = np.zeros((width, vdim, (nvectors - 1) // 8 + 1), dtype='uint8')
-            self.vdim = vdim
-            self.width = width
-        self.nvectors = nvectors
-        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
-        m0 = ~m1
-        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
-
-    @classmethod
-    def from_pair(cls, init, final):
-        assert init.nvectors == final.nvectors
-        assert len(init.bits) == len(final.bits)
-        init_v = init.bits[:, 0]
-        if init.vdim == 3:
-            init_c = (init.bits[:, 0] ^ init.bits[:, 1]) | init.bits[:, 2]
-        elif init.vdim == 2:
-            init_c = init.bits[:, 1]
-        else:
-            init_c = ~np.zeros_like(init.bits[:, 0])
-        final_v = final.bits[:, 0]
-        if final.vdim == 3:
-            final_c = (final.bits[:, 0] ^ final.bits[:, 1]) | final.bits[:, 2]
-            final_v = ~final.bits[:, 1]
-        elif final.vdim == 2:
-            final_c = final.bits[:, 1]
-        else:
-            final_c = ~np.zeros_like(final.bits[:, 0])
-        c = init_c & final_c
-        a0 = init_v & c
-        a1 = ~final_v & c
-        a2 = (init_v ^ final_v) & c
-        p = PackedVectors(init.nvectors, len(init.bits), 3)
-        p.bits[:, 0] = a0
-        p.bits[:, 1] = a1
-        p.bits[:, 2] = a2
-        return p
-        
-    def transition_vectors(self):
-        a = PackedVectors(self.nvectors-1, self.width, 3)
-        for pos in range(self.width):
-            for vidx in range(self.nvectors-1):
-                tr = self.get_value(vidx, pos) + self.get_value(vidx+1, pos)
-                if tr == '00':
-                    a.set_value(vidx, pos, '0')
-                elif tr == '11':
-                    a.set_value(vidx, pos, '1')
-                elif tr == '01':
-                    a.set_value(vidx, pos, 'R')
-                elif tr == '10':
-                    a.set_value(vidx, pos, 'F')
-                elif tr == '--':
-                    a.set_value(vidx, pos, '-')
-                else:
-                    a.set_value(vidx, pos, 'X')
-        return a
-        
-    def __add__(self, other):
-        a = PackedVectors(self.nvectors + other.nvectors, self.width, max(self.vdim, other.vdim))
-        # a.bits[:self.bits.shape[0], 0] = self.bits[:, 0]
-        # if self.vdim == 2:
-        #    a.bits[:self.bits.shape[0], 1] = self.care_bits
-        # elif self.vdim == 3:
-        #    a.bits[:self.bits.shape[0], 1] = ~self.value_bits
-        #    a.bits[:self.bits.shape[0], 2] = self.toggle_bits
-        for i in range(self.nvectors):
-            a[i] = self[i]
-        for i in range(len(other)):
-            a[self.nvectors+i] = other[i]
-        return a
-
-    def __len__(self):
-        return self.nvectors
-    
-    def randomize(self, one_probability=0.5):
-        for data in self.bits:
-            data[0] = np.packbits((np.random.rand(self.nvectors) < one_probability).astype(int))
-            if self.vdim == 2:
-                data[1] = 255
-            elif self.vdim == 3:
-                data[1] = ~np.packbits((np.random.rand(self.nvectors) < one_probability).astype(int))
-                data[2] = data[0] ^ ~data[1]
-            
-    def copy(self, selection_mask=None):
-        if selection_mask is not None:
-            cpy = PackedVectors(popcount(selection_mask), len(self.bits), self.vdim)
-            cur = 0
-            for vidx in range(self.nvectors):
-                if bit_in(selection_mask, vidx):
-                    cpy[cur] = self[vidx]
-                    cur += 1
-        else:
-            cpy = PackedVectors(self.nvectors, len(self.bits), self.vdim)
-            np.copyto(cpy.bits, self.bits)
-        return cpy
-
-    @property
-    def care_bits(self):
-        if self.vdim == 1:
-            return self.bits[:, 0] | 255
-        elif self.vdim == 2:
-            return self.bits[:, 1]
-        elif self.vdim == 3:
-            return (self.bits[:, 0] ^ self.bits[:, 1]) | self.bits[:, 2]
-
-    @property
-    def initial_bits(self):
-        return self.bits[:, 0]
-
-    @property
-    def value_bits(self):
-        if self.vdim == 3:
-            return ~self.bits[:, 1]
-        else:
-            return self.bits[:, 0]
-
-    @property
-    def toggle_bits(self):
-        if self.vdim == 3:
-            return self.bits[:, 2]
-        else:
-            return self.bits[:, 0] & 0
-
-    def get_value(self, vector, position):
-        if vector >= self.nvectors:
-            raise IndexError(f'vector out of range: {vector} >= {self.nvectors}')
-        a = self.bits[position, :, vector // 8]
-        m = self.mask[vector % 8]
-        if self.vdim == 1:
-            return '1' if a[0] & m[1] else '0'
-        elif self.vdim == 2:
-            if a[0] & m[1]:
-                return '1' if a[1] & m[1] else 'X'
-            else:
-                return '0' if a[1] & m[1] else '-'
-        elif self.vdim == 3:
-            if a[2] & m[1]:
-                if a[0] & m[1]:
-                    return 'F' if a[1] & m[1] else 'N'
-                else:
-                    return 'P' if a[1] & m[1] else 'R'
-            else:
-                if a[0] & m[1]:
-                    return 'X' if a[1] & m[1] else '1'
-                else:
-                    return '0' if a[1] & m[1] else '-'                
-
-    def get_values_for_position(self, position):
-        return ''.join(self.get_value(x, position) for x in range(self.nvectors))
-
-    def set_value(self, vector, position, v):
-        if vector >= self.nvectors:
-            raise IndexError(f'vector out of range: {vector} >= {self.nvectors}')
-        a = self.bits[position, :, vector // 8]
-        m = self.mask[vector % 8]
-        if self.vdim == 1:
-            self._set_value_vd1(a, m, v)
-        elif self.vdim == 2:
-            self._set_value_vd2(a, m, v)
-        elif self.vdim == 3:
-            self._set_value_vd3(a, m, v)
-    
-    def set_values(self, vector, v, mapping=None, inversions=None):
-        if vector >= self.nvectors:
-            raise IndexError(f'vector out of range: {vector} >= {self.nvectors}')
-        if not mapping:
-            mapping = [y for y in range(len(v))]
-        if inversions is None:
-            inversions = [False] * len(v)
-        for i, c in enumerate(v):
-            if inversions[i]:
-                if c == '1':
-                    c = '0'
-                elif c == '0':
-                    c = '1'
-                elif c == 'H':
-                    c = 'L'
-                elif c == 'L':
-                    c = 'H'
-                elif c == 'R':
-                    c = 'F'
-                elif c == 'F':
-                    c = 'R'
-            self.set_value(vector, mapping[i], c)
-    
-    def set_values_for_position(self, position, values):
-        for i, v in enumerate(values):
-            self.set_value(i, position, v)
-            
-    def __setitem__(self, vector, value):
-        for i, c in enumerate(value):
-            self.set_value(vector, i, c)
-
-    def __getitem__(self, vector):
-        if isinstance(vector, slice):
-            first = self.get_values_for_position(0)[vector]
-            ret = PackedVectors(len(first), self.width, self.vdim)
-            ret.set_values_for_position(0, first)
-            for pos in range(1, self.width):
-                ret.set_values_for_position(pos, self.get_values_for_position(pos)[vector])
-            return ret
-        return ''.join(self.get_value(vector, pos) for pos in range(len(self.bits)))
-
-    @staticmethod
-    def _set_value_vd1(a, m, v):
-        if v in [True, 1, '1', 'H', 'h']:
-            a[0] |= m[1]
-        else:
-            a[0] &= m[0]
-    
-    @staticmethod
-    def _set_value_vd2(a, m, v):
-        if v in [True, 1, '1', 'H', 'h']:
-            a[0] |= m[1]
-            a[1] |= m[1]
-        elif v in [False, 0, '0', 'L', 'l']:
-            a[0] &= m[0]
-            a[1] |= m[1]
-        elif v in ['X', 'x']:
-            a[0] |= m[1]
-            a[1] &= m[0]
-        else:
-            a[0] &= m[0]
-            a[1] &= m[0]
-
-    #   i fb act
-    # a 0 1 2
-    # - 0 0 0  None, '-'
-    # 0 0 1 0  False, 0, '0', 'l', 'L'
-    # 1 1 0 0  True, 1, '1', 'h', 'H'
-    # X 1 1 0  'x', 'X'
-    # / 0 0 1  '/', 'r', 'R'
-    # ^ 0 1 1  '^', 'p', 'P'
-    # v 1 0 1  'v', 'n', 'N'
-    # \ 1 1 1  '\', 'f', 'F'
-    @staticmethod
-    def _set_value_vd3(a, m, v):
-        if v in [False, 0, '0', 'L', 'l']:
-            a[0] &= m[0]
-            a[1] |= m[1]
-            a[2] &= m[0]
-        elif v in [True, 1, '1', 'H', 'h']:
-            a[0] |= m[1]
-            a[1] &= m[0]
-            a[2] &= m[0]
-        elif v in ['X', 'x']:
-            a[0] |= m[1]
-            a[1] |= m[1]
-            a[2] &= m[0]
-        elif v in ['/', 'r', 'R']:
-            a[0] &= m[0]
-            a[1] &= m[0]
-            a[2] |= m[1]
-        elif v in ['^', 'p', 'P']:
-            a[0] &= m[0]
-            a[1] |= m[1]
-            a[2] |= m[1]
-        elif v in ['v', 'n', 'N']:
-            a[0] |= m[1]
-            a[1] &= m[0]
-            a[2] |= m[1]
-        elif v in ['\\', 'f', 'F']:
-            a[0] |= m[1]
-            a[1] |= m[1]
-            a[2] |= m[1]
-        else:
-            a[0] &= m[0]
-            a[1] &= m[0]
-            a[2] &= m[0]
-                                    
-    def __repr__(self):
-        return f'<PackedVectors nvectors={self.nvectors}, width={self.width}, vdim={self.vdim}>'
-
-    def __str__(self):
-        lst = []
-        for p in range(self.nvectors):
-            lst.append(''.join(self.get_value(p, w) for w in range(len(self.bits))))
-        if len(lst) == 0: return ''
-        if len(lst[0]) > 64:
-            lst = [s[:32] + '...' + s[-32:] for s in lst]
-        if len(lst) <= 16:
-            return '\n'.join(lst)
-        else:
-            return '\n'.join(lst[:8]) + '\n...\n' + '\n'.join(lst[-8:])
-            
-    def diff(self, other, out=None):
-        if out is None:
-            out = np.zeros((self.width, self.bits.shape[-1]), dtype='uint8')
-        out[...] = (self.value_bits ^ other.value_bits) & self.care_bits & other.care_bits
-        return out
diff --git a/src/kyupy/sdf.py b/src/kyupy/sdf.py
index ed24b3c..beb58c0 100644
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@@ -1,14 +1,28 @@
+"""A simple and incomplete parser for the Standard Delay Format (SDF).
+
+The main purpose of this parser is to extract pin-to-pin delay and interconnect delay information from SDF files.
+Sophisticated timing specifications (timing checks, conditional delays, etc.) are currently not supported.
+
+The functions :py:func:`parse` and :py:func:`load` return an intermediate representation (:class:`DelayFile` object).
+Call :py:func:`DelayFile.annotation` to match the intermediate representation to a given circuit.
+
+"""
+
+from collections import namedtuple
+
 import numpy as np
 from lark import Lark, Transformer
-from collections import namedtuple
-from . import log
-import gzip
+
+from . import log, readtext
+
 
 Interconnect = namedtuple('Interconnect', ['orig', 'dest', 'r', 'f'])
 IOPath = namedtuple('IOPath', ['ipin', 'opin', 'r', 'f'])
 
 
 class DelayFile:
+    """An intermediate representation of an SDF file.
+    """
     def __init__(self, name, cells):
         self.name = name
         if None in cells:
@@ -22,26 +36,26 @@ class DelayFile:
                '\n'.join(str(i) for i in self.interconnects)
 
     def annotation(self, circuit, pin_index_f, dataset=1, interconnect=True, ffdelays=True):
-        """
-        Constructs an 3-dimensional array with timing data for each line in `circuit`.
-        Dimension 1 of the returned array is the line index.
-        Dimension 2 is the type of timing data: 0:`delay`, 1:`pulse rejection limit`.
-        Dimension 3 is the polarity at the output of the reading node: 0:`rising`, 1:`falling`.
-
-        The polarity for pulse rejection is determined by the latter transition of the pulse.
-        E.g., timing[42,1,0] is the rejection limit of a negative pulse at the output of the reader of line 42.
+        """Constructs a 3-dimensional ndarray with timing data for each line in ``circuit``.
 
         An IOPATH delay for a node is annotated to the line connected to the input pin specified in the IOPATH.
 
         Currently, only ABSOLUTE IOPATH and INTERCONNECT delays are supported.
         Pulse rejection limits are derived from absolute delays, explicit declarations (PATHPULSE etc.) are ignored.
 
-
+        :param circuit:
+        :param pin_index_f:
         :param ffdelays:
         :param interconnect:
-        :param pin_index_f:
-        :param circuit:
         :type dataset: int or tuple
+        :return: A 3-dimensional ndarray with timing data.
+
+            * Axis 0: line index.
+            * Axis 1: type of timing data: 0=`delay`, 1=`pulse rejection limit`.
+            * Axis 2: The polarity of the output transition of the reading node: 0=`rising`, 1=`falling`.
+
+            The polarity for pulse rejection is determined by the latter transition of the pulse.
+            E.g., timing[42,1,0] is the rejection limit of a negative pulse at the output of the reader of line 42.
         """
         def select_del(_delvals, idx):
             if type(dataset) is tuple:
@@ -170,8 +184,7 @@ class SdfTransformer(Transformer):
         return DelayFile(name, cells)
 
 
-def parse(sdf):
-    grammar = r"""
+grammar = r"""
     start: "(DELAYFILE" ( "(SDFVERSION" _NOB ")"
         | "(DESIGN" "\"" NAME "\"" ")"
         | "(DATE" _NOB ")"
@@ -201,13 +214,16 @@ def parse(sdf):
     %ignore ( /\r?\n/ | COMMENT )+
     %ignore /[\t\f ]+/
     """
-    if '\n' not in str(sdf):  # One line?: Assuming it is a file name.
-        if str(sdf).endswith('.gz'):
-            with gzip.open(sdf, 'rt') as f:
-                text = f.read()
-        else:
-            with open(sdf, 'r') as f:
-                text = f.read()
-    else:
-        text = str(sdf)
+
+
+def parse(text):
+    """Parses the given ``text`` and returns a :class:`DelayFile` object."""
     return Lark(grammar, parser="lalr", transformer=SdfTransformer()).parse(text)
+
+
+def load(file):
+    """Parses the contents of ``file`` and returns a :class:`DelayFile` object.
+
+    The given file may be gzip compressed.
+    """
+    return parse(readtext(file))
diff --git a/src/kyupy/stil.py b/src/kyupy/stil.py
index d03b4ef..5c022ca 100644
--- a/src/kyupy/stil.py
+++ b/src/kyupy/stil.py
@@ -1,8 +1,19 @@
-from lark import Lark, Transformer
-from collections import namedtuple
+"""A simple and incomplete parser for the Standard Test Interface Language (STIL).
+
+The main purpose of this parser is to load scan pattern sets from STIL files.
+It supports only a very limited subset of STIL.
+
+The functions :py:func:`load` and :py:func:`read` return an intermediate representation (:class:`StilFile` object).
+Call :py:func:`StilFile.tests4v`, :py:func:`StilFile.tests8v`, or :py:func:`StilFile.responses4v` to
+obtain the appropriate vector sets.
+"""
+
 import re
-import gzip
-from .packed_vectors import PackedVectors
+from collections import namedtuple
+
+from lark import Lark, Transformer
+
+from . import readtext, logic
 from .logic_sim import LogicSim
 
 
@@ -11,6 +22,8 @@ ScanPattern = namedtuple('ScanPattern', ['load', 'launch', 'capture', 'unload'])
 
 
 class StilFile:
+    """An intermediate representation of a STIL file.
+    """
     def __init__(self, version, signal_groups, scan_chains, calls):
         self.version = version
         self.signal_groups = signal_groups
@@ -21,7 +34,7 @@ class StilFile:
         self.patterns = []
         launch = {}
         capture = {}
-        load = {}
+        sload = {}
         for call in self.calls:
             if call.name == 'load_unload':
                 unload = {}
@@ -29,13 +42,13 @@ class StilFile:
                     if so_port in call.parameters:
                         unload[so_port] = call.parameters[so_port].replace('\n', '')
                 if len(launch) > 0:
-                    self.patterns.append(ScanPattern(load, launch, capture, unload))
+                    self.patterns.append(ScanPattern(sload, launch, capture, unload))
                     capture = {}
                     launch = {}
-                load = {}
+                sload = {}
                 for si_port in self.si_ports:
                     if si_port in call.parameters:
-                        load[si_port] = call.parameters[si_port].replace('\n', '')
+                        sload[si_port] = call.parameters[si_port].replace('\n', '')
             if call.name.endswith('_launch') or call.name.endswith('_capture'):
                 if len(launch) == 0:
                     launch = dict((k, v.replace('\n', '')) for k, v in call.parameters.items())
@@ -73,48 +86,69 @@ class StilFile:
             scan_inversions[chain[-1]] = scan_out_inversion
         return interface, pi_map, po_map, scan_maps, scan_inversions
         
-    def tests(self, c):
-        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(c)
-        tests = PackedVectors(len(self.patterns), len(interface), 2)
+    def tests(self, circuit):
+        """Assembles and returns a scan test pattern set for the given circuit.
+
+        This function assumes a static (stuck-at fault) test.
+        """
+        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(circuit)
+        tests = logic.MVArray((len(interface), len(self.patterns)))
         for i, p in enumerate(self.patterns):
             for si_port in self.si_ports.keys():
-                tests.set_values(i, p.load[si_port], scan_maps[si_port], scan_inversions[si_port])
-            tests.set_values(i, p.launch['_pi'], pi_map)
+                pattern = logic.mv_xor(p.load[si_port], scan_inversions[si_port])
+                tests.data[scan_maps[si_port], i] = pattern.data[:, 0]
+            tests.data[pi_map, i] = logic.MVArray(p.launch['_pi']).data[:, 0]
         return tests
 
-    def tests8v(self, c):
-        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(c)
-        init = PackedVectors(len(self.patterns), len(interface), 2)
+    def tests_loc(self, circuit):
+        """Assembles and returns a LoC scan test pattern set for the given circuit.
+
+        This function assumes a launch-on-capture (LoC) delay test.
+        It performs a logic simulation to obtain the first capture pattern (the one that launches the
+        delay test) and assembles the test pattern set from pairs of initialization and launch patterns.
+        """
+        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(circuit)
+        init = logic.MVArray((len(interface), len(self.patterns)), m=4)
+        # init = PackedVectors(len(self.patterns), len(interface), 2)
         for i, p in enumerate(self.patterns):
             # init.set_values(i, '0' * len(interface))
             for si_port in self.si_ports.keys():
-                init.set_values(i, p.load[si_port], scan_maps[si_port], scan_inversions[si_port])
-            init.set_values(i, p.launch['_pi'], pi_map)
-        sim4v = LogicSim(c, len(init), 2)
-        sim4v.assign(init)
+                pattern = logic.mv_xor(p.load[si_port], scan_inversions[si_port])
+                init.data[scan_maps[si_port], i] = pattern.data[:, 0]
+            init.data[pi_map, i] = logic.MVArray(p.launch['_pi']).data[:, 0]
+        launch_bp = logic.BPArray(init)
+        sim4v = LogicSim(circuit, len(init), m=4)
+        sim4v.assign(launch_bp)
         sim4v.propagate()
-        launch = init.copy()
-        sim4v.capture(launch)
+        sim4v.capture(launch_bp)
+        launch = logic.MVArray(launch_bp)
         for i, p in enumerate(self.patterns):
             # if there was no launch clock, then init = launch
             if ('P' not in p.launch['_pi']) or ('P' not in p.capture['_pi']):
                 for si_port in self.si_ports.keys():
-                    launch.set_values(i, p.load[si_port], scan_maps[si_port], scan_inversions[si_port])
+                    pattern = logic.mv_xor(p.load[si_port], scan_inversions[si_port])
+                    launch.data[scan_maps[si_port], i] = pattern.data[:, 0]
             if '_pi' in p.capture and 'P' in p.capture['_pi']:
-                launch.set_values(i, p.capture['_pi'], pi_map)
-        
-        return PackedVectors.from_pair(init, launch)
+                launch.data[pi_map, i] = logic.MVArray(p.capture['_pi']).data[:, 0]
+            launch.data[po_map, i] = logic.UNASSIGNED
+
+        return logic.mv_transition(init, launch)
                 
-    def responses(self, c):
-        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(c)
-        resp = PackedVectors(len(self.patterns), len(interface), 2)
+    def responses(self, circuit):
+        """Assembles and returns a scan test response pattern set for the given circuit."""
+        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(circuit)
+        resp = logic.MVArray((len(interface), len(self.patterns)))
+        # resp = PackedVectors(len(self.patterns), len(interface), 2)
         for i, p in enumerate(self.patterns):
-            if len(p.capture) > 0:
-                resp.set_values(i, p.capture['_po'], po_map)
-            else:
-                resp.set_values(i, p.launch['_po'], po_map)
+            resp.data[po_map, i] = logic.MVArray(p.capture['_po'] if len(p.capture) > 0 else p.launch['_po']).data[:, 0]
+            # if len(p.capture) > 0:
+            #    resp.set_values(i, p.capture['_po'], po_map)
+            # else:
+            #    resp.set_values(i, p.launch['_po'], po_map)
             for so_port in self.so_ports.keys():
-                resp.set_values(i, p.unload[so_port], scan_maps[so_port], scan_inversions[so_port])
+                pattern = logic.mv_xor(p.unload[so_port], scan_inversions[so_port])
+                resp.data[scan_maps[so_port], i] = pattern.data[:, 0]
+                # resp.set_values(i, p.unload[so_port], scan_maps[so_port], scan_inversions[so_port])
         return resp
         
         
@@ -160,10 +194,9 @@ class StilTransformer(Transformer):
 
     def start(self, args):
         return StilFile(float(args[0]), self._signal_groups, self._scan_chains, self._calls)
-        
 
-def parse(stil):
-    grammar = r"""
+
+grammar = r"""
     start: "STIL" FLOAT _ignore _block*
     _block: signal_groups | scan_structures | pattern
         | "Header" _ignore
@@ -173,10 +206,10 @@ def parse(stil):
         | "PatternExec" _ignore
         | "Procedures" _ignore
         | "MacroDefs" _ignore
-    
+
     signal_groups: "SignalGroups" "{" signal_group* "}"
     signal_group: quoted "=" "'" quoted ( "+" quoted)* "'" _ignore? ";"?
-    
+
     scan_structures: "ScanStructures" "{" scan_chain* "}"
     scan_chain: "ScanChain" quoted "{" ( scan_length
         | scan_in | scan_out | scan_inversion | scan_cells | scan_master_clock )* "}"
@@ -186,7 +219,7 @@ def parse(stil):
     scan_inversion: "ScanInversion" /[0-9]+/ ";"
     scan_cells: "ScanCells" (quoted | /!/)* ";"
     scan_master_clock: "ScanMasterClock" quoted ";"
-    
+
     pattern: "Pattern" quoted "{" ( label | w | c | macro | ann | call )* "}"
     label: quoted ":"
     w: "W" quoted ";"
@@ -195,7 +228,7 @@ def parse(stil):
     ann: "Ann" _ignore
     call: "Call" quoted "{" call_parameter* "}"
     call_parameter: quoted "=" /[^;]+/ ";"
-        
+
     quoted: /"[^"]*"/
     FLOAT: /[-0-9.]+/
     _ignore: "{" _NOB? _ignore_inner* "}"
@@ -203,50 +236,16 @@ def parse(stil):
     _NOB: /[^{}]+/
     %ignore ( /\r?\n/ | "//" /[^\n]*/ | /[\t\f ]/ )+
     """
-    if '\n' not in str(stil):  # One line?: Assuming it is a file name.
-        if str(stil).endswith('.gz'):
-            with gzip.open(stil, 'rt') as f:
-                text = f.read()
-        else:
-            with open(stil, 'r') as f:
-                text = f.read()
-    else:
-        text = str(stil)
+
+
+def parse(text):
+    """Parses the given ``text`` and returns a :class:`StilFile` object."""
     return Lark(grammar, parser="lalr", transformer=StilTransformer()).parse(text)
 
 
-def extract_scan_pattens(stil_calls):
-    pats = []
-    pi = None
-    scan_in = None
-    for call in stil_calls:
-        if call.name == 'load_unload':
-            scan_out = call.parameters.get('Scan_Out')
-            if scan_out is not None:
-                scan_out = scan_out.replace('\n', '')
-            if pi: pats.append(ScanPattern(scan_in, pi, None, scan_out))
-            scan_in = call.parameters.get('Scan_In')
-            if scan_in is not None:
-                scan_in = scan_in.replace('\n', '')
-        if call.name == 'allclock_capture':
-            pi = call.parameters['_pi'].replace('\n', '')
-    return pats
-
-
-def match_patterns(stil_file, pats, interface):    
-    intf_pos = dict([(n.name, i) for i, n in enumerate(interface)])
-    pi_map = [intf_pos[n] for n in stil_file.signal_groups['_pi']]
-    scan_map = [intf_pos[re.sub(r'b..\.', '', n)] for n in reversed(stil_file.scan_chains['1'])]
-    # print(scan_map)
-    tests = PackedVectors(len(pats), len(interface), 2)
-    for i, p in enumerate(pats):
-        tests.set_values(i, p.scan_in, scan_map)
-        tests.set_values(i, p.pi, pi_map)
-
-    resp = PackedVectors(len(pats), len(interface), 2)
-    for i, p in enumerate(pats):
-        resp.set_values(i, p.pi, pi_map)
-        resp.set_values(i, p.scan_out, scan_map)
-
-    return tests, resp
+def load(file):
+    """Parses the contents of ``file`` and returns a :class:`StilFile` object.
 
+    The given file may be gzip compressed.
+    """
+    return parse(readtext(file))
diff --git a/src/kyupy/verilog.py b/src/kyupy/verilog.py
index 971ba7a..61e76ee 100644
--- a/src/kyupy/verilog.py
+++ b/src/kyupy/verilog.py
@@ -1,8 +1,14 @@
+"""A simple and incomplete parser for Verilog files.
+
+The main purpose of this parser is to load synthesized, non-hierarchical (flat) gate-level netlists.
+It supports only a very limited subset of Verilog.
+"""
+
 from collections import namedtuple
-import gzip
 
 from lark import Lark, Transformer
 
+from . import readtext
 from .circuit import Circuit, Node, Line
 from .saed import pin_index, pin_is_output
 
@@ -152,22 +158,21 @@ grammar = """
     """
 
 
-def loads(s, *, branchforks=False):
-    return Lark(grammar, parser="lalr", transformer=VerilogTransformer(branchforks)).parse(s)
+def parse(text, *, branchforks=False):
+    """Parses the given ``text`` as Verilog code.
 
+    :param text: A string with Verilog code.
+    :param branchforks: If set to ``True``, the returned circuit will include additional `forks` on each fanout branch.
+        These forks are needed to correctly annotate interconnect delays
+        (see :py:func:`kyupy.sdf.DelayFile.annotation`).
+    :return: A :class:`~kyupy.circuit.Circuit` object.
+    """
+    return Lark(grammar, parser="lalr", transformer=VerilogTransformer(branchforks)).parse(text)
 
-def load(fp, *, branchforks=False):
-    return loads(fp.read(), branchforks=branchforks)
 
+def load(file, *args, **kwargs):
+    """Parses the contents of ``file`` as Verilog code.
 
-def parse(verilog, branchforks=False):
-    if '\n' not in str(verilog):  # One line?: Assuming it is a file name.
-        if str(verilog).endswith('.gz'):
-            with gzip.open(verilog, 'rt') as f:
-                text = f.read()
-        else:
-            with open(verilog, 'r') as f:
-                text = f.read()
-    else:
-        text = str(verilog)
-    return loads(text, branchforks=branchforks)
+    The given file may be gzip compressed. Takes the same keyword arguments as :py:func:`parse`.
+    """
+    return parse(readtext(file), *args, **kwargs)
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index 052e969..2766997 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -1,8 +1,24 @@
+"""High-Throughput combinational logic timing simulators.
+
+These simulators work similarly to :py:class:`kyupy.logic_sim.LogicSim`.
+They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
+Instead of propagating logic values, these simulators propagate signal histories (waveforms).
+They are designed to run many simulations in parallel. While their latencies are quite high, they achieve
+high throughput.
+
+The simulators are not event-based and are not capable of simulating sequential circuits directly.
+
+Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the derived class
+:py:class:`WaveSimCuda` runs on the GPU.
+"""
+
 import math
 from bisect import bisect, insort_left
 
 import numpy as np
+
 from . import numba
+from . import cuda
 
 
 TMAX = np.float32(2 ** 127)  # almost np.PINF for 32-bit floating point values
@@ -77,6 +93,7 @@ class Heap:
 
 
 class WaveSim:
+    """A waveform-based combinational logic timing simulator."""
     def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
         self.circuit = circuit
         self.sims = sims
@@ -243,21 +260,26 @@ class WaveSim:
         self.timing[line, 0, polarity] = delay
 
     def assign(self, vectors, time=0.0, offset=0):
-        nvectors = min(vectors.nvectors - offset, self.sims)
+        nvectors = min(len(vectors) - offset, self.sims)
         for i, node in enumerate(self.interface):
             ppi_loc = self.sat[self.ppi_offset + i, 0]
             if ppi_loc < 0: continue
             for p in range(nvectors):
                 vector = p + offset
-                a = vectors.bits[i, :, vector // 8]
+                a = vectors.data[i, :, vector // 8]
                 m = self.mask[vector % 8]
                 toggle = 0
-                if a[0] & m[1]:
-                    self.state[ppi_loc, p] = TMIN
-                    toggle += 1
-                if (len(a) > 2) and (a[2] & m[1]) and ((a[0] & m[1]) == (a[1] & m[1])):
-                    self.state[ppi_loc + toggle, p] = time
-                    toggle += 1
+                if len(a) <= 2:
+                    if a[0] & m[1]:
+                        self.state[ppi_loc, p] = TMIN
+                        toggle += 1
+                else:
+                    if a[1] & m[1]:
+                        self.state[ppi_loc, p] = TMIN
+                        toggle += 1
+                    if (a[2] & m[1]) and ((a[0] & m[1]) != (a[1] & m[1])):
+                        self.state[ppi_loc + toggle, p] = time
+                        toggle += 1
                 self.state[ppi_loc + toggle, p] = TMAX
 
     def propagate(self, sims=None, sd=0.0, seed=1):
@@ -519,3 +541,312 @@ def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0):
         state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
         
     return overflows
+
+
+class WaveSimCuda(WaveSim):
+    """A GPU-accelerated waveform-based combinational logic timing simulator."""
+    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
+        super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
+
+        self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
+
+        self.d_state = cuda.to_device(self.state)
+        self.d_sat = cuda.to_device(self.sat)
+        self.d_ops = cuda.to_device(self.ops)
+        self.d_timing = cuda.to_device(self.timing)
+        self.d_tdata = cuda.to_device(self.tdata)
+        self.d_cdata = cuda.to_device(self.cdata)
+
+        self._block_dim = (32, 16)
+
+    def get_line_delay(self, line, polarity):
+        return self.d_timing[line, 0, polarity]
+
+    def set_line_delay(self, line, polarity, delay):
+        self.d_timing[line, 0, polarity] = delay
+
+    def assign(self, vectors, time=0.0, offset=0):
+        assert (offset % 8) == 0
+        byte_offset = offset // 8
+        assert byte_offset < vectors.data.shape[-1]
+        pdim = min(vectors.data.shape[-1] - byte_offset, self.tdata.shape[-1])
+
+        self.tdata[..., 0:pdim] = vectors.data[..., byte_offset:pdim + byte_offset]
+        if vectors.m == 2:
+            self.tdata[:, 2, 0:pdim] = 0
+        cuda.to_device(self.tdata, to=self.d_tdata)
+
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        assign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset,
+                                                 len(self.interface), self.d_tdata, time)
+
+    def _grid_dim(self, x, y):
+        gx = math.ceil(x / self._block_dim[0])
+        gy = math.ceil(y / self._block_dim[1])
+        return gx, gy
+
+    def propagate(self, sims=None, sd=0.0, seed=1):
+        if sims is None:
+            sims = self.sims
+        else:
+            sims = min(sims, self.sims)
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            grid_dim = self._grid_dim(sims, op_stop - op_start)
+            wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.sat, int(0),
+                                                   sims, self.d_timing, sd, seed)
+        cuda.synchronize()
+        self.lst_eat_valid = False
+
+    def wave(self, line, vector):
+        if line < 0:
+            return None
+        mem, wcap, _ = self.sat[line]
+        if mem < 0:
+            return None
+        return self.d_state[mem:mem + wcap, vector]
+
+    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
+                                                  self.d_cdata, time, sd * math.sqrt(2), seed)
+        self.cdata[...] = self.d_cdata
+        if cdata is not None:
+            assert offset < cdata.shape[1]
+            cap_dim = min(cdata.shape[1] - offset, self.sims)
+            cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
+        self.lst_eat_valid = True
+        return self.cdata
+
+    def reassign(self, time=0.0):
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset,
+                                                   self.d_cdata, time)
+        cuda.synchronize()
+
+    def wavecaps(self):
+        gx = math.ceil(len(self.circuit.lines) / 512)
+        wavecaps_kernel[gx, 512](self.d_state, self.d_sat, self.sims)
+        self.sat[...] = self.d_sat
+        return self.sat[..., 2]
+
+
+@cuda.jit()
+def wavecaps_kernel(state, sat, sims):
+    idx = cuda.grid(1)
+    if idx >= len(sat): return
+
+    lidx, lcap, _ = sat[idx]
+    if lidx < 0: return
+
+    wcap = 0
+    for sidx in range(sims):
+        for tidx in range(lcap):
+            t = state[lidx + tidx, sidx]
+            if tidx > wcap:
+                wcap = tidx
+            if t >= TMAX: break
+
+    sat[idx, 2] = wcap + 1
+
+
+@cuda.jit()
+def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
+    vector, y = cuda.grid(2)
+    if vector >= state.shape[-1]: return
+    if ppo_offset + y >= len(sat): return
+
+    ppo, ppo_cap, _ = sat[ppo_offset + y]
+    ppi, ppi_cap, _ = sat[ppi_offset + y]
+    if ppo < 0: return
+    if ppi < 0: return
+
+    ppo_val = int(cdata[y, vector, 1])
+    ppi_val = int(0)
+    for tidx in range(ppi_cap):
+        t = state[ppi + tidx, vector]
+        if t >= TMAX: break
+        ppi_val ^= 1
+
+    # make new waveform at PPI
+    toggle = 0
+    if ppi_val:
+        state[ppi + toggle, vector] = TMIN
+        toggle += 1
+    if ppi_val != ppo_val:
+        state[ppi + toggle, vector] = ppi_time
+        toggle += 1
+    state[ppi + toggle, vector] = TMAX
+
+
+@cuda.jit()
+def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed):
+    x, y = cuda.grid(2)
+    if ppo_offset + y >= len(sat): return
+    line, tdim, _ = sat[ppo_offset + y]
+    if line < 0: return
+    if x >= state.shape[-1]: return
+    vector = x
+    m = 0.5
+    acc = 0.0
+    eat = TMAX
+    lst = TMIN
+    tog = 0
+    ovl = 0
+    val = int(0)
+    final = int(0)
+    for tidx in range(tdim):
+        t = state[line + tidx, vector]
+        if t >= TMAX:
+            if t == TMAX_OVL:
+                ovl = 1
+            break
+        m = -m
+        final ^= 1
+        if t < time:
+            val ^= 1
+        if t <= TMIN: continue
+        if s_sqrt2 > 0:
+            acc += m * (1 + math.erf((t - time) / s_sqrt2))
+        eat = min(eat, t)
+        lst = max(lst, t)
+        tog += 1
+    if s_sqrt2 > 0:
+        if m < 0:
+            acc += 1
+        if acc >= 0.99:
+            val = 1
+        elif acc > 0.01:
+            seed = (seed << 4) + (vector << 20) + (y << 1)
+            seed = int(0xDEECE66D) * seed + 0xB
+            seed = int(0xDEECE66D) * seed + 0xB
+            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
+            val = rnd < acc
+        else:
+            val = 0
+    else:
+        acc = val
+
+    cdata[y, vector, 0] = acc
+    cdata[y, vector, 1] = val
+    cdata[y, vector, 2] = final
+    cdata[y, vector, 3] = (val != final)
+    cdata[y, vector, 4] = eat
+    cdata[y, vector, 5] = lst
+    cdata[y, vector, 6] = ovl
+
+
+@cuda.jit()
+def assign_kernel(state, sat, ppi_offset, intf_len, tdata, time):
+    x, y = cuda.grid(2)
+    if y >= intf_len: return
+    line = sat[ppi_offset + y, 0]
+    if line < 0: return
+    sdim = state.shape[-1]
+    if x >= sdim: return
+    vector = x
+    a0 = tdata[y, 0, vector // 8]
+    a1 = tdata[y, 1, vector // 8]
+    a2 = tdata[y, 2, vector // 8]
+    m = np.uint8(1 << (7 - (vector % 8)))
+    toggle = 0
+    if a1 & m:
+        state[line + toggle, x] = TMIN
+        toggle += 1
+    if (a2 & m) and ((a0 & m) != (a1 & m)):
+        state[line + toggle, x] = time
+        toggle += 1
+    state[line + toggle, x] = TMAX
+
+
+@cuda.jit(device=True)
+def rand_gauss_dev(seed, sd):
+    clamp = 0.5
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for i in range(12):
+            seed = int(0xDEECE66D) * seed + 0xB
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@cuda.jit()
+def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sd, seed):
+    x, y = cuda.grid(2)
+    st_idx = st_start + x
+    op_idx = op_start + y
+    if st_idx >= st_stop: return
+    if op_idx >= op_stop: return
+    lut = ops[op_idx, 0]
+    z_idx = ops[op_idx, 1]
+    a_idx = ops[op_idx, 2]
+    b_idx = ops[op_idx, 3]
+    overflows = int(0)
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_mem = sat[a_idx, 0]
+    b_mem = sat[b_idx, 0]
+    z_mem, z_cap, _ = sat[z_idx]
+
+    a_cur = int(0)
+    b_cur = int(0)
+    z_cur = lut & 1
+    if z_cur == 1:
+        state[z_mem, st_idx] = TMIN
+
+    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_dev(_seed ^ a_mem ^ z_cur, sd)
+    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_dev(_seed ^ b_mem ^ z_cur, sd)
+
+    previous_t = TMIN
+
+    current_t = min(a, b)
+    inputs = int(0)
+
+    while current_t < TMAX:
+        z_val = z_cur & 1
+        if b < a:
+            b_cur += 1
+            b = state[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ b_mem ^ z_val ^ 1, sd)
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss_dev(_seed ^ b_mem ^ z_val, sd)
+            inputs ^= 2
+            next_t = b
+        else:
+            a_cur += 1
+            a = state[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_dev(_seed ^ a_mem ^ z_val ^ 1, sd)
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss_dev(_seed ^ a_mem ^ z_val, sd)
+            inputs ^= 1
+            next_t = a
+
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):
+                    state[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    overflows += 1
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                z_cur -= 1
+                if z_cur > 0:
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                else:
+                    previous_t = TMIN
+        current_t = min(a, b)
+
+    if overflows > 0:
+        state[z_mem + z_cur, st_idx] = TMAX_OVL
+    else:
+        state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
diff --git a/src/kyupy/wave_sim_cuda.py b/src/kyupy/wave_sim_cuda.py
deleted file mode 100644
index 835bfa2..0000000
--- a/src/kyupy/wave_sim_cuda.py
+++ /dev/null
@@ -1,317 +0,0 @@
-import numpy as np
-import math
-from .wave_sim import WaveSim
-from . import cuda
-
-TMAX = np.float32(2 ** 127)  # almost np.PINF for 32-bit floating point values
-TMAX_OVL = np.float32(1.1 * 2 ** 127)  # almost np.PINF with overflow mark
-TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values
-
-
-class WaveSimCuda(WaveSim):
-    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
-        super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
-
-        self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
-
-        self.d_state = cuda.to_device(self.state)
-        self.d_sat = cuda.to_device(self.sat)
-        self.d_ops = cuda.to_device(self.ops)
-        self.d_timing = cuda.to_device(self.timing)
-        self.d_tdata = cuda.to_device(self.tdata)
-        self.d_cdata = cuda.to_device(self.cdata)
-
-        self._block_dim = (32, 16)
-
-    def get_line_delay(self, line, polarity):
-        return self.d_timing[line, 0, polarity]
-
-    def set_line_delay(self, line, polarity, delay):
-        self.d_timing[line, 0, polarity] = delay
-
-    def assign(self, vectors, time=0.0, offset=0):
-        assert (offset % 8) == 0
-        byte_offset = offset // 8
-        assert byte_offset < vectors.bits.shape[-1]
-        pdim = min(vectors.bits.shape[-1] - byte_offset, self.tdata.shape[-1])
-
-        self.tdata[..., 0:pdim] = vectors.bits[..., byte_offset:pdim + byte_offset]
-        if vectors.vdim == 1:
-            self.tdata[:, 1, 0:pdim] = ~self.tdata[:, 1, 0:pdim]
-            self.tdata[:, 2, 0:pdim] = 0
-        cuda.to_device(self.tdata, to=self.d_tdata)
-
-        grid_dim = self._grid_dim(self.sims, len(self.interface))
-        assign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset,
-                                                 len(self.interface), self.d_tdata, time)
-
-    def _grid_dim(self, x, y):
-        gx = math.ceil(x / self._block_dim[0])
-        gy = math.ceil(y / self._block_dim[1])
-        return gx, gy
-
-    def propagate(self, sims=None, sd=0.0, seed=1):
-        if sims is None:
-            sims = self.sims
-        else:
-            sims = min(sims, self.sims)
-        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.sat, int(0),
-                                                   sims, self.d_timing, sd, seed)
-        cuda.synchronize()
-        self.lst_eat_valid = False
-
-    def wave(self, line, vector):
-        if line < 0:
-            return None
-        mem, wcap, _ = self.sat[line]
-        if mem < 0:
-            return None
-        return self.d_state[mem:mem + wcap, vector]
-    
-    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
-        grid_dim = self._grid_dim(self.sims, len(self.interface))
-        capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
-                                                  self.d_cdata, time, sd * math.sqrt(2), seed)
-        self.cdata[...] = self.d_cdata
-        if cdata is not None:
-            assert offset < cdata.shape[1]
-            cap_dim = min(cdata.shape[1] - offset, self.sims)
-            cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
-        self.lst_eat_valid = True
-        return self.cdata
-
-    def reassign(self, time=0.0):
-        grid_dim = self._grid_dim(self.sims, len(self.interface))
-        reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset,
-                                                   self.d_cdata, time)
-        cuda.synchronize()
-        
-    def wavecaps(self):
-        gx = math.ceil(len(self.circuit.lines) / 512)
-        wavecaps_kernel[gx, 512](self.d_state, self.d_sat, self.sims)
-        self.sat[...] = self.d_sat
-        return self.sat[..., 2]
-
-
-@cuda.jit()
-def wavecaps_kernel(state, sat, sims):
-    idx = cuda.grid(1)
-    if idx >= len(sat): return
-    
-    lidx, lcap, _ = sat[idx]
-    if lidx < 0: return
-    
-    wcap = 0
-    for sidx in range(sims):
-        for tidx in range(lcap):
-            t = state[lidx + tidx, sidx]
-            if tidx > wcap:
-                wcap = tidx
-            if t >= TMAX: break
-
-    sat[idx, 2] = wcap + 1
-    
-    
-@cuda.jit()
-def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
-    vector, y = cuda.grid(2)
-    if vector >= state.shape[-1]: return
-    if ppo_offset + y >= len(sat): return
-
-    ppo, ppo_cap, _ = sat[ppo_offset + y]
-    ppi, ppi_cap, _ = sat[ppi_offset + y]
-    if ppo < 0: return
-    if ppi < 0: return
-
-    ppo_val = int(cdata[y, vector, 1])
-    ppi_val = int(0)
-    for tidx in range(ppi_cap):
-        t = state[ppi + tidx, vector]
-        if t >= TMAX: break
-        ppi_val ^= 1
-    
-    # make new waveform at PPI
-    toggle = 0
-    if ppi_val:
-        state[ppi + toggle, vector] = TMIN
-        toggle += 1
-    if ppi_val != ppo_val:
-        state[ppi + toggle, vector] = ppi_time
-        toggle += 1
-    state[ppi + toggle, vector] = TMAX
-
-
-@cuda.jit()
-def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed):
-    x, y = cuda.grid(2)
-    if ppo_offset + y >= len(sat): return
-    line, tdim, _ = sat[ppo_offset + y]
-    if line < 0: return
-    if x >= state.shape[-1]: return
-    vector = x
-    m = 0.5
-    acc = 0.0
-    eat = TMAX
-    lst = TMIN
-    tog = 0
-    ovl = 0
-    val = int(0)
-    final = int(0)
-    for tidx in range(tdim):
-        t = state[line + tidx, vector]
-        if t >= TMAX:
-            if t == TMAX_OVL:
-                ovl = 1
-            break
-        m = -m
-        final ^= 1
-        if t < time:
-            val ^= 1
-        if t <= TMIN: continue
-        if s_sqrt2 > 0:
-            acc += m * (1 + math.erf((t - time) / s_sqrt2))
-        eat = min(eat, t)
-        lst = max(lst, t)
-        tog += 1
-    if s_sqrt2 > 0:
-        if m < 0:
-            acc += 1
-        if acc >= 0.99:
-            val = 1
-        elif acc > 0.01:
-            seed = (seed << 4) + (vector << 20) + (y << 1)
-            seed = int(0xDEECE66D) * seed + 0xB
-            seed = int(0xDEECE66D) * seed + 0xB
-            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
-            val = rnd < acc
-        else:
-            val = 0
-    else:
-        acc = val
-        
-    cdata[y, vector, 0] = acc
-    cdata[y, vector, 1] = val
-    cdata[y, vector, 2] = final
-    cdata[y, vector, 3] = (val != final)
-    cdata[y, vector, 4] = eat
-    cdata[y, vector, 5] = lst
-    cdata[y, vector, 6] = ovl
-
-
-@cuda.jit()
-def assign_kernel(state, sat, ppi_offset, intf_len, tdata, time):
-    x, y = cuda.grid(2)
-    if y >= intf_len: return
-    line = sat[ppi_offset + y, 0]
-    if line < 0: return
-    sdim = state.shape[-1]
-    if x >= sdim: return
-    vector = x
-    a0 = tdata[y, 0, vector // 8]
-    a1 = tdata[y, 1, vector // 8]
-    a2 = tdata[y, 2, vector // 8]
-    m = np.uint8(1 << (7 - (vector % 8)))
-    toggle = 0
-    if a0 & m:
-        state[line + toggle, x] = TMIN
-        toggle += 1
-    if (a2 & m) and ((a0 & m) == (a1 & m)):
-        state[line + toggle, x] = time
-        toggle += 1
-    state[line + toggle, x] = TMAX
-
-
-@cuda.jit(device=True)
-def rand_gauss(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for i in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@cuda.jit()
-def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sd, seed):
-    x, y = cuda.grid(2)
-    st_idx = st_start + x
-    op_idx = op_start + y
-    if st_idx >= st_stop: return
-    if op_idx >= op_stop: return
-    lut = ops[op_idx, 0]
-    z_idx = ops[op_idx, 1]
-    a_idx = ops[op_idx, 2]
-    b_idx = ops[op_idx, 3]
-    overflows = int(0)
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = sat[a_idx, 0]
-    b_mem = sat[b_idx, 0]
-    z_mem, z_cap, _ = sat[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    z_cur = lut & 1
-    if z_cur == 1:
-        state[z_mem, st_idx] = TMIN
-
-    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd)
-    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd)
-
-    previous_t = TMIN
-
-    current_t = min(a, b)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if b < a:
-            b_cur += 1
-            b = state[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd)
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd)
-            inputs ^= 2
-            next_t = b
-        else:
-            a_cur += 1
-            a = state[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd)
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd)
-            inputs ^= 1
-            next_t = a
-
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #   following toggle is earlier OR
-            #   pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    state[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = state[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                if z_cur > 0:
-                    previous_t = state[z_mem + z_cur - 1, st_idx]
-                else:
-                    previous_t = TMIN
-        current_t = min(a, b)
-        
-    if overflows > 0:
-        state[z_mem + z_cur, st_idx] = TMAX_OVL
-    else:
-        state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
diff --git a/tests/test_bench.py b/tests/test_bench.py
index 800f9be..25b9b1b 100644
--- a/tests/test_bench.py
+++ b/tests/test_bench.py
@@ -5,7 +5,7 @@ def test_b01(mydir):
     with open(mydir / 'b01.bench', 'r') as f:
         c = bench.parse(f.read())
         assert 92 == len(c.nodes)
-    c = bench.parse(mydir / 'b01.bench')
+    c = bench.load(mydir / 'b01.bench')
     assert 92 == len(c.nodes)
 
 
diff --git a/tests/test_circuit.py b/tests/test_circuit.py
index d61e8aa..b5d6055 100644
--- a/tests/test_circuit.py
+++ b/tests/test_circuit.py
@@ -1,6 +1,51 @@
 from kyupy.circuit import Circuit, Node, Line
 
 
+def test_lines():  # Line creation with automatic and explicit pin numbers, followed by removal
+    c = Circuit()
+    n1 = Node(c, 'n1')
+    n2 = Node(c, 'n2')
+    line = Line(c, n1, n2)  # no pins given: expected on pin 0 at both ends
+
+    assert line.driver == n1
+    assert line.reader == n2
+    assert line.driver_pin == 0
+    assert line.reader_pin == 0
+    assert n1.outs[0] == line
+    assert n2.ins[0] == line
+
+    line2 = Line(c, n1, (n2, 2))  # (node, pin) tuple requests reader pin 2 explicitly
+
+    assert line2.driver == n1
+    assert line2.reader == n2
+    assert line2.driver_pin == 1  # driver side given as a bare node: next output pin of n1
+    assert line2.reader_pin == 2
+    assert n1.outs[0] == line
+    assert n1.outs[1] == line2
+    assert n2.ins[1] is None  # the skipped input pin reads back as None
+    assert n2.ins[2] == line2
+
+    line3 = Line(c, n1, n2)  # no pins given again
+
+    assert line3.driver_pin == 2
+    assert line3.reader_pin == 1  # takes the empty slot n2.ins[1] instead of a pin after 2
+    assert n1.outs[2] == line3
+    assert n2.ins[1] == line3
+    assert n2.ins[2] == line2
+
+    assert len(c.lines) == 3
+
+    line3.remove()
+
+    assert len(c.lines) == 2  # the removed line is gone from the circuit's line list
+    assert c.lines[0].index == 0
+    assert c.lines[1].index == 1
+
+    assert n1.outs[2] is None  # both pins formerly used by line3 read back as None
+    assert n2.ins[1] is None
+    assert n2.ins[2] == line2
+
+
 def test_circuit():
     c = Circuit()
     in1 = Node(c, 'in1', 'buf')
diff --git a/tests/test_logic.py b/tests/test_logic.py
new file mode 100644
index 0000000..8fb933a
--- /dev/null
+++ b/tests/test_logic.py
@@ -0,0 +1,214 @@
+import kyupy.logic as lg
+
+
+def test_mvarray():  # MVArray construction from shapes, vectors and other arrays; casting for m=2, 4, 8
+
+    # instantiation with shape: an int gives one vector of that width, a tuple is (width, length)
+
+    ary = lg.MVArray(4)
+    assert ary.length == 1
+    assert len(ary) == 1
+    assert ary.width == 4
+
+    ary = lg.MVArray((3, 2))
+    assert ary.length == 2
+    assert len(ary) == 2
+    assert ary.width == 3
+
+    # instantiation with single vector (a flat list of values or a string of value characters)
+
+    ary = lg.MVArray([1, 0, 1])
+    assert ary.length == 1
+    assert ary.width == 3
+    assert str(ary) == "['101']"
+    assert ary[0] == '101'
+
+    ary = lg.MVArray("10X-")
+    assert ary.length == 1
+    assert ary.width == 4
+    assert str(ary) == "['10X-']"
+    assert ary[0] == '10X-'
+
+    ary = lg.MVArray("1")
+    assert ary.length == 1
+    assert ary.width == 1
+
+    ary = lg.MVArray(["1"])  # a one-element list holding a one-character string: still width 1
+    assert ary.length == 1
+    assert ary.width == 1
+
+    # instantiation with multiple vectors (one inner list or one string per vector)
+
+    ary = lg.MVArray([[0, 0], [0, 1], [1, 0], [1, 1]])
+    assert ary.length == 4
+    assert ary.width == 2
+
+    ary = lg.MVArray(["000", "001", "110", "---"])
+    assert ary.length == 4
+    assert ary.width == 3
+    assert str(ary) == "['000', '001', '110', '---']"
+    assert ary[2] == '110'
+
+    # casting to 2-valued logic: of the inputs below only 1, 'R' and 'N' become ONE, the rest ZERO
+
+    ary = lg.MVArray([0, 1, 2, None], m=2)
+    assert ary.data[0] == lg.ZERO
+    assert ary.data[1] == lg.ONE
+    assert ary.data[2] == lg.ZERO
+    assert ary.data[3] == lg.ZERO
+
+    ary = lg.MVArray("0-X1PRFN", m=2)
+    assert ary.data[0] == lg.ZERO
+    assert ary.data[1] == lg.ZERO
+    assert ary.data[2] == lg.ZERO
+    assert ary.data[3] == lg.ONE
+    assert ary.data[4] == lg.ZERO
+    assert ary.data[5] == lg.ONE
+    assert ary.data[6] == lg.ZERO
+    assert ary.data[7] == lg.ONE
+
+    # casting to 4-valued logic: 'X' and '-' are kept; 'P', 'R', 'F', 'N' collapse to ZERO or ONE
+
+    ary = lg.MVArray([0, 1, 2, None, 'F'], m=4)
+    assert ary.data[0] == lg.ZERO
+    assert ary.data[1] == lg.ONE
+    assert ary.data[2] == lg.UNKNOWN  # integer 2 is read as UNKNOWN
+    assert ary.data[3] == lg.UNASSIGNED  # None is read as UNASSIGNED
+    assert ary.data[4] == lg.ZERO
+
+    ary = lg.MVArray("0-X1PRFN", m=4)
+    assert ary.data[0] == lg.ZERO
+    assert ary.data[1] == lg.UNASSIGNED
+    assert ary.data[2] == lg.UNKNOWN
+    assert ary.data[3] == lg.ONE
+    assert ary.data[4] == lg.ZERO
+    assert ary.data[5] == lg.ONE
+    assert ary.data[6] == lg.ZERO
+    assert ary.data[7] == lg.ONE
+
+    # casting to 8-valued logic: all eight value characters map to distinct constants
+
+    ary = lg.MVArray([0, 1, 2, None, 'F'], m=8)
+    assert ary.data[0] == lg.ZERO
+    assert ary.data[1] == lg.ONE
+    assert ary.data[2] == lg.UNKNOWN
+    assert ary.data[3] == lg.UNASSIGNED
+    assert ary.data[4] == lg.FALL
+
+    ary = lg.MVArray("0-X1PRFN", m=8)
+    assert ary.data[0] == lg.ZERO
+    assert ary.data[1] == lg.UNASSIGNED
+    assert ary.data[2] == lg.UNKNOWN
+    assert ary.data[3] == lg.ONE
+    assert ary.data[4] == lg.PPULSE
+    assert ary.data[5] == lg.RISE
+    assert ary.data[6] == lg.FALL
+    assert ary.data[7] == lg.NPULSE
+
+    # copy constructor and casting (ary is still the 8-valued "0-X1PRFN" array built just above)
+
+    ary8 = lg.MVArray(ary, m=8)
+    assert ary8.length == 1
+    assert ary8.width == 8
+    assert ary8.data[7] == lg.NPULSE
+
+    ary4 = lg.MVArray(ary, m=4)
+    assert ary4.data[1] == lg.UNASSIGNED
+    assert ary4.data[7] == lg.ONE
+
+    ary2 = lg.MVArray(ary, m=2)
+    assert ary2.data[1] == lg.ZERO
+    assert ary2.data[7] == lg.ONE
+
+
+def test_mv_operations():  # truth tables of mv_not / mv_or / mv_and / mv_xor for m=2, 4 and 8
+    x1_2v = lg.MVArray("0011", m=2)
+    x2_2v = lg.MVArray("0101", m=2)  # position i of x1/x2 is one input pair; all 4 pairs occur
+    x1_4v = lg.MVArray("0000XXXX----1111", m=4)
+    x2_4v = lg.MVArray("0X-10X-10X-10X-1", m=4)  # all 16 pairs of the four values 0, X, -, 1
+    x1_8v = lg.MVArray("00000000XXXXXXXX--------11111111PPPPPPPPRRRRRRRRFFFFFFFFNNNNNNNN", m=8)
+    x2_8v = lg.MVArray("0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN", m=8)
+
+    assert lg.mv_not(x1_2v)[0] == '1100'
+    assert lg.mv_not(x1_4v)[0] == '1111XXXXXXXX0000'  # both 'X' and '-' inputs yield 'X'
+    assert lg.mv_not(x1_8v)[0] == '11111111XXXXXXXXXXXXXXXX00000000NNNNNNNNFFFFFFFFRRRRRRRRPPPPPPPP'
+
+    assert lg.mv_or(x1_2v, x2_2v)[0] == '0111'
+    assert lg.mv_or(x1_4v, x2_4v)[0] == '0XX1XXX1XXX11111'
+    assert lg.mv_or(x1_8v, x2_8v)[0] == '0XX1PRFNXXX1XXXXXXX1XXXX11111111PXX1PRFNRXX1RRNNFXX1FNFNNXX1NNNN'
+
+    assert lg.mv_and(x1_2v, x2_2v)[0] == '0001'
+    assert lg.mv_and(x1_4v, x2_4v)[0] == '00000XXX0XXX0XX1'
+    assert lg.mv_and(x1_8v, x2_8v)[0] == '000000000XXXXXXX0XXXXXXX0XX1PRFN0XXPPPPP0XXRPRPR0XXFPPFF0XXNPRFN'
+
+    assert lg.mv_xor(x1_2v, x2_2v)[0] == '0110'
+    assert lg.mv_xor(x1_4v, x2_4v)[0] == '0XX1XXXXXXXX1XX0'
+    assert lg.mv_xor(x1_8v, x2_8v)[0] == '0XX1PRFNXXXXXXXXXXXXXXXX1XX0NFRPPXXNPRFNRXXFRPNFFXXRFNPRNXXPNFRP'
+
+
+def test_bparray():  # BPArray construction, conversion back to MVArray, and the bp_* functions
+
+    ary = lg.BPArray(4)
+    assert ary.length == 1
+    assert len(ary) == 1
+    assert ary.width == 4
+
+    ary = lg.BPArray((3, 2))  # tuple is (width, length), as the assertions below show
+    assert ary.length == 2
+    assert len(ary) == 2
+    assert ary.width == 3
+
+    assert lg.MVArray(lg.BPArray("01", m=2))[0] == '01'  # string -> BPArray -> MVArray round-trip
+    assert lg.MVArray(lg.BPArray("0X-1", m=4))[0] == '0X-1'
+    assert lg.MVArray(lg.BPArray("0X-1PRFN", m=8))[0] == '0X-1PRFN'
+
+    x1_2v = lg.BPArray("0011", m=2)
+    x2_2v = lg.BPArray("0101", m=2)  # position i of x1/x2 is one input pair; all 4 pairs occur
+    x1_4v = lg.BPArray("0000XXXX----1111", m=4)
+    x2_4v = lg.BPArray("0X-10X-10X-10X-1", m=4)  # all 16 pairs of the four values 0, X, -, 1
+    x1_8v = lg.BPArray("00000000XXXXXXXX--------11111111PPPPPPPPRRRRRRRRFFFFFFFFNNNNNNNN", m=8)
+    x2_8v = lg.BPArray("0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN0X-1PRFN", m=8)
+
+    out_2v = lg.BPArray((4, 1), m=2)  # output buffers: one vector each, as wide as the inputs
+    out_4v = lg.BPArray((16, 1), m=4)
+    out_8v = lg.BPArray((64, 1), m=8)
+
+    lg.bp_buf(out_2v.data, x1_2v.data)  # bp_* calls here pass raw .data arrays, output first
+    lg.bp_buf(out_4v.data, x1_4v.data)
+    lg.bp_buf(out_8v.data, x1_8v.data)
+
+    assert lg.MVArray(out_2v)[0] == '0011'
+    assert lg.MVArray(out_4v)[0] == '0000XXXXXXXX1111'  # '-' inputs come out of bp_buf as 'X'
+    assert lg.MVArray(out_8v)[0] == '00000000XXXXXXXXXXXXXXXX11111111PPPPPPPPRRRRRRRRFFFFFFFFNNNNNNNN'
+
+    lg.bp_not(out_2v.data, x1_2v.data)  # the same output buffers are reused for every operation
+    lg.bp_not(out_4v.data, x1_4v.data)
+    lg.bp_not(out_8v.data, x1_8v.data)
+
+    assert lg.MVArray(out_2v)[0] == '1100'
+    assert lg.MVArray(out_4v)[0] == '1111XXXXXXXX0000'
+    assert lg.MVArray(out_8v)[0] == '11111111XXXXXXXXXXXXXXXX00000000NNNNNNNNFFFFFFFFRRRRRRRRPPPPPPPP'
+
+    lg.bp_or(out_2v.data, x1_2v.data, x2_2v.data)
+    lg.bp_or(out_4v.data, x1_4v.data, x2_4v.data)
+    lg.bp_or(out_8v.data, x1_8v.data, x2_8v.data)
+
+    assert lg.MVArray(out_2v)[0] == '0111'
+    assert lg.MVArray(out_4v)[0] == '0XX1XXX1XXX11111'
+    assert lg.MVArray(out_8v)[0] == '0XX1PRFNXXX1XXXXXXX1XXXX11111111PXX1PRFNRXX1RRNNFXX1FNFNNXX1NNNN'
+
+    lg.bp_and(out_2v.data, x1_2v.data, x2_2v.data)
+    lg.bp_and(out_4v.data, x1_4v.data, x2_4v.data)
+    lg.bp_and(out_8v.data, x1_8v.data, x2_8v.data)
+
+    assert lg.MVArray(out_2v)[0] == '0001'
+    assert lg.MVArray(out_4v)[0] == '00000XXX0XXX0XX1'
+    assert lg.MVArray(out_8v)[0] == '000000000XXXXXXX0XXXXXXX0XX1PRFN0XXPPPPP0XXRPRPR0XXFPPFF0XXNPRFN'
+
+    lg.bp_xor(out_2v.data, x1_2v.data, x2_2v.data)
+    lg.bp_xor(out_4v.data, x1_4v.data, x2_4v.data)
+    lg.bp_xor(out_8v.data, x1_8v.data, x2_8v.data)
+
+    assert lg.MVArray(out_2v)[0] == '0110'
+    assert lg.MVArray(out_4v)[0] == '0XX1XXXXXXXX1XX0'
+    assert lg.MVArray(out_8v)[0] == '0XX1PRFNXXXXXXXXXXXXXXXX1XX0NFRPPXXNPRFNRXXFRPNFFXXRFNPRNXXPNFRP'
diff --git a/tests/test_logic_sim.py b/tests/test_logic_sim.py
index df50546..990eec7 100644
--- a/tests/test_logic_sim.py
+++ b/tests/test_logic_sim.py
@@ -1,161 +1,96 @@
 from kyupy.logic_sim import LogicSim
 from kyupy import bench
-from kyupy.packed_vectors import PackedVectors
+from kyupy.logic import MVArray, BPArray
 
 
-def test_vd1():
+def test_2v():
     c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
-    s = LogicSim(c, 4)
+    s = LogicSim(c, 4, m=2)
     assert len(s.interface) == 5
-    p = PackedVectors(4, len(s.interface))
-    p[0] = '00000'
-    p[1] = '01000'
-    p[2] = '10000'
-    p[3] = '11000'
-    s.assign(p)
+    mva = MVArray(['00000', '01000', '10000', '11000'], m=2)
+    bpa = BPArray(mva)
+    s.assign(bpa)
     s.propagate()
-    s.capture(p)
-    assert p[0] == '00001'
-    assert p[1] == '01011'
-    assert p[2] == '10010'
-    assert p[3] == '11110'
+    s.capture(bpa)
+    mva = MVArray(bpa)
+    assert mva[0] == '00001'
+    assert mva[1] == '01011'
+    assert mva[2] == '10010'
+    assert mva[3] == '11110'
 
 
-def test_vd2():
+def test_4v():
     c = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
-    s = LogicSim(c, 16, 2)
+    s = LogicSim(c, 16, m=4)
     assert len(s.interface) == 5
-    p = PackedVectors(16, len(s.interface), 2)
-    p[0] = '00000'
-    p[1] = '01000'
-    p[2] = '0-000'
-    p[3] = '0X000'
-    p[4] = '10000'
-    p[5] = '11000'
-    p[6] = '1-000'
-    p[7] = '1X000'
-    p[8] = '-0000'
-    p[9] = '-1000'
-    p[10] = '--000'
-    p[11] = '-X000'
-    p[12] = 'X0000'
-    p[13] = 'X1000'
-    p[14] = 'X-000'
-    p[15] = 'XX000'
-    s.assign(p)
+    mva = MVArray(['00000', '01000', '0-000', '0X000',
+                   '10000', '11000', '1-000', '1X000',
+                   '-0000', '-1000', '--000', '-X000',
+                   'X0000', 'X1000', 'X-000', 'XX000'], m=4)
+    bpa = BPArray(mva)
+    s.assign(bpa)
     s.propagate()
-    s.capture(p)
-    assert p[0] == '00001'
-    assert p[1] == '01011'
-    assert p[2] == '0-0X1'
-    assert p[3] == '0X0X1'
-    assert p[4] == '10010'
-    assert p[5] == '11110'
-    assert p[6] == '1-X10'
-    assert p[7] == '1XX10'
-    assert p[8] == '-00XX'
-    assert p[9] == '-1X1X'
-    assert p[10] == '--XXX'
-    assert p[11] == '-XXXX'
-    assert p[12] == 'X00XX'
-    assert p[13] == 'X1X1X'
-    assert p[14] == 'X-XXX'
-    assert p[15] == 'XXXXX'
+    s.capture(bpa)
+    mva = MVArray(bpa)
+    assert mva[0] == '00001'
+    assert mva[1] == '01011'
+    assert mva[2] == '0-0X1'
+    assert mva[3] == '0X0X1'
+    assert mva[4] == '10010'
+    assert mva[5] == '11110'
+    assert mva[6] == '1-X10'
+    assert mva[7] == '1XX10'
+    assert mva[8] == '-00XX'
+    assert mva[9] == '-1X1X'
+    assert mva[10] == '--XXX'
+    assert mva[11] == '-XXXX'
+    assert mva[12] == 'X00XX'
+    assert mva[13] == 'X1X1X'
+    assert mva[14] == 'X-XXX'
+    assert mva[15] == 'XXXXX'
 
     
-def test_vd3():
+def test_8v():
     c = bench.parse('input(x, y) output(a, o, n, xo) a=and(x,y) o=or(x,y) n=not(x) xo=xor(x,y)')
-    s = LogicSim(c, 64, 3)
+    s = LogicSim(c, 64, m=8)
     assert len(s.interface) == 6
-    p = PackedVectors(64, len(s.interface), 3)
-    p[0] = '000010'
-    p[1] = '010111'
-    p[2] = '0-0X1X'
-    p[3] = '0X0X1X'
-    p[4] = '0R0R1R'
-    p[5] = '0F0F1F'
-    p[6] = '0P0P1P'
-    p[7] = '0N0N1N'
-    p[8] = '100101'
-    p[9] = '111100'
-    p[10] = '1-X10X'
-    p[11] = '1XX10X'
-    p[12] = '1RR10F'
-    p[13] = '1FF10R'
-    p[14] = '1PP10N'
-    p[15] = '1NN10P'
-    p[16] = '-00XXX'
-    p[17] = '-1X1XX'
-    p[18] = '--XXXX'
-    p[19] = '-XXXXX'
-    p[20] = '-RXXXX'
-    p[21] = '-FXXXX'
-    p[22] = '-PXXXX'
-    p[23] = '-NXXXX'
-    p[24] = 'X00XXX'
-    p[25] = 'X1X1XX'
-    p[26] = 'X-XXXX'
-    p[27] = 'XXXXXX'
-    p[28] = 'XRXXXX'
-    p[29] = 'XFXXXX'
-    p[30] = 'XPXXXX'
-    p[31] = 'XNXXXX'
-    p[32] = 'R00RFR'
-    p[33] = 'R1R1FF'
-    p[34] = 'R-XXFX'
-    p[35] = 'RXXXFX'
-    p[36] = 'RRRRFP'
-    p[37] = 'RFPNFN'
-    p[38] = 'RPPRFR'
-    p[39] = 'RNRNFF'
-    p[40] = 'F00FRF'
-    p[41] = 'F1F1RR'
-    p[42] = 'F-XXRX'
-    p[43] = 'FXXXRX'
-    p[44] = 'FRPNRN'
-    p[45] = 'FFFFRP'
-    p[46] = 'FPPFRF'
-    p[47] = 'FNFNRR'
-    p[48] = 'P00PNP'
-    p[49] = 'P1P1NN'
-    p[50] = 'P-XXNX'
-    p[51] = 'PXXXNX'
-    p[52] = 'PRPRNR'
-    p[53] = 'PFPFNF'
-    p[54] = 'PPPPNP'
-    p[55] = 'PNPNNN'
-    p[56] = 'N00NPN'
-    p[57] = 'N1N1PP'
-    p[58] = 'N-XXPX'
-    p[59] = 'NXXXPX'
-    p[60] = 'NRRNPF'
-    p[61] = 'NFFNPR'
-    p[62] = 'NPPNPN'
-    p[63] = 'NNNNPP'
-    expect = p.copy()
-    s.assign(p)
+    mva = MVArray(['000010', '010111', '0-0X1X', '0X0X1X', '0R0R1R', '0F0F1F', '0P0P1P', '0N0N1N',
+                   '100101', '111100', '1-X10X', '1XX10X', '1RR10F', '1FF10R', '1PP10N', '1NN10P',
+                   '-00XXX', '-1X1XX', '--XXXX', '-XXXXX', '-RXXXX', '-FXXXX', '-PXXXX', '-NXXXX',
+                   'X00XXX', 'X1X1XX', 'X-XXXX', 'XXXXXX', 'XRXXXX', 'XFXXXX', 'XPXXXX', 'XNXXXX',
+                   'R00RFR', 'R1R1FF', 'R-XXFX', 'RXXXFX', 'RRRRFP', 'RFPNFN', 'RPPRFR', 'RNRNFF',
+                   'F00FRF', 'F1F1RR', 'F-XXRX', 'FXXXRX', 'FRPNRN', 'FFFFRP', 'FPPFRF', 'FNFNRR',
+                   'P00PNP', 'P1P1NN', 'P-XXNX', 'PXXXNX', 'PRPRNR', 'PFPFNF', 'PPPPNP', 'PNPNNN',
+                   'N00NPN', 'N1N1PP', 'N-XXPX', 'NXXXPX', 'NRRNPF', 'NFFNPR', 'NPPNPN', 'NNNNPP'], m=8)
+    bpa = BPArray(mva)
+    s.assign(bpa)
     s.propagate()
-    s.capture(p)
+    resp_bp = BPArray(bpa)
+    s.capture(resp_bp)
+    resp = MVArray(resp_bp)
+
     for i in range(64):
-        assert p[i] == expect[i]
+        assert resp[i] == mva[i]
         
 
 def test_b01(mydir):
-    c = bench.parse(mydir / 'b01.bench')
+    c = bench.load(mydir / 'b01.bench')
 
     # 2-valued
-    s = LogicSim(c, 8)
+    s = LogicSim(c, 8, m=2)
     assert len(s.interface) == 9
-    t = PackedVectors(8, len(s.interface))
-    t.randomize()
-    s.assign(t)
+    mva = MVArray((len(s.interface), 8), m=2)
+    # mva.randomize()
+    bpa = BPArray(mva)
+    s.assign(bpa)
     s.propagate()
-    s.capture(t)
+    s.capture(bpa)
 
     # 8-valued
-    s = LogicSim(c, 8, 3)
-    t = PackedVectors(8, len(s.interface), 3)
-    t.randomize()
-    s.assign(t)
+    s = LogicSim(c, 8, m=8)
+    mva = MVArray((len(s.interface), 8), m=8)
+    # mva.randomize()
+    bpa = BPArray(mva)
+    s.assign(bpa)
     s.propagate()
-    s.capture(t)
+    s.capture(bpa)
diff --git a/tests/test_packed_vectors.py b/tests/test_packed_vectors.py
deleted file mode 100644
index 2f2a4a0..0000000
--- a/tests/test_packed_vectors.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from kyupy.packed_vectors import PackedVectors
-
-
-def test_basic():
-    ba = PackedVectors(8, 1, 1)
-    assert '0\n0\n0\n0\n0\n0\n0\n0' == str(ba)
-    ba.set_value(0, 0, 1)
-    ba.set_value(1, 0, 'H')
-    ba.set_value(2, 0, 'h')
-    ba.set_value(3, 0, True)
-    ba.set_value(4, 0, 0)
-    ba.set_value(5, 0, 'L')
-    ba.set_value(6, 0, 'l')
-    ba.set_value(7, 0, False)
-    assert '1\n1\n1\n1\n0\n0\n0\n0' == str(ba)
-    ba.set_value(1, 0, '0')
-    ba.set_value(5, 0, '1')
-    assert '1\n0\n1\n1\n0\n1\n0\n0' == str(ba)
-    ba = PackedVectors(8, 1, 2)
-    assert '-\n-\n-\n-\n-\n-\n-\n-' == str(ba)
-    ba.set_value(0, 0, 1)
-    ba.set_value(7, 0, 0)
-    ba.set_value(4, 0, 'X')
-    assert '1\n-\n-\n-\nX\n-\n-\n0' == str(ba)
-    ba.set_value(4, 0, '-')
-    assert '1\n-\n-\n-\n-\n-\n-\n0' == str(ba)
-    ba = PackedVectors(8, 2, 2)
-    assert '--\n--\n--\n--\n--\n--\n--\n--' == str(ba)
-    ba.set_value(0, 0, '1')
-    ba.set_value(7, 1, '0')
-    ba.set_values(1, 'XX')
-    assert '1-\nXX\n--\n--\n--\n--\n--\n-0' == str(ba)
-
-
-def test_8v():
-    ba = PackedVectors(1, 8, 3)
-    assert '--------' == str(ba)
-    ba.set_values(0, r'-x01^v\/')
-    assert r'-X01PNFR' == str(ba)
-    ba.set_values(0, '-XLHPNFR')
-    assert r'-X01PNFR' == str(ba)
-    ba.set_values(0, '-xlhpnfr')
-    assert r'-X01PNFR' == str(ba)
-    p1 = PackedVectors(1, 8, 1)
-    p2 = PackedVectors(1, 8, 1)
-    p1.set_values(0, '01010101')
-    p2.set_values(0, '00110011')
-    p = PackedVectors.from_pair(p1, p2)
-    assert r'0FR10FR1' == str(p)
-    p1 = PackedVectors(1, 8, 2)
-    p2 = PackedVectors(1, 8, 2)
-    p1.set_values(0, '0101-X-X')
-    p2.set_values(0, '00110011')
-    p = PackedVectors.from_pair(p1, p2)
-    assert r'0FR1----' == str(p)
-    p1.set_values(0, '0101-X-X')
-    p2.set_values(0, '-X-X--XX')
-    p = PackedVectors.from_pair(p1, p2)
-    assert r'--------' == str(p)
-
-
-def test_slicing():
-    lv = PackedVectors(3, 2, 1)
-    assert '00\n00\n00' == str(lv)
-    lv.set_value(1, 0, '1')
-    lv.set_value(1, 1, '1')
-    assert '00' == lv[0]
-    assert '11' == lv[1]
-    assert 3 == len(lv)
-    lv2 = lv[1:3]
-    assert 2 == len(lv2)
-    assert '11' == lv2[0]
-    assert '00' == lv2[1]
-
-
-def test_copy():
-    lv1 = PackedVectors(8, 1, 1)
-    lv1.set_values_for_position(0, '01010101')
-    lv2 = PackedVectors(8, 1, 1)
-    lv2.set_values_for_position(0, '00100101')
-    diff = lv1.diff(lv2)
-    lv3 = lv1.copy(selection_mask=diff)
-    assert str(lv3) == '1\n0\n1'
-    lv4 = lv1.copy(selection_mask=~diff)
-    assert str(lv4) == '0\n0\n1\n0\n1'
-    lv5 = lv3 + lv4
-    assert str(lv5) == '1\n0\n1\n0\n0\n1\n0\n1'
-
diff --git a/tests/test_sdf.py b/tests/test_sdf.py
index 61932fa..8b30b68 100644
--- a/tests/test_sdf.py
+++ b/tests/test_sdf.py
@@ -74,13 +74,13 @@ def test_parse():
 
 
 def test_b14(mydir):
-    df = sdf.parse(mydir / 'b14.sdf.gz')
+    df = sdf.load(mydir / 'b14.sdf.gz')
     assert df.name == 'b14'
 
 
 def test_gates(mydir):
-    c = verilog.parse(mydir / 'gates.v')
-    df = sdf.parse(mydir / 'gates.sdf')
+    c = verilog.load(mydir / 'gates.v')
+    df = sdf.load(mydir / 'gates.sdf')
     lt = df.annotation(c, pin_index, dataset=1)
     nand_a = c.cells['nandgate'].ins[0]
     nand_b = c.cells['nandgate'].ins[1]
diff --git a/tests/test_stil.py b/tests/test_stil.py
index 6747a42..1f0d89b 100644
--- a/tests/test_stil.py
+++ b/tests/test_stil.py
@@ -2,7 +2,7 @@ from kyupy import stil
 
 
 def test_b14(mydir):
-    s = stil.parse(mydir / 'b14.stuck.stil.gz')
+    s = stil.load(mydir / 'b14.stuck.stil.gz')
     assert 10 == len(s.signal_groups)
     assert 1 == len(s.scan_chains)
     assert 2163 == len(s.calls)
diff --git a/tests/test_verilog.py b/tests/test_verilog.py
index 1a4aef9..366032a 100644
--- a/tests/test_verilog.py
+++ b/tests/test_verilog.py
@@ -5,5 +5,4 @@ def test_b01(mydir):
     with open(mydir / 'b01.v', 'r') as f:
         modules = verilog.parse(f.read())
     assert modules is not None
-    assert verilog.parse(mydir / 'b01.v') is not None
-
+    assert verilog.load(mydir / 'b01.v') is not None
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
index 1fdf793..bea26d3 100644
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -1,11 +1,10 @@
 import numpy as np
-from kyupy.wave_sim import WaveSim, wave_eval, TMIN, TMAX
+
+from kyupy.wave_sim import WaveSim, WaveSimCuda, wave_eval, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
-from kyupy import verilog
-from kyupy import sdf
+from kyupy import verilog, sdf, logic
 from kyupy.saed import pin_index
-from kyupy.packed_vectors import PackedVectors
-from kyupy.wave_sim_cuda import WaveSimCuda
+from kyupy.logic import MVArray, BPArray
 
 
 def test_wave_eval():
@@ -96,24 +95,29 @@ def test_wave_eval():
 
 
 def compare_to_logic_sim(wsim):
-    tests = PackedVectors(wsim.sims, len(wsim.interface), 3)
-    tests.randomize()
-    wsim.assign(tests)
-    wsim.propagate(8)
+    tests = MVArray((len(wsim.interface), wsim.sims))
+    choices = np.asarray([logic.ZERO, logic.ONE, logic.RISE, logic.FALL], dtype=np.uint8)
+    rng = np.random.default_rng(10)
+    tests.data[...] = rng.choice(choices, tests.data.shape)
+    tests_bp = BPArray(tests)
+    wsim.assign(tests_bp)
+    wsim.propagate()
     cdata = wsim.capture()
 
-    resp = tests.copy()
+    resp = MVArray(tests)
 
     for iidx, inode in enumerate(wsim.interface):
         if len(inode.ins) > 0:
             for vidx in range(wsim.sims):
-                resp.set_value(vidx, iidx, 0 if cdata[iidx, vidx, 0] < 0.5 else 1)
+                resp.data[iidx, vidx] = logic.ZERO if cdata[iidx, vidx, 0] < 0.5 else logic.ONE
+                # resp.set_value(vidx, iidx, 0 if cdata[iidx, vidx, 0] < 0.5 else 1)
 
-    lsim = LogicSim(wsim.circuit, len(tests), 3)
-    lsim.assign(tests)
+    lsim = LogicSim(wsim.circuit, len(tests_bp))
+    lsim.assign(tests_bp)
     lsim.propagate()
-    exp = tests.copy()
-    lsim.capture(exp)
+    exp_bp = BPArray(tests_bp)
+    lsim.capture(exp_bp)
+    exp = MVArray(exp_bp)
     
     for i in range(8):
         exp_str = exp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
@@ -122,24 +126,24 @@ def compare_to_logic_sim(wsim):
 
 
 def test_b14(mydir):
-    c = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.parse(mydir / 'b14.sdf.gz')
+    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
+    df = sdf.load(mydir / 'b14.sdf.gz')
     lt = df.annotation(c, pin_index)
     wsim = WaveSim(c, lt, 8)
     compare_to_logic_sim(wsim)
 
 
 def test_b14_strip_forks(mydir):
-    c = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.parse(mydir / 'b14.sdf.gz')
+    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
+    df = sdf.load(mydir / 'b14.sdf.gz')
     lt = df.annotation(c, pin_index)
     wsim = WaveSim(c, lt, 8, strip_forks=True)
     compare_to_logic_sim(wsim)
 
 
 def test_b14_cuda(mydir):
-    c = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.parse(mydir / 'b14.sdf.gz')
+    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
+    df = sdf.load(mydir / 'b14.sdf.gz')
     lt = df.annotation(c, pin_index)
     wsim = WaveSimCuda(c, lt, 8)
     compare_to_logic_sim(wsim)

From c9445f2d79d81f404688c68f71fb63802ca64f31 Mon Sep 17 00:00:00 2001
From: Stefan Holst <mail@s-holst.de>
Date: Sat, 16 Jan 2021 14:48:27 +0900
Subject: [PATCH 2/2] Docs, __index__, fault injection and TechLib

- Documentation improvements
- Node and Line objects now provide __index__
- LogicSim cleanup and improvements (inject_cb, cycle, ...)
- Introduce TechLib class to organize tech-specific info
- More human-readable output
- De-linting
---
 Demo.ipynb              |  24 ++--
 LICENSE.txt             |   2 +-
 docs/conf.py            |   4 +-
 docs/index.rst          |   1 +
 docs/miscellaneous.rst  |  10 ++
 docs/simulators.rst     |   3 +
 setup.py                |   3 +-
 src/kyupy/__init__.py   | 141 +++++++++++++++----
 src/kyupy/bench.py      |  14 +-
 src/kyupy/circuit.py    |  41 +++---
 src/kyupy/logic.py      | 244 ++++++++++++++++++++------------
 src/kyupy/logic_sim.py  | 292 +++++++++++++++++---------------------
 src/kyupy/saed.py       | 289 --------------------------------------
 src/kyupy/sdf.py        |  76 +++++-----
 src/kyupy/stil.py       |  36 ++---
 src/kyupy/techlib.py    | 301 ++++++++++++++++++++++++++++++++++++++++
 src/kyupy/verilog.py    |  52 ++++---
 src/kyupy/wave_sim.py   | 157 ++++++++++++++++-----
 tests/test_bench.py     |   4 +-
 tests/test_logic_sim.py |   4 +-
 tests/test_sdf.py       |  19 ++-
 tests/test_stil.py      |   7 +-
 tests/test_wave_sim.py  |  49 ++++---
 23 files changed, 1002 insertions(+), 771 deletions(-)
 create mode 100644 docs/miscellaneous.rst
 delete mode 100644 src/kyupy/saed.py
 create mode 100644 src/kyupy/techlib.py

diff --git a/Demo.ipynb b/Demo.ipynb
index 288f1bd..805c60c 100644
--- a/Demo.ipynb
+++ b/Demo.ipynb
@@ -44,7 +44,7 @@
     {
      "data": {
       "text/plain": [
-       "<Circuit 'tests/b01.bench' with 92 nodes, 130 lines, 4 ports>"
+       "<Circuit tests/b01.bench cells=45 forks=47 lines=130 ports=4>"
       ]
      },
      "execution_count": 2,
@@ -64,7 +64,7 @@
     {
      "data": {
       "text/plain": [
-       "<Circuit with 10 nodes, 8 lines, 5 ports>"
+       "<Circuit cells=4 forks=6 lines=8 ports=5>"
       ]
      },
      "execution_count": 3,
@@ -362,7 +362,7 @@
     {
      "data": {
       "text/plain": [
-       "<Circuit 'b14' with 31715 nodes, 46891 lines, 91 ports>"
+       "<Circuit b14 cells=15873 forks=15842 lines=46891 ports=91>"
       ]
      },
      "execution_count": 13,
@@ -445,7 +445,7 @@
     {
      "data": {
       "text/plain": [
-       "<Circuit 'b14' with 31715 nodes, 46891 lines, 91 ports>"
+       "<Circuit b14 cells=15873 forks=15842 lines=46891 ports=91>"
       ]
      },
      "execution_count": 15,
@@ -489,11 +489,11 @@
     "\n",
     "for cell in b14.topological_order():\n",
     "    if 'DFF' in cell.kind or 'input' == cell.kind:\n",
-    "        levels[cell.index] = 0\n",
+    "        levels[cell] = 0\n",
     "    elif '__fork__' == cell.kind:\n",
-    "        levels[cell.index] = levels[cell.ins[0].driver.index]  # forks only have exactly one driver\n",
+    "        levels[cell] = levels[cell.ins[0].driver]  # forks only have exactly one driver\n",
     "    else:\n",
-    "        levels[cell.index] = max([levels[line.driver.index] for line in cell.ins]) + 1\n",
+    "        levels[cell] = max([levels[line.driver] for line in cell.ins]) + 1\n",
     "        \n",
     "print(f'Maximum logic depth: {np.max(levels)}')"
    ]
@@ -591,7 +591,7 @@
     {
      "data": {
       "text/plain": [
-       "<MVArray length=1081 width=306 m=8 nbytes=330786>"
+       "<MVArray length=1081 width=306 m=8 mem=323.0kiB>"
       ]
      },
      "execution_count": 19,
@@ -697,7 +697,7 @@
     {
      "data": {
       "text/plain": [
-       "<BPArray length=1081 width=306 m=8 bytes=124848>"
+       "<BPArray length=1081 width=306 m=8 mem=121.9kiB>"
       ]
      },
      "execution_count": 23,
@@ -829,7 +829,7 @@
     {
      "data": {
       "text/plain": [
-       "<MVArray length=1392 width=306 m=8 nbytes=425952>"
+       "<MVArray length=1392 width=306 m=8 mem=416.0kiB>"
       ]
      },
      "execution_count": 29,
@@ -962,10 +962,9 @@
    "outputs": [],
    "source": [
     "from kyupy import sdf\n",
-    "from kyupy.saed import pin_index\n",
     "\n",
     "df = sdf.load('tests/b14.sdf.gz')\n",
-    "lt = df.annotation(b14, pin_index, dataset=0, interconnect=False)"
+    "lt = df.annotation(b14, dataset=0, interconnect=False)"
    ]
   },
   {
@@ -1118,6 +1117,7 @@
    "metadata": {},
    "source": [
     "The capture data contains for each PI, PO, and scan flip-flop (axis 0), and each test (axis 1) seven values:\n",
+    "\n",
     "0. Probability of capturing a 1 at the given capture time (same as next value, if no standard deviation given).\n",
     "1. A capture value decided by random sampling according to above probability.\n",
     "2. The final value (assume a very late capture time).\n",
diff --git a/LICENSE.txt b/LICENSE.txt
index 1e4a002..293fa79 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 s-holst
+Copyright (c) 2020-2021 Stefan Holst
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/conf.py b/docs/conf.py
index cb2e436..540783b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -20,11 +20,11 @@ sys.path.insert(0, os.path.abspath('../src'))
 # -- Project information -----------------------------------------------------
 
 project = 'KyuPy'
-copyright = '2020, Stefan Holst'
+copyright = '2020-2021, Stefan Holst'
 author = 'Stefan Holst'
 
 # The full version, including alpha/beta/rc tags
-release = '0.0.2'
+release = '0.0.3'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/index.rst b/docs/index.rst
index 3caa343..3fc74e8 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,4 +9,5 @@ API Reference
    datastructures
    parsers
    simulators
+   miscellaneous
 
diff --git a/docs/miscellaneous.rst b/docs/miscellaneous.rst
new file mode 100644
index 0000000..fff469f
--- /dev/null
+++ b/docs/miscellaneous.rst
@@ -0,0 +1,10 @@
+Miscellaneous
+=============
+
+.. automodule:: kyupy
+   :members:
+
+.. automodule:: kyupy.techlib
+   :members:
+
+
diff --git a/docs/simulators.rst b/docs/simulators.rst
index 8d5f6b6..bcc0ea4 100644
--- a/docs/simulators.rst
+++ b/docs/simulators.rst
@@ -4,6 +4,8 @@ Simulators
 Logic Simulation - :mod:`kyupy.logic_sim`
 -----------------------------------------
 
+.. automodule:: kyupy.logic_sim
+
 .. autoclass:: kyupy.logic_sim.LogicSim
    :members:
 
@@ -12,6 +14,7 @@ Timing Simulation - :mod:`kyupy.wave_sim`
 -----------------------------------------
 
 .. automodule:: kyupy.wave_sim
+   :members: TMAX, TMAX_OVL, TMIN
 
 .. autoclass:: kyupy.wave_sim.WaveSim
    :members:
diff --git a/setup.py b/setup.py
index 9a0bb1b..efb49ec 100644
--- a/setup.py
+++ b/setup.py
@@ -5,9 +5,10 @@ with open('README.rst', 'r') as f:
 
 setup(
     name='kyupy',
-    version='0.0.2',
+    version='0.0.3',
     description='High-performance processing and analysis of non-hierarchical VLSI designs',
     long_description=long_description,
+    long_description_content_type='text/x-rst',
     packages=find_packages(where='src'),
     package_dir={'': 'src'},
     url='https://github.com/s-holst/kyupy',
diff --git a/src/kyupy/__init__.py b/src/kyupy/__init__.py
index d1bb8db..8bbfc9f 100644
--- a/src/kyupy/__init__.py
+++ b/src/kyupy/__init__.py
@@ -1,6 +1,8 @@
 """A package for processing and analysis of non-hierarchical gate-level VLSI designs.
 
-It contains fundamental building blocks for research software in the fields of VLSI test, diagnosis and reliability.
+The kyupy package itself contains a logger and other simple utility functions.
+In addition, it defines ``numba`` and ``cuda`` objects that point to the actual packages
+if they are available and otherwise point to mocks.
 """
 
 import time
@@ -10,10 +12,78 @@ import gzip
 import numpy as np
 
 
+_pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
+
+
+def popcount(a):
+    """Returns the number of 1-bits in a given packed numpy array."""
+    return np.sum(_pop_count_lut[a])
+
+
+def readtext(file):
+    """Reads and returns the text in a given file. Transparently decompresses \\*.gz files."""
+    if hasattr(file, 'read'):
+        return file.read()
+    if str(file).endswith('.gz'):
+        with gzip.open(file, 'rt') as f:
+            return f.read()
+    else:
+        with open(file, 'rt') as f:
+            return f.read()
+
+
+def hr_sci(value):
+    """Formats a value in human-readable scientific notation; zero and non-finite values get the blank prefix."""
+    multiplier = 0
+    while 1000 <= abs(value) < float('inf') and multiplier < 6:  # inf never shrinks; 'E' is the largest prefix
+        value /= 1000
+        multiplier += 1
+    while 0 < abs(value) < 1 and multiplier > -6:  # zero never grows; 'a' is the smallest prefix
+        value *= 1000
+        multiplier -= 1
+    return f'{value:.3f}{" kMGTPEafpnµm"[multiplier]}'
+
+
+def hr_bytes(nbytes):
+    """Formats a given number of bytes for human readability using binary prefixes up to ``Pi``."""
+    multiplier = 0
+    while abs(nbytes) >= 1000 and multiplier < 5:  # stop at 'Pi', the last entry of the prefix list below
+        nbytes /= 1024
+        multiplier += 1
+    return f'{nbytes:.1f}{["", "ki", "Mi", "Gi", "Ti", "Pi"][multiplier]}B'
+
+
+def hr_time(seconds):
+    """Formats a given time interval in seconds as a compact string such as ``1d2h``, ``3h4m``, or ``5m6s``."""
+    s = ''
+    if seconds >= 86400:  # whole days
+        d = seconds // 86400
+        seconds -= d * 86400
+        s += f'{int(d)}d'
+    if seconds >= 3600:  # whole hours
+        h = seconds // 3600
+        seconds -= h * 3600
+        s += f'{int(h)}h'
+    if seconds >= 60:  # whole minutes
+        m = seconds // 60
+        seconds -= m * 60
+        if 'd' not in s:  # minutes are dropped once days are shown
+            s += f'{int(m)}m'
+    if 'h' not in s and 'd' not in s:  # seconds are only shown for intervals without days or hours
+        s += f'{int(seconds)}s'
+    return s
+
+
 class Log:
+    """A very simple logger that formats the messages with the number of seconds since
+    program start.
+    """
     def __init__(self):
         self.start = time.perf_counter()
         self.logfile = None
+        """When set to a file handle, log messages are written to it instead of to standard output.
+        After each write, ``flush()`` is called as well.
+        """
 
     def log(self, level, message):
         t = time.perf_counter() - self.start
@@ -23,15 +93,45 @@ class Log:
             self.logfile.write(f'{t:011.3f} {level} {message}\n')
             self.logfile.flush()
 
-    def info(self, message): self.log('-', message)
-
-    def warn(self, message): self.log('W', message)
-
-    def error(self, message): self.log('E', message)
+    def info(self, message):
+        """Log an informational message."""
+        self.log('-', message)
+
+    def warn(self, message):
+        """Log a warning message."""
+        self.log('W', message)
+
+    def error(self, message):
+        """Log an error message."""
+        self.log('E', message)
+
+    def range(self, *args):
+        """A generator that operates just like the ``range()`` built-in, and also occasionally logs the progress
+        and compute time estimates."""
+        elems = len(range(*args))  # total number of iterations, used for the completion ratio
+        start_time = time.perf_counter()
+        lastlog_time = start_time
+        log_interval = 5  # seconds until the first progress message
+        for elem, i in enumerate(range(*args)):
+            yield i
+            current_time = time.perf_counter()  # taken after the consumer has processed ``i``
+            if current_time > lastlog_time + log_interval:
+                done = (elem + 1) / elems
+                elapsed_time = current_time - start_time
+                total_time = elapsed_time / done  # linear extrapolation from the progress so far
+                rem_time = total_time - elapsed_time
+                self.log(':', f'{done*100:.0f}% done {hr_time(elapsed_time)} elapsed {hr_time(rem_time)} remaining')
+                log_interval = min(600, int(log_interval*1.5))  # back off, capped at 600 seconds between messages
+                lastlog_time = current_time
 
 
 log = Log()
+"""The standard logger instance."""
+
 
+#
+# Code below mocks basic numba and cuda functions for pure-python fallback.
+#
 
 class MockNumba:
     @staticmethod
@@ -52,17 +152,15 @@ class MockCuda:
         outer = self
 
         def make_launcher(func):
-            class Launcher(object):
+            class Launcher:
                 def __init__(self, funcc):
                     self.func = funcc
 
                 def __call__(self, *args, **kwargs):
-                    # print(f'device func call {self.func.__name__}')
                     return self.func(*args, **kwargs)
 
                 def __getitem__(self, item):
                     grid_dim, block_dim = item
-                    # print(f'kernel call {self.func.__name__} grid_dim:{grid_dim} block_dim:{block_dim}')
 
                     def inner(*args, **kwargs):
                         for grid_x in range(grid_dim[0]):
@@ -104,23 +202,12 @@ if importlib.util.find_spec('numba') is not None:
         cuda = MockCuda()
 else:
     numba = MockNumba()
+    """If Numba is available on the system, it is the actual ``numba`` package.
+    Otherwise, it simply defines an ``njit`` decorator that does nothing.
+    """
     cuda = MockCuda()
+    """If Numba is installed and Cuda GPUs are available, it is the actual ``numba.cuda`` package.
+    Otherwise, it is an object that defines basic methods and decorators so that cuda-code can still
+    run in the Python interpreter.
+    """
     log.warn('Numba unavailable. Falling back to pure Python.')
-
-
-_pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
-
-
-def popcount(a):
-    return np.sum(_pop_count_lut[a])
-
-
-def readtext(file):
-    if hasattr(file, 'read'):
-        return file.read()
-    if str(file).endswith('.gz'):
-        with gzip.open(file, 'rt') as f:
-            return f.read()
-    else:
-        with open(file, 'rt') as f:
-            return f.read()
diff --git a/src/kyupy/bench.py b/src/kyupy/bench.py
index 7ec1e1e..21310d5 100644
--- a/src/kyupy/bench.py
+++ b/src/kyupy/bench.py
@@ -14,25 +14,25 @@ from . import readtext
 
 
 class BenchTransformer(Transformer):
-    
+
     def __init__(self, name):
         super().__init__()
         self.c = Circuit(name)
-    
+
     def start(self, _): return self.c
-        
+
     def parameters(self, args): return [self.c.get_or_add_fork(name) for name in args]
-        
+
     def interface(self, args): self.c.interface.extend(args[0])
 
     def assignment(self, args):
         name, cell_type, drivers = args
         cell = Node(self.c, str(name), str(cell_type))
         Line(self.c, cell, self.c.get_or_add_fork(str(name)))
-        [Line(self.c, d, cell) for d in drivers]
+        for d in drivers: Line(self.c, d, cell)
 
 
-grammar = r"""
+GRAMMAR = r"""
     start: (statement)*
     statement: input | output | assignment
     input: ("INPUT" | "input") parameters -> interface
@@ -51,7 +51,7 @@ def parse(text, name=None):
     :param name: The name of the circuit. Circuit names are not included in bench descriptions.
     :return: A :class:`Circuit` object.
     """
-    return Lark(grammar, parser="lalr", transformer=BenchTransformer(name)).parse(text)
+    return Lark(GRAMMAR, parser="lalr", transformer=BenchTransformer(name)).parse(text)
 
 
 def load(file, name=None):
diff --git a/src/kyupy/circuit.py b/src/kyupy/circuit.py
index 84cc96c..5801f0c 100644
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@@ -53,7 +53,7 @@ class Node:
         """
         self.kind = kind
         """A string describing the type of the node.
-        
+
         Common types are the names from a standard cell library or general gate names like 'AND' or 'NOR'.
         If :py:attr:`kind` is set to '__fork__', it receives special treatment.
         A `fork` describes a named signal or a fan-out point in the circuit and not a physical `cell` like a gate.
@@ -75,6 +75,9 @@ class Node:
         """A list of output connections (:class:`Line` objects).
         """
 
+    def __index__(self):
+        return self.index  # lets a node be used directly as a list index, e.g. ``visit_count[succ]``
+
     def __repr__(self):
         ins = ' '.join([f'<{line.index}' if line is not None else '<None' for line in self.ins])
         outs = ' '.join([f'>{line.index}' if line is not None else '>None' for line in self.outs])
@@ -130,7 +133,7 @@ class Line:
         """
         self.driver_pin = driver[1]
         """The output pin position of the driver node this line is connected to.
-        
+
         This is the position in the outs-list of the driving node this line referenced from:
         :code:`self.driver.outs[self.driver_pin] == self`.
         """
@@ -160,6 +163,9 @@ class Line:
         self.reader = None
         self.circuit = None
 
+    def __index__(self):
+        return self.index  # lets a line be used directly wherever an integer index is expected
+
     def __repr__(self):
         return f'{self.index}'
 
@@ -187,17 +193,17 @@ class Circuit:
         """
         self.nodes = IndexList()
         """A list of all :class:`Node` objects contained in the circuit.
-        
+
         The position of a node in this list equals its index :code:`self.nodes[42].index == 42`.
         """
         self.lines = IndexList()
         """A list of all :class:`Line` objects contained in the circuit.
-        
+
         The position of a line in this list equals its index :code:`self.lines[42].index == 42`.
         """
         self.interface = GrowingList()
         """A list of nodes that are designated as primary input- or output-ports.
-        
+
         Port-nodes are contained in :py:attr:`nodes` as well as :py:attr:`interface`.
         The position of a node in the interface list corresponds to positions of logic values in test vectors.
         The port direction is not stored explicitly.
@@ -213,7 +219,7 @@ class Circuit:
 
     def get_or_add_fork(self, name):
         return self.forks[name] if name in self.forks else Node(self, name)
-    
+
     def copy(self):
         """Returns a deep copy of the circuit.
         """
@@ -231,7 +237,7 @@ class Circuit:
                 n = c.cells[node.name]
             c.interface.append(n)
         return c
-    
+
     def dump(self):
         """Returns a string representation of the circuit and all its nodes.
         """
@@ -239,8 +245,9 @@ class Circuit:
         return header + '\n'.join([str(n) for n in self.nodes])
 
     def __repr__(self):
-        name = f" '{self.name}'" if self.name else ''
-        return f'<Circuit{name} with {len(self.nodes)} nodes, {len(self.lines)} lines, {len(self.interface)} ports>'
+        name = f' {self.name}' if self.name else ''
+        return f'<Circuit{name} cells={len(self.cells)} forks={len(self.forks)} ' + \
+               f'lines={len(self.lines)} ports={len(self.interface)}>'
 
     def topological_order(self):
         """Generator function to iterate over all nodes in topological order.
@@ -255,8 +262,8 @@ class Circuit:
             for line in n.outs:
                 if line is None: continue
                 succ = line.reader
-                visit_count[succ.index] += 1
-                if visit_count[succ.index] == len(succ.ins) and 'DFF' not in succ.kind:
+                visit_count[succ] += 1
+                if visit_count[succ] == len(succ.ins) and 'DFF' not in succ.kind:
                     queue.append(succ)
             yield n
 
@@ -280,8 +287,8 @@ class Circuit:
             n = queue.popleft()
             for line in n.ins:
                 pred = line.driver
-                visit_count[pred.index] += 1
-                if visit_count[pred.index] == len(pred.outs) and 'DFF' not in pred.kind:
+                visit_count[pred] += 1
+                if visit_count[pred] == len(pred.outs) and 'DFF' not in pred.kind:
                     queue.append(pred)
             yield n
 
@@ -292,13 +299,13 @@ class Circuit:
         """
         marks = [False] * len(self.nodes)
         for n in origin_nodes:
-            marks[n.index] = True
+            marks[n] = True
         for n in self.reversed_topological_order():
-            if not marks[n.index]:
+            if not marks[n]:
                 for line in n.outs:
                     if line is not None:
-                        marks[n.index] |= marks[line.reader.index]
-            if marks[n.index]:
+                        marks[n] |= marks[line.reader]
+            if marks[n]:
                 yield n
 
     def fanout_free_regions(self):
diff --git a/src/kyupy/logic.py b/src/kyupy/logic.py
index d30fd55..7b0c149 100644
--- a/src/kyupy/logic.py
+++ b/src/kyupy/logic.py
@@ -25,7 +25,7 @@ from collections.abc import Iterable
 
 import numpy as np
 
-from . import numba
+from . import numba, hr_bytes
 
 
 ZERO = 0b000
@@ -58,6 +58,12 @@ on a signal. ``'N'``, ``'n'``, and ``'v'`` are interpreted as ``NPULSE``.
 
 
 def interpret(value):
+    """Converts characters, strings, and lists of them to lists of logic constants defined above.
+
+    :param value: A character (string of length 1), Boolean, Integer, None, or Iterable.
+        Iterables (such as strings) are traversed and their individual characters are interpreted.
+    :return: A logic constant or a (possibly multi-dimensional) list of logic constants.
+    """
     if isinstance(value, Iterable) and not (isinstance(value, str) and len(value) == 1):
         return list(map(interpret, value))
     if value in [0, '0', False, 'L', 'l']:
@@ -85,6 +91,79 @@ def bit_in(a, pos):
     return a[pos >> 3] & _bit_in_lut[pos & 7]
 
 
+class MVArray:
+    """An n-dimensional array of m-valued logic values.
+
+    This class wraps a numpy.ndarray of type uint8 and adds support for encoding and
+    interpreting 2-valued, 4-valued, and 8-valued logic values.
+    Each logic value is stored as an uint8, manipulations of individual values are cheaper than in
+    :py:class:`BPArray`.
+
+    :param a: If a tuple is given, it is interpreted as desired shape. To make an array of ``n`` vectors
+        compatible with a simulator ``sim``, use ``(len(sim.interface), n)``. If a :py:class:`BPArray` or
+        :py:class:`MVArray` is given, a deep copy is made. If a string, a list of strings, a list of characters,
+        or a list of lists of characters are given, the data is interpreted best-effort and the array is
+        initialized accordingly.
+    :param m: The arity of the logic. Can be set to 2, 4, or 8. If None is given, the arity of a given
+        :py:class:`BPArray` or :py:class:`MVArray` is used, or, if the array is initialized differently, 8 is used.
+    """
+
+    def __init__(self, a, m=None):
+        self.m = m or 8  # default arity; replaced below by ``a.m`` when ``a`` is an array and ``m`` is None
+        assert self.m in [2, 4, 8]
+
+        # Try our best to interpret given a.
+        if isinstance(a, MVArray):
+            self.data = a.data.copy()
+            """The wrapped 2-dimensional ndarray of logic values.
+
+            * Axis 0 is PI/PO/FF position, the length of this axis is called "width".
+            * Axis 1 is vector/pattern, the length of this axis is called "length".
+            """
+            self.m = m or a.m
+        elif hasattr(a, 'data'):  # assume it is a BPArray. Can't use isinstance() because BPArray isn't declared yet.
+            self.data = np.zeros((a.width, a.length), dtype=np.uint8)
+            self.m = m or a.m
+            for i in range(a.data.shape[-2]):  # rebuild each value bit by bit, starting with the last bit plane
+                self.data[...] <<= 1
+                self.data[...] |= np.unpackbits(a.data[..., -i-1, :], axis=1)[:, :a.length]
+            if a.data.shape[-2] == 1:  # only one bit plane: 0 stays 0 and 1 becomes 0b011
+                self.data *= 3
+        elif isinstance(a, int):  # an int gives one vector (length 1) of width ``a``
+            self.data = np.full((a, 1), UNASSIGNED, dtype=np.uint8)
+        elif isinstance(a, tuple):
+            self.data = np.full(a, UNASSIGNED, dtype=np.uint8)
+        else:
+            if isinstance(a, str): a = [a]
+            self.data = np.asarray(interpret(a), dtype=np.uint8)
+            self.data = self.data[:, np.newaxis] if self.data.ndim == 1 else np.moveaxis(self.data, -2, -1)
+
+        # Cast data to m-valued logic.
+        if self.m == 2:
+            self.data[...] = ((self.data & 0b001) & ((self.data >> 1) & 0b001) | (self.data == RISE)) * ONE
+        elif self.m == 4:
+            self.data[...] = (self.data & 0b011) & ((self.data != FALL) * ONE) | ((self.data == RISE) * ONE)
+        elif self.m == 8:
+            self.data[...] = self.data & 0b111  # keep only the three lowest bits of each value
+
+        self.length = self.data.shape[-1]
+        self.width = self.data.shape[-2]
+
+    def __repr__(self):
+        return f'<MVArray length={self.length} width={self.width} m={self.m} mem={hr_bytes(self.data.nbytes)}>'
+
+    def __str__(self):
+        return str([self[idx] for idx in range(self.length)])
+
+    def __getitem__(self, vector_idx):
+        """Returns a string representing the desired vector."""
+        chars = ["0", "X", "-", "1", "P", "R", "F", "N"]  # one character per 3-bit logic value, indexed by value
+        return ''.join(chars[v] for v in self.data[:, vector_idx])
+
+    def __len__(self):
+        return self.length
+
+
 def mv_cast(*args, m=8):
     return [a if isinstance(a, MVArray) else MVArray(a, m=m) for a in args]
 
@@ -100,6 +179,13 @@ def _mv_not(m, out, inp):
 
 
 def mv_not(x1, out=None):
+    """A multi-valued NOT operator.
+
+    :param x1: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param out: Optionally an :py:class:`MVArray` as storage destination. If None, a new :py:class:`MVArray`
+        is returned.
+    :return: An :py:class:`MVArray` with the result.
+    """
     m = mv_getm(x1)
     x1 = mv_cast(x1, m=m)[0]
     out = out or MVArray(x1.data.shape, m=m)
@@ -125,6 +211,14 @@ def _mv_or(m, out, *ins):
 
 
 def mv_or(x1, x2, out=None):
+    """A multi-valued OR operator.
+
+    :param x1: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param x2: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param out: Optionally an :py:class:`MVArray` as storage destination. If None, a new :py:class:`MVArray`
+        is returned.
+    :return: An :py:class:`MVArray` with the result.
+    """
     m = mv_getm(x1, x2)
     x1, x2 = mv_cast(x1, x2, m=m)
     out = out or MVArray(np.broadcast(x1.data, x2.data).shape, m=m)
@@ -151,6 +245,14 @@ def _mv_and(m, out, *ins):
 
 
 def mv_and(x1, x2, out=None):
+    """A multi-valued AND operator.
+
+    :param x1: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param x2: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param out: Optionally an :py:class:`MVArray` as storage destination. If None, a new :py:class:`MVArray`
+        is returned.
+    :return: An :py:class:`MVArray` with the result.
+    """
     m = mv_getm(x1, x2)
     x1, x2 = mv_cast(x1, x2, m=m)
     out = out or MVArray(np.broadcast(x1.data, x2.data).shape, m=m)
@@ -174,6 +276,14 @@ def _mv_xor(m, out, *ins):
 
 
 def mv_xor(x1, x2, out=None):
+    """A multi-valued XOR operator.
+
+    :param x1: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param x2: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param out: Optionally an :py:class:`MVArray` as storage destination. If None, a new :py:class:`MVArray`
+        is returned.
+    :return: An :py:class:`MVArray` with the result.
+    """
     m = mv_getm(x1, x2)
     x1, x2 = mv_cast(x1, x2, m=m)
     out = out or MVArray(np.broadcast(x1.data, x2.data).shape, m=m)
@@ -182,6 +292,16 @@ def mv_xor(x1, x2, out=None):
 
 
 def mv_transition(init, final, out=None):
+    """Computes the logic transitions from the initial values of ``init`` to the final values of ``final``.
+    Pulses in the input data are ignored. If any of the inputs are ``UNKNOWN``, the result is ``UNKNOWN``.
+    If both inputs are ``UNASSIGNED``, the result is ``UNASSIGNED``.
+
+    :param init: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param final: An :py:class:`MVArray` or data the :py:class:`MVArray` constructor accepts.
+    :param out: Optionally an :py:class:`MVArray` as storage destination. If None, a new :py:class:`MVArray`
+        is returned.
+    :return: An :py:class:`MVArray` with the result.
+    """
     m = mv_getm(init, final)
     init, final = mv_cast(init, final, m=m)
     init = init.data
@@ -196,65 +316,46 @@ def mv_transition(init, final, out=None):
     return out
 
 
-class MVArray:
-    """An n-dimensional array of m-valued logic values.
-
-    This class wraps a numpy.ndarray of type uint8 and adds support for encoding and
-    interpreting 2-valued, 4-valued, and 8-valued logic values.
-    Each logic value is stored as an uint8, value manipulations are cheaper than in BPArray.
-
-    An MVArray always has 2 axes:
+class BPArray:
+    """An n-dimensional array of m-valued logic values that uses bit-parallel storage.
 
-    * Axis 0 is PI/PO/FF position, the length of this axis is called "width".
-    * Axis 1 is vector/pattern, the length of this axis is called "length".
+    The primary use of this format is in aiding efficient bit-parallel logic simulation.
+    The secondary benefit over :py:class:`MVArray` is its memory efficiency.
+    Accessing individual values is more expensive than with :py:class:`MVArray`.
+    Therefore it may be more efficient to unpack the data into an :py:class:`MVArray` and pack it again into a
+    :py:class:`BPArray` for simulation.
 
+    See :py:class:`MVArray` for constructor parameters.
     """
 
     def __init__(self, a, m=None):
-        self.m = m or 8
-        assert self.m in [2, 4, 8]
-
-        # Try our best to interpret given a.
+        if not isinstance(a, MVArray) and not isinstance(a, BPArray):
+            a = MVArray(a, m)
+            self.m = a.m
         if isinstance(a, MVArray):
-            self.data = a.data.copy()
-            self.m = m or a.m
-        elif hasattr(a, 'data'):  # assume it is a BPArray. Can't use isinstance() because BPArray isn't declared yet.
-            self.data = np.zeros((a.width, a.length), dtype=np.uint8)
-            self.m = m or a.m
-            for i in range(a.data.shape[-2]):
-                self.data[...] <<= 1
-                self.data[...] |= np.unpackbits(a.data[..., -i-1, :], axis=1)[:, :a.length]
-            if a.data.shape[-2] == 1:
-                self.data *= 3
-        elif isinstance(a, int):
-            self.data = np.full((a, 1), UNASSIGNED, dtype=np.uint8)
-        elif isinstance(a, tuple):
-            self.data = np.full(a, UNASSIGNED, dtype=np.uint8)
-        else:
-            if isinstance(a, str): a = [a]
-            self.data = np.asarray(interpret(a), dtype=np.uint8)
-            self.data = self.data[:, np.newaxis] if self.data.ndim == 1 else np.moveaxis(self.data, -2, -1)
-
-        # Cast data to m-valued logic.
-        if self.m == 2:
-            self.data[...] = ((self.data & 0b001) & ((self.data >> 1) & 0b001) | (self.data == RISE)) * ONE
-        elif self.m == 4:
-            self.data[...] = (self.data & 0b011) & ((self.data != FALL) * ONE) | ((self.data == RISE) * ONE)
-        elif self.m == 8:
-            self.data[...] = self.data & 0b111
+            if m is not None and m != a.m:
+                a = MVArray(a, m)  # cast data
+            self.m = a.m
+            assert self.m in [2, 4, 8]
+            nwords = math.ceil(math.log2(self.m))
+            nbytes = (a.data.shape[-1] - 1) // 8 + 1
+            self.data = np.zeros(a.data.shape[:-1] + (nwords, nbytes), dtype=np.uint8)
+            """The wrapped 3-dimensional ndarray.
 
-        self.length = self.data.shape[-1]
-        self.width = self.data.shape[-2]
+            * Axis 0 is PI/PO/FF position, the length of this axis is called "width".
+            * Axis 1 has length ``ceil(log2(m))`` for storing all bits.
+            * Axis 2 are the vectors/patterns packed into uint8 words.
+            """
+            for i in range(self.data.shape[-2]):
+                self.data[..., i, :] = np.packbits((a.data >> i) & 1, axis=-1)
+        else:  # we have a BPArray
+            self.data = a.data.copy()  # TODO: support conversion to different m
+            self.m = a.m
+        self.length = a.length
+        self.width = a.width
 
     def __repr__(self):
-        return f'<MVArray length={self.length} width={self.width} m={self.m} nbytes={self.data.nbytes}>'
-
-    def __str__(self):
-        return str([self[idx] for idx in range(self.length)])
-
-    def __getitem__(self, vector_idx):
-        chars = ["0", "X", "-", "1", "P", "R", "F", "N"]
-        return ''.join(chars[v] for v in self.data[:, vector_idx])
+        return f'<BPArray length={self.length} width={self.width} m={self.m} mem={hr_bytes(self.data.nbytes)}>'
 
     def __len__(self):
         return self.length
@@ -359,44 +460,3 @@ def bp_xor(out, *ins):
         out[..., 0, :] |= any_unknown
         out[..., 1, :] &= ~any_unknown
         out[..., 2, :] &= ~any_unknown
-
-
-class BPArray:
-    """An n-dimensional array of m-valued logic values that uses bit-parallel storage.
-
-    The primary use of this format is in aiding efficient bit-parallel logic simulation.
-    The secondary benefit over MVArray is its memory efficiency.
-    Accessing individual values is more expensive than with :py:class:`MVArray`.
-    It is advised to first construct a MVArray, pack it into a :py:class:`BPArray` for simulation and unpack the results
-    back into a :py:class:`MVArray` for value access.
-
-    The values along the last axis (vectors/patterns) are packed into uint8 words.
-    The second-last axis has length ceil(log2(m)) for storing all bits.
-    All other axes stay the same as in MVArray.
-    """
-
-    def __init__(self, a, m=None):
-        if not isinstance(a, MVArray) and not isinstance(a, BPArray):
-            a = MVArray(a, m)
-            self.m = a.m
-        if isinstance(a, MVArray):
-            if m is not None and m != a.m:
-                a = MVArray(a, m)  # cast data
-            self.m = a.m
-            assert self.m in [2, 4, 8]
-            nwords = math.ceil(math.log2(self.m))
-            nbytes = (a.data.shape[-1] - 1) // 8 + 1
-            self.data = np.zeros(a.data.shape[:-1] + (nwords, nbytes), dtype=np.uint8)
-            for i in range(self.data.shape[-2]):
-                self.data[..., i, :] = np.packbits((a.data >> i) & 1, axis=-1)
-        else:  # we have a BPArray
-            self.data = a.data.copy()  # TODO: support conversion to different m
-            self.m = a.m
-        self.length = a.length
-        self.width = a.width
-
-    def __repr__(self):
-        return f'<BPArray length={self.length} width={self.width} m={self.m} bytes={self.data.nbytes}>'
-
-    def __len__(self):
-        return self.length
diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py
index cddde47..993938a 100644
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -1,14 +1,29 @@
+"""A high-throughput combinational logic simulator.
+
+The class :py:class:`~kyupy.logic_sim.LogicSim` performs parallel simulations of the combinational part of a circuit.
+The logic operations are performed bit-parallel on packed numpy arrays.
+Simple sequential circuits can be simulated by repeated assignments and propagations.
+However, this simulator ignores the clock network and simply assumes that all state-elements are clocked all the time.
+"""
+
 import math
 
 import numpy as np
 
-from . import logic
+from . import logic, hr_bytes
 
 
 class LogicSim:
     """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.
+
+    :param circuit: The circuit to simulate.
+    :type circuit: :py:class:`~kyupy.circuit.Circuit`
+    :param sims: The number of parallel logic simulations to perform.
+    :type sims: int
+    :param m: The arity of the logic, must be 2, 4, or 8.
+    :type m: int
     """
-    def __init__(self, circuit, sims=1, m=8):
+    def __init__(self, circuit, sims=8, m=8):
         assert m in [2, 4, 8]
         self.m = m
         mdim = math.ceil(math.log2(m))
@@ -16,216 +31,165 @@ class LogicSim:
         self.sims = sims
         nbytes = (sims - 1) // 8 + 1
         self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+        self.width = len(self.interface)
+        """The number of bits in the circuit state (number of ports + number of state-elements)."""
         self.state = np.zeros((len(circuit.lines), mdim, nbytes), dtype='uint8')
         self.state_epoch = np.zeros(len(circuit.nodes), dtype='int8') - 1
         self.tmp = np.zeros((5, mdim, nbytes), dtype='uint8')
         self.zero = np.zeros((mdim, nbytes), dtype='uint8')
         self.epoch = 0
 
-        self.fork_vd1 = self.fork_vdx
-        self.const0_vd1 = self.const0_vdx
-        self.input_vd1 = self.fork_vd1
-        self.output_vd1 = self.fork_vd1
-        self.inv_vd1 = self.not_vd1
-        self.ibuff_vd1 = self.not_vd1
-        self.nbuff_vd1 = self.fork_vd1
-        self.xor2_vd1 = self.xor_vd1
-        
-        self.fork_vd2 = self.fork_vdx
-        self.const0_vd2 = self.const0_vdx
-        self.input_vd2 = self.fork_vd2
-        self.output_vd2 = self.fork_vd2
-        self.inv_vd2 = self.not_vd2
-        self.ibuff_vd2 = self.not_vd2
-        self.nbuff_vd2 = self.fork_vd2
-        self.xor2_vd2 = self.xor_vd2
-        
-        self.fork_vd3 = self.fork_vdx
-        self.const0_vd3 = self.const0_vdx
-        self.input_vd3 = self.fork_vd3
-        self.output_vd3 = self.fork_vd3
-        self.inv_vd3 = self.not_vd3
-        self.ibuff_vd3 = self.not_vd3
-        self.nbuff_vd3 = self.fork_vd3
-        self.xor2_vd3 = self.xor_vd3
-        
-        known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith(f'_vd{mdim}')]
+        known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith('_fct')]
         self.node_fct = []
         for n in circuit.nodes:
             t = n.kind.lower().replace('__fork__', 'fork')
+            t = t.replace('nbuff', 'fork')
+            t = t.replace('input', 'fork')
+            t = t.replace('output', 'fork')
             t = t.replace('__const0__', 'const0')
             t = t.replace('__const1__', 'const1')
             t = t.replace('tieh', 'const1')
+            t = t.replace('ibuff', 'not')
+            t = t.replace('inv', 'not')
+
             fcts = [f for n, f in known_fct if t.startswith(n)]
             if len(fcts) < 1:
                 raise ValueError(f'Unknown node kind {n.kind}')
             self.node_fct.append(fcts[0])
 
+    def __repr__(self):
+        return f'<LogicSim {self.circuit.name} sims={self.sims} m={self.m} state_mem={hr_bytes(self.state.nbytes)}>'
+
     def assign(self, stimuli):
-        """Assign stimuli to the primary inputs and state-elements (flip-flops)."""
-        if hasattr(stimuli, 'data'):
-            stimuli = stimuli.data
-        for stim, node in zip(stimuli, self.interface):
+        """Assign stimuli to the primary inputs and state-elements (flip-flops).
+
+        :param stimuli: The input data to assign. Must be in bit-parallel storage format and in a compatible shape.
+        :type stimuli: :py:class:`~kyupy.logic.BPArray`
+        :returns: The given stimuli object.
+        """
+        for node, stim in zip(self.interface, stimuli.data if hasattr(stimuli, 'data') else stimuli):
             if len(node.outs) == 0: continue
-            outputs = [self.state[line.index] if line else self.tmp[3] for line in node.outs]
-            self.node_fct[node.index]([stim], outputs)
+            outputs = [self.state[line] if line else self.tmp[3] for line in node.outs]
+            self.node_fct[node]([stim], outputs)
             for line in node.outs:
-                if line:
-                    self.state_epoch[line.reader.index] = self.epoch
+                if line is not None: self.state_epoch[line.reader] = self.epoch
         for n in self.circuit.nodes:
-            if (n.kind == '__const1__') or (n.kind == '__const0__'):
-                outputs = [self.state[line.index] if line else self.tmp[3] for line in n.outs]
-                self.node_fct[n.index]([], outputs)
-                # print('assign const')
+            if n.kind in ('__const1__', '__const0__'):
+                outputs = [self.state[line] if line else self.tmp[3] for line in n.outs]
+                self.node_fct[n]([], outputs)
                 for line in n.outs:
-                    if line:
-                        self.state_epoch[line.reader.index] = self.epoch
+                    if line is not None: self.state_epoch[line.reader] = self.epoch
+        return stimuli
 
     def capture(self, responses):
-        """Capture the current values at the primary outputs and in the state-elements (flip-flops)."""
-        if hasattr(responses, 'data'):
-            responses = responses.data
-        for resp, node in zip(responses, self.interface):
-            if len(node.ins) == 0: continue
-            resp[...] = self.state[node.ins[0].index]
-        # print(responses)
-
-    def propagate(self):
-        """Propagate the input values towards the outputs (Perform all logic operations in topological order)."""
+        """Capture the current values at the primary outputs and in the state-elements (flip-flops).
+
+        :param responses: A bit-parallel storage target for the responses in a compatible shape.
+        :type responses: :py:class:`~kyupy.logic.BPArray`
+        :returns: The given responses object.
+        """
+        for node, resp in zip(self.interface, responses.data if hasattr(responses, 'data') else responses):
+            if len(node.ins) > 0: resp[...] = self.state[node.ins[0]]
+        return responses
+
+    def propagate(self, inject_cb=None):
+        """Propagate the input values towards the outputs (Perform all logic operations in topological order).
+
+        If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle.
+        Multiple clock cycles are simulated by an assign-propagate-capture loop:
+
+        .. code-block:: python
+
+           # initial state in state_bp
+           for cycle in range(10):  # simulate 10 clock cycles
+               sim.assign(state_bp)
+               sim.propagate()
+               sim.capture(state_bp)
+
+        :param inject_cb: A callback function for manipulating intermediate signal values.
+            This function is called with a line index and its new logic values (in bit-parallel format) after
+            evaluation of a node. The callback may manipulate the given values in-place; the simulation
+            resumes with the manipulated values after the callback returns.
+        :type inject_cb: ``f(int, ndarray)``
+        """
         for node in self.circuit.topological_order():
-            if self.state_epoch[node.index] != self.epoch: continue
-            inputs = [self.state[line.index] if line else self.zero for line in node.ins]
-            outputs = [self.state[line.index] if line else self.tmp[3] for line in node.outs]
+            if self.state_epoch[node] != self.epoch: continue
+            inputs = [self.state[line] if line else self.zero for line in node.ins]
+            outputs = [self.state[line] if line else self.tmp[3] for line in node.outs]
             # print('sim', node)
-            self.node_fct[node.index](inputs, outputs)
+            self.node_fct[node](inputs, outputs)
             for line in node.outs:
-                self.state_epoch[line.reader.index] = self.epoch
+                if inject_cb is not None: inject_cb(line, self.state[line])
+                self.state_epoch[line.reader] = self.epoch
         self.epoch = (self.epoch + 1) % 128
 
-    def fork_vdx(self, inputs, outputs):
+    def cycle(self, state, inject_cb=None):
+        """Assigns the given state, propagates it and captures the new state.
+
+        :param state: A bit-parallel array in a compatible shape holding the current circuit state.
+            The contained data is assigned to the PI and PPI and overwritten by data at the PO and PPO after
+            propagation.
+        :type state: :py:class:`~kyupy.logic.BPArray`
+        :param inject_cb: A callback function for manipulating intermediate signal values. See :py:func:`propagate`.
+        :returns: The given state object.
+        """
+        self.assign(state)
+        self.propagate(inject_cb)
+        return self.capture(state)
+
+    @staticmethod
+    def fork_fct(inputs, outputs):
         for o in outputs: o[...] = inputs[0]
-    
-    def const0_vdx(self, _, outputs):
-        for o in outputs: o[...] = self.zero
-
-    # 2-valued simulation
-
-    def not_vd1(self, inputs, outputs):
-        outputs[0][0] = ~inputs[0][0]
-
-    def const1_vd1(self, _, outputs):
-        for o in outputs: o[...] = self.zero
-        self.not_vd1(outputs, outputs)
-
-    def and_vd1(self, inputs, outputs):
-        o = outputs[0]
-        o[0] = inputs[0][0]
-        for i in inputs[1:]: o[0] &= i[0]
-
-    def or_vd1(self, inputs, outputs):
-        o = outputs[0]
-        o[0] = inputs[0][0]
-        for i in inputs[1:]: o[0] |= i[0]
-
-    def xor_vd1(self, inputs, outputs):
-        o = outputs[0]
-        o[0] = inputs[0][0]
-        for i in inputs[1:]: o[0] ^= i[0]
-
-    def sdff_vd1(self, inputs, outputs):
-        outputs[0][0] = inputs[0][0]
-        if len(outputs) > 1:
-            outputs[1][0] = ~inputs[0][0]
-
-    def dff_vd1(self, inputs, outputs):
-        outputs[0][0] = inputs[0][0]
-        if len(outputs) > 1:
-            outputs[1][0] = ~inputs[0][0]
-
-    def nand_vd1(self, inputs, outputs):
-        self.and_vd1(inputs, outputs)
-        self.not_vd1(outputs, outputs)
 
-    def nor_vd1(self, inputs, outputs):
-        self.or_vd1(inputs, outputs)
-        self.not_vd1(outputs, outputs)
+    @staticmethod
+    def const0_fct(_, outputs):
+        for o in outputs: o[...] = 0
 
-    def xnor_vd1(self, inputs, outputs):
-        self.xor_vd1(inputs, outputs)
-        self.not_vd1(outputs, outputs)
+    @staticmethod
+    def const1_fct(_, outputs):
+        for o in outputs:
+            o[...] = 0
+            logic.bp_not(o, o)
 
-    # 4-valued simulation
-
-    def not_vd2(self, inputs, outputs):
+    @staticmethod
+    def not_fct(inputs, outputs):
         logic.bp_not(outputs[0], inputs[0])
 
-    def and_vd2(self, inputs, outputs):
+    @staticmethod
+    def and_fct(inputs, outputs):
         logic.bp_and(outputs[0], *inputs)
 
-    def or_vd2(self, inputs, outputs):
+    @staticmethod
+    def or_fct(inputs, outputs):
         logic.bp_or(outputs[0], *inputs)
 
-    def xor_vd2(self, inputs, outputs):
+    @staticmethod
+    def xor_fct(inputs, outputs):
         logic.bp_xor(outputs[0], *inputs)
 
-    def sdff_vd2(self, inputs, outputs):
-        self.dff_vd2(inputs, outputs)
+    @staticmethod
+    def sdff_fct(inputs, outputs):
+        logic.bp_buf(outputs[0], inputs[0])
         if len(outputs) > 1:
             logic.bp_not(outputs[1], inputs[0])
 
-    def dff_vd2(self, inputs, outputs):
+    @staticmethod
+    def dff_fct(inputs, outputs):
         logic.bp_buf(outputs[0], inputs[0])
+        if len(outputs) > 1:
+            logic.bp_not(outputs[1], inputs[0])
 
-    def nand_vd2(self, inputs, outputs):
-        self.and_vd2(inputs, outputs)
-        self.not_vd2(outputs, outputs)
-
-    def nor_vd2(self, inputs, outputs):
-        self.or_vd2(inputs, outputs)
-        self.not_vd2(outputs, outputs)
-
-    def xnor_vd2(self, inputs, outputs):
-        self.xor_vd2(inputs, outputs)
-        self.not_vd2(outputs, outputs)
-    
-    def const1_vd2(self, _, outputs):
-        for o in outputs: o[...] = self.zero
-        self.not_vd2(outputs, outputs)
-
-    # 8-valued simulation
-
-    def not_vd3(self, inputs, outputs):
-        logic.bp_not(outputs[0], inputs[0])
-
-    def and_vd3(self, inputs, outputs):
+    @staticmethod
+    def nand_fct(inputs, outputs):
         logic.bp_and(outputs[0], *inputs)
+        logic.bp_not(outputs[0], outputs[0])
 
-    def or_vd3(self, inputs, outputs):
+    @staticmethod
+    def nor_fct(inputs, outputs):
         logic.bp_or(outputs[0], *inputs)
+        logic.bp_not(outputs[0], outputs[0])
 
-    def xor_vd3(self, inputs, outputs):
+    @staticmethod
+    def xnor_fct(inputs, outputs):
         logic.bp_xor(outputs[0], *inputs)
-
-    def sdff_vd3(self, inputs, outputs):
-        self.dff_vd3(inputs, outputs)
-        if len(outputs) > 1:
-            logic.bp_not(outputs[1], inputs[0])
-
-    def dff_vd3(self, inputs, outputs):
-        logic.bp_buf(outputs[0], inputs[0])
-
-    def nand_vd3(self, inputs, outputs):
-        self.and_vd3(inputs, outputs)
-        self.not_vd3(outputs, outputs)
-
-    def nor_vd3(self, inputs, outputs):
-        self.or_vd3(inputs, outputs)
-        self.not_vd3(outputs, outputs)
-
-    def xnor_vd3(self, inputs, outputs):
-        self.xor_vd3(inputs, outputs)
-        self.not_vd3(outputs, outputs)
-        
-    def const1_vd3(self, _, outputs):
-        for o in outputs: o[...] = self.zero
-        self.not_vd3(outputs, outputs)
+        logic.bp_not(outputs[0], outputs[0])
diff --git a/src/kyupy/saed.py b/src/kyupy/saed.py
deleted file mode 100644
index 21771fd..0000000
--- a/src/kyupy/saed.py
+++ /dev/null
@@ -1,289 +0,0 @@
-from kyupy.circuit import Node, Line
-
-
-def pin_index(cell_type, pin):
-    if cell_type.startswith('HADD') and pin == 'B0': return 1
-    if cell_type.startswith('HADD') and pin == 'SO': return 1
-    if cell_type.startswith('MUX21') and pin == 'S': return 2
-    if cell_type.startswith('SDFF') and pin == 'QN': return 1
-    if cell_type.startswith('DFF') and pin == 'QN': return 1
-    if cell_type.startswith('DFF') and pin == 'CLK': return 1
-    if cell_type.startswith('DFF') and pin == 'RSTB': return 2
-    if cell_type.startswith('DFF') and pin == 'SETB': return 3
-    if pin in ['A2', 'IN2', 'SE', 'B', 'CO']: return 1
-    if pin in ['A3', 'IN3', 'SI', 'CI']: return 2
-    if pin == 'A4' or pin == 'IN4' or pin == 'CLK': return 3  # CLK for scan cells SDFF
-    if pin == 'A5' or pin == 'IN5' or pin == 'RSTB': return 4
-    if pin == 'A6' or pin == 'IN6' or pin == 'SETB': return 5
-    return 0
-
-
-def pin_is_output(kind, pin):
-    if 'MUX' in kind and pin == 'S':
-        return False
-    return pin in ['Q', 'QN', 'Z', 'ZN', 'Y', 'CO', 'S', 'SO', 'C1']
-
-
-def add_and_connect(circuit, name, kind, in1=None, in2=None, out=None):
-    n = Node(circuit, name, kind)
-    if in1 is not None:
-        n.ins[0] = in1
-        in1.reader = n
-        in1.reader_pin = 0
-    if in2 is not None:
-        n.ins[1] = in2
-        in2.reader = n
-        in2.reader_pin = 1
-    if out is not None:
-        n.outs[0] = out
-        out.driver = n
-        out.driver_pin = 0
-    return n
-
-
-def split_complex_gates(circuit):
-    node_list = circuit.nodes
-    for n in node_list:
-        name = n.name
-        ins = n.ins
-        outs = n.outs
-        if n.kind.startswith('AO21X'):
-            n.remove()
-            n_and = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
-            n_or = add_and_connect(circuit, name+'~or', 'OR2', None, ins[2], outs[0])
-            Line(circuit, n_and, n_or)
-        elif n.kind.startswith('AOI21X'):
-            n.remove()
-            n_and = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
-            n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[2], outs[0])
-            Line(circuit, n_and, n_nor)
-        elif n.kind.startswith('OA21X'):
-            n.remove()
-            n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
-            n_and = add_and_connect(circuit, name+'~and', 'AND2', None, ins[2], outs[0])
-            Line(circuit, n_or, n_and)
-        elif n.kind.startswith('OAI21X'):
-            n.remove()
-            n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
-            n_nand = add_and_connect(circuit, name+'~nand', 'NAND2', None, ins[2], outs[0])
-            Line(circuit, n_or, n_nand)
-        elif n.kind.startswith('OA22X'):
-            n.remove()
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n_and = add_and_connect(circuit, name+'~and', 'AND2', None, None, outs[0])
-            Line(circuit, n_or0, n_and)
-            Line(circuit, n_or1, n_and)
-        elif n.kind.startswith('OAI22X'):
-            n.remove()
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n_nand = add_and_connect(circuit, name+'~nand', 'NAND2', None, None, outs[0])
-            Line(circuit, n_or0, n_nand)
-            Line(circuit, n_or1, n_nand)
-        elif n.kind.startswith('AO22X'):
-            n.remove()
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n_or = add_and_connect(circuit, name+'~or', 'OR2', None, None, outs[0])
-            Line(circuit, n_and0, n_or)
-            Line(circuit, n_and1, n_or)
-        elif n.kind.startswith('AOI22X'):
-            n.remove()
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, None, outs[0])
-            Line(circuit, n_and0, n_nor)
-            Line(circuit, n_and1, n_nor)
-        elif n.kind.startswith('AO221X'):
-            n.remove()
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', None, ins[4], outs[0])
-            Line(circuit, n_and0, n_or0)
-            Line(circuit, n_and1, n_or0)
-            Line(circuit, n_or0, n_or1)     
-        elif n.kind.startswith('AOI221X'):
-            n.remove()
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n_or = add_and_connect(circuit, name+'~or', 'OR2', None, None, None)
-            n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[4], outs[0])
-            Line(circuit, n_and0, n_or)
-            Line(circuit, n_and1, n_or)
-            Line(circuit, n_or, n_nor)     
-        elif n.kind.startswith('OA221X'):
-            n.remove()
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', None, ins[4], outs[0])
-            Line(circuit, n_or0, n_and0)
-            Line(circuit, n_or1, n_and0)
-            Line(circuit, n_and0, n_and1)    
-        elif n.kind.startswith('OAI221X'):
-            n.remove()
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
-            n_nand1 = add_and_connect(circuit, name+'~nand1', 'NAND2', None, ins[4], outs[0])
-            Line(circuit, n_or0, n_and0)
-            Line(circuit, n_or1, n_and0)
-            Line(circuit, n_and0, n_nand1)
-        elif n.kind.startswith('AO222X'):
-            n.remove()
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n_and2 = add_and_connect(circuit, name+'~and2', 'AND2', ins[4], ins[5], None)
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', None, None, outs[0])
-            Line(circuit, n_and0, n_or0)
-            Line(circuit, n_and1, n_or0)
-            Line(circuit, n_and2, n_or1)
-            Line(circuit, n_or0, n_or1)
-        elif n.kind.startswith('AOI222X'):
-            n.remove()
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n_and2 = add_and_connect(circuit, name+'~and2', 'AND2', ins[4], ins[5], None)
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
-            n_nor1 = add_and_connect(circuit, name+'~nor1', 'NOR2', None, None, outs[0])
-            Line(circuit, n_and0, n_or0)
-            Line(circuit, n_and1, n_or0)
-            Line(circuit, n_and2, n_nor1)
-            Line(circuit, n_or0, n_nor1)
-        elif n.kind.startswith('OA222X'):
-            n.remove()
-            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n_or2 = add_and_connect(circuit, name+'~or2', 'OR2', ins[4], ins[5], None)
-            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
-            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', None, None, outs[0])
-            Line(circuit, n_or0, n_and0)
-            Line(circuit, n_or1, n_and0)
-            Line(circuit, n_or2, n_and1)
-            Line(circuit, n_and0, n_and1)
-        elif n.kind.startswith('OAI222X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n2 = add_and_connect(circuit, name+'~or2', 'OR2', ins[4], ins[5], None)
-            n3 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
-            n4 = add_and_connect(circuit, name+'~nand1', 'NAND2', None, None, outs[0])
-            Line(circuit, n0, n3)
-            Line(circuit, n1, n3)
-            Line(circuit, n2, n4)
-            Line(circuit, n3, n4)
-        elif n.kind.startswith('AND3X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~and1', 'AND2', None, ins[2], outs[0])
-            Line(circuit, n0, n1)
-        elif n.kind.startswith('OR3X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~or1', 'OR2', None, ins[2], outs[0])
-            Line(circuit, n0, n1)
-        elif n.kind.startswith('XOR3X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~xor0', 'XOR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~xor1', 'XOR2', None, ins[2], outs[0])
-            Line(circuit, n0, n1)
-        elif n.kind.startswith('NAND3X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~nand', 'NAND2', None, ins[2], outs[0])
-            Line(circuit, n0, n1)
-        elif n.kind.startswith('NOR3X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[2], outs[0])
-            Line(circuit, n0, n1)
-        elif n.kind.startswith('XNOR3X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~xor', 'XOR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~xnor', 'XNOR2', None, ins[2], outs[0])
-            Line(circuit, n0, n1)
-        elif n.kind.startswith('AND4X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n2 = add_and_connect(circuit, name+'~and2', 'AND2', None, None, outs[0])
-            Line(circuit, n0, n2)
-            Line(circuit, n1, n2)
-        elif n.kind.startswith('OR4X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n2 = add_and_connect(circuit, name+'~or2', 'OR2', None, None, outs[0])
-            Line(circuit, n0, n2)
-            Line(circuit, n1, n2)
-        elif n.kind.startswith('NAND4X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
-            n2 = add_and_connect(circuit, name+'~nand2', 'NAND2', None, None, outs[0])
-            Line(circuit, n0, n2)
-            Line(circuit, n1, n2)
-        elif n.kind.startswith('NOR4X'):
-            n.remove()
-            n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
-            n1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
-            n2 = add_and_connect(circuit, name+'~nor2', 'NOR2', None, None, outs[0])
-            Line(circuit, n0, n2)
-            Line(circuit, n1, n2)
-        elif n.kind.startswith('FADDX'):
-            n.remove()
-            # forks for fan-outs
-            f_a = add_and_connect(circuit, name + '~fork0', '__fork__', ins[0])
-            f_b = add_and_connect(circuit, name + '~fork1', '__fork__', ins[1])
-            f_ci = add_and_connect(circuit, name + '~fork2', '__fork__', ins[2])
-            f_ab = Node(circuit, name + '~fork3')
-            # sum-block
-            n_xor0 = Node(circuit, name + '~xor0', 'XOR2')
-            Line(circuit, f_a, n_xor0)
-            Line(circuit, f_b, n_xor0)
-            Line(circuit, n_xor0, f_ab)
-            if len(outs) > 0 and outs[0] is not None:
-                n_xor1 = add_and_connect(circuit, name + '~xor1', 'XOR2', None, None, outs[0])
-                Line(circuit, f_ab, n_xor1)
-                Line(circuit, f_ci, n_xor1)
-            # carry-block
-            if len(outs) > 1 and outs[1] is not None:
-                n_and0 = Node(circuit, name + '~and0', 'AND2')
-                Line(circuit, f_ab, n_and0)
-                Line(circuit, f_ci, n_and0)
-                n_and1 = Node(circuit, name + '~and1', 'AND2')
-                Line(circuit, f_a, n_and1)
-                Line(circuit, f_b, n_and1)
-                n_or = add_and_connect(circuit, name + '~or0', 'OR2', None, None, outs[1])
-                Line(circuit, n_and0, n_or)
-                Line(circuit, n_and1, n_or)
-        elif n.kind.startswith('HADDX'):
-            n.remove()
-            # forks for fan-outs
-            f_a = add_and_connect(circuit, name + '~fork0', '__fork__', ins[0])
-            f_b = add_and_connect(circuit, name + '~fork1', '__fork__', ins[1])
-            n_xor0 = add_and_connect(circuit, name + '~xor0', 'XOR2', None, None, outs[1])
-            Line(circuit, f_a, n_xor0)
-            Line(circuit, f_b, n_xor0)
-            n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', None, None, outs[0])
-            Line(circuit, f_a, n_and0)
-            Line(circuit, f_b, n_and0)
-        elif n.kind.startswith('MUX21X'):
-            n.remove()
-            f_s = add_and_connect(circuit, name + '~fork0', '__fork__', ins[2])
-            n_not = Node(circuit, name + '~not', 'INV')
-            Line(circuit, f_s, n_not)
-            n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', ins[0])
-            n_and1 = add_and_connect(circuit, name + '~and1', 'AND2', ins[1])
-            n_or0 = add_and_connect(circuit, name + '~or0', 'OR2', None, None, outs[0])
-            Line(circuit, n_not, n_and0)
-            Line(circuit, f_s, n_and1)
-            Line(circuit, n_and0, n_or0)
-            Line(circuit, n_and1, n_or0)
-        elif n.kind.startswith('DFFSSR'):
-            n.kind = 'DFFX1'
-            n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', ins[0], ins[2], None)
-            Line(circuit, n_and0, (n, 0))
diff --git a/src/kyupy/sdf.py b/src/kyupy/sdf.py
index beb58c0..f89d7b3 100644
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@@ -14,6 +14,7 @@ import numpy as np
 from lark import Lark, Transformer
 
 from . import log, readtext
+from .techlib import TechLib
 
 
 Interconnect = namedtuple('Interconnect', ['orig', 'dest', 'r', 'f'])
@@ -35,7 +36,7 @@ class DelayFile:
         return '\n'.join(f'{n}: {l}' for n, l in self.cells.items()) + '\n' + \
                '\n'.join(str(i) for i in self.interconnects)
 
-    def annotation(self, circuit, pin_index_f, dataset=1, interconnect=True, ffdelays=True):
+    def annotation(self, circuit, tlib=TechLib(), dataset=1, interconnect=True, ffdelays=True):
         """Constructs an 3-dimensional ndarray with timing data for each line in ``circuit``.
 
         An IOPATH delay for a node is annotated to the line connected to the input pin specified in the IOPATH.
@@ -43,29 +44,36 @@ class DelayFile:
         Currently, only ABSOLUTE IOPATH and INTERCONNECT delays are supported.
         Pulse rejection limits are derived from absolute delays, explicit declarations (PATHPULSE etc.) are ignored.
 
-        :param circuit:
-        :param pin_index_f:
-        :param ffdelays:
-        :param interconnect:
-        :type dataset: int or tuple
+        :param circuit: The circuit to annotate. Names from the SDF file are matched to the node names.
+        :type circuit: :class:`~kyupy.circuit.Circuit`
+        :param tlib: A technology library object that provides pin name mappings.
+        :type tlib: :py:class:`~kyupy.techlib.TechLib`
+        :param dataset: SDFs store multiple values for each delay (e.g. minimum, typical, maximum).
+            An integer selects the dataset to use (default is 1 for 'typical').
+            If a tuple is given, the annotator will calculate the average of multiple datasets.
+        :type dataset: ``int`` or ``tuple``
+        :param interconnect: Whether or not to include the delays of interconnects in the annotation.
+            To properly annotate interconnect delays, the circuit model has to include a '__fork__' node on
+            every signal and every fanout-branch. The Verilog parser aids in this by setting the parameter
+            ``branchforks=True`` in :py:func:`kyupy.verilog.parse`.
+        :type interconnect: ``bool``
+        :param ffdelays: Whether or not to include the delays of flip-flops in the annotation.
+        :type ffdelays: ``bool``
         :return: A 3-dimensional ndarray with timing data.
 
             * Axis 0: line index.
-            * Axis 1: type of timing data: 0=`delay`, 1=`pulse rejection limit`.
-            * Axis 2: The polarity of the output transition of the reading node: 0=`rising`, 1=`falling`.
+            * Axis 1: type of timing data: 0='delay', 1='pulse rejection limit'.
+            * Axis 2: The polarity of the output transition of the reading node: 0='rising', 1='falling'.
 
             The polarity for pulse rejection is determined by the latter transition of the pulse.
-            E.g., timing[42,1,0] is the rejection limit of a negative pulse at the output of the reader of line 42.
+            E.g., ``timing[42, 1, 0]`` is the rejection limit of a negative pulse at the output
+            of the reader of line 42.
         """
         def select_del(_delvals, idx):
-            if type(dataset) is tuple:
-                s = 0
-                for d in dataset:
-                    s += _delvals[idx][d]
-                return s / len(dataset)
-            else:
-                return _delvals[idx][dataset]
-        
+            if isinstance(dataset, tuple):
+                return sum(_delvals[idx][d] for d in dataset) / len(dataset)
+            return _delvals[idx][dataset]
+
         def find_cell(name):
             if name not in circuit.cells:
                 name = name.replace('\\', '')
@@ -74,7 +82,7 @@ class DelayFile:
             if name not in circuit.cells:
                 return None
             return circuit.cells[name]
-        
+
         timing = np.zeros((len(circuit.lines), 2, 2))
         for cn, iopaths in self.cells.items():
             for ipn, opn, *delvals in iopaths:
@@ -85,17 +93,17 @@ class DelayFile:
                 if cell is None:
                     log.warn(f'Cell from SDF not found in circuit: {cn}')
                     continue
-                ipin = pin_index_f(cell.kind, ipn)
-                opin = pin_index_f(cell.kind, opn)
+                ipin = tlib.pin_index(cell.kind, ipn)
+                opin = tlib.pin_index(cell.kind, opn)
                 kind = cell.kind.lower()
 
                 ipn2 = ipn.replace('(posedge A1)', 'A1').replace('(negedge A1)', 'A1')\
                     .replace('(posedge A2)', 'A2').replace('(negedge A2)', 'A2')
-                
+
                 def add_delays(_line):
                     if _line is not None:
-                        timing[_line.index, :, 0] += select_del(delvals, 0)
-                        timing[_line.index, :, 1] += select_del(delvals, 1)
+                        timing[_line, :, 0] += select_del(delvals, 0)
+                        timing[_line, :, 1] += select_del(delvals, 1)
 
                 take_avg = False
                 if kind.startswith('sdff'):
@@ -105,16 +113,16 @@ class DelayFile:
                         add_delays(cell.outs[opin])
                 else:
                     if kind.startswith(('xor', 'xnor')):
-                        ipin = pin_index_f(cell.kind, ipn2)
-                        # print(ipn, ipin, times[cell.i_lines[ipin].index, 0, 0])
-                        take_avg = timing[cell.ins[ipin].index].sum() > 0
+                        ipin = tlib.pin_index(cell.kind, ipn2)
+                        # print(ipn, ipin, times[cell.i_lines[ipin], 0, 0])
+                        take_avg = timing[cell.ins[ipin]].sum() > 0
                     add_delays(cell.ins[ipin])
                     if take_avg:
-                        timing[cell.ins[ipin].index] /= 2
-        
+                        timing[cell.ins[ipin]] /= 2
+
         if not interconnect or self.interconnects is None:
             return timing
-        
+
         for n1, n2, *delvals in self.interconnects:
             delvals = [d if len(d) > 0 else [0, 0, 0] for d in delvals]
             if max(max(delvals)) == 0:
@@ -139,7 +147,7 @@ class DelayFile:
             if c2 is None:
                 log.warn(f'Cell from SDF not found in circuit: {cn2}')
                 continue
-            p1, p2 = pin_index_f(c1.kind, pn1), pin_index_f(c2.kind, pn2)
+            p1, p2 = tlib.pin_index(c1.kind, pn1), tlib.pin_index(c2.kind, pn2)
             line = None
             f1, f2 = c1.outs[p1].reader, c2.ins[p2].driver
             if f1 != f2:  # possible branchfork
@@ -149,8 +157,8 @@ class DelayFile:
             elif len(f2.outs) == 1:  # no fanout?
                 line = f2.ins[0]
             if line is not None:
-                timing[line.index, :, 0] += select_del(delvals, 0)
-                timing[line.index, :, 1] += select_del(delvals, 1)
+                timing[line, :, 0] += select_del(delvals, 0)
+                timing[line, :, 1] += select_del(delvals, 1)
             else:
                 log.warn(f'No branchfork for annotating interconnect delay {c1.name}/{p1}->{c2.name}/{p2}')
         return timing
@@ -184,7 +192,7 @@ class SdfTransformer(Transformer):
         return DelayFile(name, cells)
 
 
-grammar = r"""
+GRAMMAR = r"""
     start: "(DELAYFILE" ( "(SDFVERSION" _NOB ")"
         | "(DESIGN" "\"" NAME "\"" ")"
         | "(DATE" _NOB ")"
@@ -218,7 +226,7 @@ grammar = r"""
 
 def parse(text):
     """Parses the given ``text`` and returns a :class:`DelayFile` object."""
-    return Lark(grammar, parser="lalr", transformer=SdfTransformer()).parse(text)
+    return Lark(GRAMMAR, parser="lalr", transformer=SdfTransformer()).parse(text)
 
 
 def load(file):
diff --git a/src/kyupy/stil.py b/src/kyupy/stil.py
index 5c022ca..75bffc2 100644
--- a/src/kyupy/stil.py
+++ b/src/kyupy/stil.py
@@ -4,7 +4,7 @@ The main purpose of this parser is to load scan pattern sets from STIL files.
 It supports only a very limited subset of STIL.
 
 The functions :py:func:`load` and :py:func:`read` return an intermediate representation (:class:`StilFile` object).
-Call :py:func:`StilFile.tests4v`, :py:func:`StilFile.tests8v`, or :py:func:`StilFile.responses4v` to
+Call :py:func:`StilFile.tests`, :py:func:`StilFile.tests_loc`, or :py:func:`StilFile.responses` to
 obtain the appropriate vector sets.
 """
 
@@ -54,26 +54,26 @@ class StilFile:
                     launch = dict((k, v.replace('\n', '')) for k, v in call.parameters.items())
                 else:
                     capture = dict((k, v.replace('\n', '')) for k, v in call.parameters.items())
-    
+
     def _maps(self, c):
         interface = list(c.interface) + [n for n in c.nodes if 'DFF' in n.kind]
-        intf_pos = dict([(n.name, i) for i, n in enumerate(interface)])
+        intf_pos = dict((n.name, i) for i, n in enumerate(interface))
         pi_map = [intf_pos[n] for n in self.signal_groups['_pi']]
         po_map = [intf_pos[n] for n in self.signal_groups['_po']]
         scan_maps = {}
         scan_inversions = {}
-        for chain_name, chain in self.scan_chains.items():
+        for chain in self.scan_chains.values():
             scan_map = []
             scan_in_inversion = []
             scan_out_inversion = []
             inversion = False
             for n in chain[1:-1]:
-                if n == '!': 
+                if n == '!':
                     inversion = not inversion
                 else:
                     scan_in_inversion.append(inversion)
             scan_in_inversion = list(reversed(scan_in_inversion))
-            inversion = False             
+            inversion = False
             for n in reversed(chain[1:-1]):
                 if n == '!':
                     inversion = not inversion
@@ -85,13 +85,13 @@ class StilFile:
             scan_inversions[chain[0]] = scan_in_inversion
             scan_inversions[chain[-1]] = scan_out_inversion
         return interface, pi_map, po_map, scan_maps, scan_inversions
-        
+
     def tests(self, circuit):
         """Assembles and returns a scan test pattern set for given circuit.
 
         This function assumes a static (stuck-at fault) test.
         """
-        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(circuit)
+        interface, pi_map, _, scan_maps, scan_inversions = self._maps(circuit)
         tests = logic.MVArray((len(interface), len(self.patterns)))
         for i, p in enumerate(self.patterns):
             for si_port in self.si_ports.keys():
@@ -133,10 +133,10 @@ class StilFile:
             launch.data[po_map, i] = logic.UNASSIGNED
 
         return logic.mv_transition(init, launch)
-                
+
     def responses(self, circuit):
         """Assembles and returns a scan test response pattern set for given circuit."""
-        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(circuit)
+        interface, _, po_map, scan_maps, scan_inversions = self._maps(circuit)
         resp = logic.MVArray((len(interface), len(self.patterns)))
         # resp = PackedVectors(len(self.patterns), len(interface), 2)
         for i, p in enumerate(self.patterns):
@@ -150,27 +150,27 @@ class StilFile:
                 resp.data[scan_maps[so_port], i] = pattern.data[:, 0]
                 # resp.set_values(i, p.unload[so_port], scan_maps[so_port], scan_inversions[so_port])
         return resp
-        
-        
+
+
 class StilTransformer(Transformer):
     def __init__(self):
         super().__init__()
         self._signal_groups = None
         self._calls = None
         self._scan_chains = None
-        
+
     @staticmethod
     def quoted(args): return args[0][1:-1]
 
     @staticmethod
     def call(args): return Call(args[0], dict(args[1:]))
-        
+
     @staticmethod
     def call_parameter(args): return args[0], args[1].value
 
     @staticmethod
     def signal_group(args): return args[0], args[1:]
-    
+
     @staticmethod
     def scan_chain(args):
         scan_in = None
@@ -187,7 +187,7 @@ class StilTransformer(Transformer):
         return args[0], ([scan_in] + scan_cells + [scan_out])
 
     def signal_groups(self, args): self._signal_groups = dict(args)
-    
+
     def pattern(self, args): self._calls = [c for c in args if isinstance(c, Call)]
 
     def scan_structures(self, args): self._scan_chains = dict(args)
@@ -196,7 +196,7 @@ class StilTransformer(Transformer):
         return StilFile(float(args[0]), self._signal_groups, self._scan_chains, self._calls)
 
 
-grammar = r"""
+GRAMMAR = r"""
     start: "STIL" FLOAT _ignore _block*
     _block: signal_groups | scan_structures | pattern
         | "Header" _ignore
@@ -240,7 +240,7 @@ grammar = r"""
 
 def parse(text):
     """Parses the given ``text`` and returns a :class:`StilFile` object."""
-    return Lark(grammar, parser="lalr", transformer=StilTransformer()).parse(text)
+    return Lark(GRAMMAR, parser="lalr", transformer=StilTransformer()).parse(text)
 
 
 def load(file):
diff --git a/src/kyupy/techlib.py b/src/kyupy/techlib.py
new file mode 100644
index 0000000..5a5a01b
--- /dev/null
+++ b/src/kyupy/techlib.py
@@ -0,0 +1,301 @@
+from .circuit import Node, Line
+
+
+def add_and_connect(circuit, name, kind, in1=None, in2=None, out=None):
+    """Creates a node of the given ``kind`` and attaches existing lines to it.
+
+    ``in1`` and ``in2`` become inputs 0 and 1, ``out`` becomes output 0;
+    arguments left as ``None`` stay unconnected. Returns the new node.
+    """
+    node = Node(circuit, name, kind)
+    for pin, line in enumerate((in1, in2)):
+        if line is not None:
+            node.ins[pin] = line
+            line.reader, line.reader_pin = node, pin
+    if out is not None:
+        node.outs[0] = out
+        out.driver, out.driver_pin = node, 0
+    return node
+
+
+class TechLib:
+    """Provides some information specific to standard cell libraries necessary
+    for loading gate-level designs. :py:class:`~kyupy.circuit.Node` objects do not
+    have pin names. The methods defined here map pin names to pin directions and defined
+    positions in the ``node.ins`` and ``node.outs`` lists. The default implementation
+    provides mappings for SAED-inspired standard cell libraries.
+    """
+
+    @staticmethod
+    def pin_index(kind, pin):
+        """Returns the pin list position for a node kind and pin name; raises ValueError if unknown."""
+        for prefix, pins, index in [('HADD', ('A0', 'C1'), 0),
+                                    ('HADD', ('B0', 'SO'), 1),
+                                    ('MUX21', ('S',), 2),
+                                    ('DFF', ('QN',), 1),
+                                    ('SDFF', ('QN',), 1),
+                                    ('SDFF', ('CLK',), 3),
+                                    ('SDFF', ('RSTB',), 4),
+                                    ('SDFF', ('SETB',), 5)]:
+            if kind.startswith(prefix) and pin in pins: return index
+        for index, pins in enumerate([('A1', 'IN1', 'D', 'S', 'INP', 'A', 'Q', 'QN', 'Y', 'Z', 'ZN'),
+                                      ('A2', 'IN2', 'CLK', 'CO', 'SE', 'B'),
+                                      ('A3', 'IN3', 'RSTB', 'CI', 'SI'),
+                                      ('A4', 'IN4', 'SETB'),
+                                      ('A5', 'IN5'),
+                                      ('A6', 'IN6')]):
+            if pin in pins: return index
+        raise ValueError(f'Unknown pin index for {kind}.{pin}')
+
+    @staticmethod
+    def pin_is_output(kind, pin):
+        """Returns True, if given pin name of a node kind is an output."""
+        if 'MUX' in kind and pin == 'S': return False
+        return pin in ('Q', 'QN', 'Z', 'ZN', 'Y', 'CO', 'S', 'SO', 'C1')
+
+    @staticmethod
+    def split_complex_gates(circuit):
+        """Decomposes complex cells (AO*/OA*, 3/4-input gates, adders, MUX21, DFFSSR) of ``circuit`` into simpler nodes."""
+        # Iterate over a snapshot because the loop body removes nodes from and adds nodes to the circuit.
+        for n in list(circuit.nodes):
+            name, ins, outs = n.name, n.ins, n.outs
+            if n.kind.startswith('AO21X'):
+                n.remove()
+                n_and = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
+                n_or = add_and_connect(circuit, name+'~or', 'OR2', None, ins[2], outs[0])
+                Line(circuit, n_and, n_or)
+            elif n.kind.startswith('AOI21X'):
+                n.remove()
+                n_and = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
+                n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[2], outs[0])
+                Line(circuit, n_and, n_nor)
+            elif n.kind.startswith('OA21X'):
+                n.remove()
+                n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
+                n_and = add_and_connect(circuit, name+'~and', 'AND2', None, ins[2], outs[0])
+                Line(circuit, n_or, n_and)
+            elif n.kind.startswith('OAI21X'):
+                n.remove()
+                n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
+                n_nand = add_and_connect(circuit, name+'~nand', 'NAND2', None, ins[2], outs[0])
+                Line(circuit, n_or, n_nand)
+            elif n.kind.startswith('OA22X'):
+                n.remove()
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n_and = add_and_connect(circuit, name+'~and', 'AND2', None, None, outs[0])
+                Line(circuit, n_or0, n_and)
+                Line(circuit, n_or1, n_and)
+            elif n.kind.startswith('OAI22X'):
+                n.remove()
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n_nand = add_and_connect(circuit, name+'~nand', 'NAND2', None, None, outs[0])
+                Line(circuit, n_or0, n_nand)
+                Line(circuit, n_or1, n_nand)
+            elif n.kind.startswith('AO22X'):
+                n.remove()
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n_or = add_and_connect(circuit, name+'~or', 'OR2', None, None, outs[0])
+                Line(circuit, n_and0, n_or)
+                Line(circuit, n_and1, n_or)
+            elif n.kind.startswith('AOI22X'):
+                n.remove()
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, None, outs[0])
+                Line(circuit, n_and0, n_nor)
+                Line(circuit, n_and1, n_nor)
+            elif n.kind.startswith('AO221X'):
+                n.remove()
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', None, ins[4], outs[0])
+                Line(circuit, n_and0, n_or0)
+                Line(circuit, n_and1, n_or0)
+                Line(circuit, n_or0, n_or1)
+            elif n.kind.startswith('AOI221X'):
+                n.remove()
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n_or = add_and_connect(circuit, name+'~or', 'OR2', None, None, None)
+                n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[4], outs[0])
+                Line(circuit, n_and0, n_or)
+                Line(circuit, n_and1, n_or)
+                Line(circuit, n_or, n_nor)
+            elif n.kind.startswith('OA221X'):
+                n.remove()
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', None, ins[4], outs[0])
+                Line(circuit, n_or0, n_and0)
+                Line(circuit, n_or1, n_and0)
+                Line(circuit, n_and0, n_and1)
+            elif n.kind.startswith('OAI221X'):
+                n.remove()
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
+                n_nand1 = add_and_connect(circuit, name+'~nand1', 'NAND2', None, ins[4], outs[0])
+                Line(circuit, n_or0, n_and0)
+                Line(circuit, n_or1, n_and0)
+                Line(circuit, n_and0, n_nand1)
+            elif n.kind.startswith('AO222X'):
+                n.remove()
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n_and2 = add_and_connect(circuit, name+'~and2', 'AND2', ins[4], ins[5], None)
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', None, None, outs[0])
+                Line(circuit, n_and0, n_or0)
+                Line(circuit, n_and1, n_or0)
+                Line(circuit, n_and2, n_or1)
+                Line(circuit, n_or0, n_or1)
+            elif n.kind.startswith('AOI222X'):
+                n.remove()
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n_and2 = add_and_connect(circuit, name+'~and2', 'AND2', ins[4], ins[5], None)
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
+                n_nor1 = add_and_connect(circuit, name+'~nor1', 'NOR2', None, None, outs[0])
+                Line(circuit, n_and0, n_or0)
+                Line(circuit, n_and1, n_or0)
+                Line(circuit, n_and2, n_nor1)
+                Line(circuit, n_or0, n_nor1)
+            elif n.kind.startswith('OA222X'):
+                n.remove()
+                n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n_or2 = add_and_connect(circuit, name+'~or2', 'OR2', ins[4], ins[5], None)
+                n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
+                n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', None, None, outs[0])
+                Line(circuit, n_or0, n_and0)
+                Line(circuit, n_or1, n_and0)
+                Line(circuit, n_or2, n_and1)
+                Line(circuit, n_and0, n_and1)
+            elif n.kind.startswith('OAI222X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n2 = add_and_connect(circuit, name+'~or2', 'OR2', ins[4], ins[5], None)
+                n3 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
+                n4 = add_and_connect(circuit, name+'~nand1', 'NAND2', None, None, outs[0])
+                Line(circuit, n0, n3)
+                Line(circuit, n1, n3)
+                Line(circuit, n2, n4)
+                Line(circuit, n3, n4)
+            elif n.kind.startswith('AND3X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~and1', 'AND2', None, ins[2], outs[0])
+                Line(circuit, n0, n1)
+            elif n.kind.startswith('OR3X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~or1', 'OR2', None, ins[2], outs[0])
+                Line(circuit, n0, n1)
+            elif n.kind.startswith('XOR3X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~xor0', 'XOR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~xor1', 'XOR2', None, ins[2], outs[0])
+                Line(circuit, n0, n1)
+            elif n.kind.startswith('NAND3X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~nand', 'NAND2', None, ins[2], outs[0])
+                Line(circuit, n0, n1)
+            elif n.kind.startswith('NOR3X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[2], outs[0])
+                Line(circuit, n0, n1)
+            elif n.kind.startswith('XNOR3X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~xor', 'XOR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~xnor', 'XNOR2', None, ins[2], outs[0])
+                Line(circuit, n0, n1)
+            elif n.kind.startswith('AND4X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n2 = add_and_connect(circuit, name+'~and2', 'AND2', None, None, outs[0])
+                Line(circuit, n0, n2)
+                Line(circuit, n1, n2)
+            elif n.kind.startswith('OR4X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n2 = add_and_connect(circuit, name+'~or2', 'OR2', None, None, outs[0])
+                Line(circuit, n0, n2)
+                Line(circuit, n1, n2)
+            elif n.kind.startswith('NAND4X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+                n2 = add_and_connect(circuit, name+'~nand2', 'NAND2', None, None, outs[0])
+                Line(circuit, n0, n2)
+                Line(circuit, n1, n2)
+            elif n.kind.startswith('NOR4X'):
+                n.remove()
+                n0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+                n1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+                n2 = add_and_connect(circuit, name+'~nor2', 'NOR2', None, None, outs[0])
+                Line(circuit, n0, n2)
+                Line(circuit, n1, n2)
+            elif n.kind.startswith('FADDX'):
+                n.remove()
+                # forks for fan-outs
+                f_a = add_and_connect(circuit, name + '~fork0', '__fork__', ins[0])
+                f_b = add_and_connect(circuit, name + '~fork1', '__fork__', ins[1])
+                f_ci = add_and_connect(circuit, name + '~fork2', '__fork__', ins[2])
+                f_ab = Node(circuit, name + '~fork3')
+                # sum-block
+                n_xor0 = Node(circuit, name + '~xor0', 'XOR2')
+                Line(circuit, f_a, n_xor0)
+                Line(circuit, f_b, n_xor0)
+                Line(circuit, n_xor0, f_ab)
+                if len(outs) > 0 and outs[0] is not None:
+                    n_xor1 = add_and_connect(circuit, name + '~xor1', 'XOR2', None, None, outs[0])
+                    Line(circuit, f_ab, n_xor1)
+                    Line(circuit, f_ci, n_xor1)
+                # carry-block
+                if len(outs) > 1 and outs[1] is not None:
+                    n_and0 = Node(circuit, name + '~and0', 'AND2')
+                    Line(circuit, f_ab, n_and0)
+                    Line(circuit, f_ci, n_and0)
+                    n_and1 = Node(circuit, name + '~and1', 'AND2')
+                    Line(circuit, f_a, n_and1)
+                    Line(circuit, f_b, n_and1)
+                    n_or = add_and_connect(circuit, name + '~or0', 'OR2', None, None, outs[1])
+                    Line(circuit, n_and0, n_or)
+                    Line(circuit, n_and1, n_or)
+            elif n.kind.startswith('HADDX'):
+                n.remove()
+                # forks for fan-outs
+                f_a = add_and_connect(circuit, name + '~fork0', '__fork__', ins[0])
+                f_b = add_and_connect(circuit, name + '~fork1', '__fork__', ins[1])
+                n_xor0 = add_and_connect(circuit, name + '~xor0', 'XOR2', None, None, outs[1])
+                Line(circuit, f_a, n_xor0)
+                Line(circuit, f_b, n_xor0)
+                n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', None, None, outs[0])
+                Line(circuit, f_a, n_and0)
+                Line(circuit, f_b, n_and0)
+            elif n.kind.startswith('MUX21X'):
+                n.remove()
+                f_s = add_and_connect(circuit, name + '~fork0', '__fork__', ins[2])
+                n_not = Node(circuit, name + '~not', 'INV')
+                Line(circuit, f_s, n_not)
+                n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', ins[0])
+                n_and1 = add_and_connect(circuit, name + '~and1', 'AND2', ins[1])
+                n_or0 = add_and_connect(circuit, name + '~or0', 'OR2', None, None, outs[0])
+                Line(circuit, n_not, n_and0)
+                Line(circuit, f_s, n_and1)
+                Line(circuit, n_and0, n_or0)
+                Line(circuit, n_and1, n_or0)
+            elif n.kind.startswith('DFFSSR'):
+                n.kind = 'DFFX1'
+                n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', ins[0], ins[2], None)
+                Line(circuit, n_and0, (n, 0))
diff --git a/src/kyupy/verilog.py b/src/kyupy/verilog.py
index 61e76ee..c6b5ab0 100644
--- a/src/kyupy/verilog.py
+++ b/src/kyupy/verilog.py
@@ -10,13 +10,13 @@ from lark import Lark, Transformer
 
 from . import readtext
 from .circuit import Circuit, Node, Line
-from .saed import pin_index, pin_is_output
+from .techlib import TechLib
 
 Instantiation = namedtuple('Instantiation', ['type', 'name', 'pins'])
 
 
 class SignalDeclaration:
-    
+
     def __init__(self, kind, tokens):
         self.left = None
         self.right = None
@@ -27,25 +27,25 @@ class SignalDeclaration:
             self.basename = tokens.children[2]
             self.left = int(tokens.children[0].value)
             self.right = int(tokens.children[1].value)
-    
+
     @property
     def names(self):
         if self.left is None:
             return [self.basename]
         if self.left <= self.right:
             return [f'{self.basename}[{i}]' for i in range(self.left, self.right + 1)]
-        else:
-            return [f'{self.basename}[{i}]' for i in range(self.left, self.right - 1, -1)]
-        
+        return [f'{self.basename}[{i}]' for i in range(self.left, self.right - 1, -1)]
+
     def __repr__(self):
         return f"{self.kind}:{self.basename}[{self.left}:{self.right}]"
 
 
 class VerilogTransformer(Transformer):
-    def __init__(self, branchforks=False):
+    def __init__(self, branchforks=False, tlib=TechLib()):
         super().__init__()
         self._signal_declarations = {}
         self.branchforks = branchforks
+        self.tlib = tlib
 
     @staticmethod
     def name(args):
@@ -57,24 +57,24 @@ class VerilogTransformer(Transformer):
     @staticmethod
     def instantiation(args):
         return Instantiation(args[0], args[1],
-                             dict([(pin.children[0], pin.children[1]) for pin in args[2:]]))
-       
+                             dict((pin.children[0], pin.children[1]) for pin in args[2:]))
+
     def input(self, args):
         for sd in [SignalDeclaration('input', signal) for signal in args]:
             self._signal_declarations[sd.basename] = sd
-    
+
     def inout(self, args):
         for sd in [SignalDeclaration('input', signal) for signal in args]:  # just treat as input
             self._signal_declarations[sd.basename] = sd
-    
+
     def output(self, args):
         for sd in [SignalDeclaration('output', signal) for signal in args]:
             self._signal_declarations[sd.basename] = sd
-            
+
     def wire(self, args):
         for sd in [SignalDeclaration('wire', signal) for signal in args]:
             self._signal_declarations[sd.basename] = sd
-                
+
     def module(self, args):
         c = Circuit(args[0])
         positions = {}
@@ -85,11 +85,11 @@ class VerilogTransformer(Transformer):
                 pos += 1
         assignments = []
         for stmt in args[2:]:  # pass 1: instantiate cells and driven signals
-            if type(stmt) is Instantiation:
+            if isinstance(stmt, Instantiation):
                 n = Node(c, stmt.name, kind=stmt.type)
                 for p, s in stmt.pins.items():
-                    if pin_is_output(n.kind, p):
-                        Line(c, (n, pin_index(stmt.type, p)), Node(c, s))
+                    if self.tlib.pin_is_output(n.kind, p):
+                        Line(c, (n, self.tlib.pin_index(stmt.type, p)), Node(c, s))
             elif stmt is not None and stmt.data == 'assign':
                 assignments.append((stmt.children[0], stmt.children[1]))
         for sd in self._signal_declarations.values():
@@ -108,10 +108,10 @@ class VerilogTransformer(Transformer):
                 assert s1 not in c.forks, 'assignment between two driven signals'
                 Line(c, c.forks[s2], Node(c, s1))
         for stmt in args[2:]:  # pass 2: connect signals to readers
-            if type(stmt) is Instantiation:
+            if isinstance(stmt, Instantiation):
                 for p, s in stmt.pins.items():
                     n = c.cells[stmt.name]
-                    if pin_is_output(n.kind, p): continue
+                    if self.tlib.pin_is_output(n.kind, p): continue
                     if s.startswith("1'b"):
                         const = f'__const{s[3]}__'
                         if const not in c.cells:
@@ -121,7 +121,7 @@ class VerilogTransformer(Transformer):
                         branchfork = Node(c, fork.name + "~" + n.name + "/" + p)
                         Line(c, fork, branchfork)
                         fork = branchfork
-                    Line(c, fork, (n, pin_index(stmt.type, p)))
+                    Line(c, fork, (n, self.tlib.pin_index(stmt.type, p)))
         for sd in self._signal_declarations.values():
             if sd.kind == 'output':
                 for name in sd.names:
@@ -129,14 +129,10 @@ class VerilogTransformer(Transformer):
         return c
 
     @staticmethod
-    def start(args):
-        if len(args) == 1:
-            return args[0]
-        else:
-            return args
+    def start(args): return args[0] if len(args) == 1 else args
 
 
-grammar = """
+GRAMMAR = """
     start: (module)*
     module: "module" name parameters ";" (_statement)* "endmodule"
     parameters: "(" [ name ( "," name )* ] ")"
@@ -158,16 +154,18 @@ grammar = """
     """
 
 
-def parse(text, *, branchforks=False):
+def parse(text, *, branchforks=False, tlib=TechLib()):
     """Parses the given ``text`` as Verilog code.
 
     :param text: A string with Verilog code.
     :param branchforks: If set to ``True``, the returned circuit will include additional `forks` on each fanout branch.
         These forks are needed to correctly annotate interconnect delays
         (see :py:func:`kyupy.sdf.DelayFile.annotation`).
+    :param tlib: A technology library object that provides pin name mappings.
+    :type tlib: :py:class:`~kyupy.techlib.TechLib`
     :return: A :class:`~kyupy.circuit.Circuit` object.
     """
-    return Lark(grammar, parser="lalr", transformer=VerilogTransformer(branchforks)).parse(text)
+    return Lark(GRAMMAR, parser="lalr", transformer=VerilogTransformer(branchforks, tlib)).parse(text)
 
 
 def load(file, *args, **kwargs):
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index 2766997..bd04f10 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -1,10 +1,10 @@
-"""High-Throughput combinational logic timing simulators.
+"""High-throughput combinational logic timing simulators.
 
-These simulators work similarly to :py:class:`kyupy.logic_sim.LogicSim`.
+These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`.
 They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
 Instead of propagating logic values, these simulators propagate signal histories (waveforms).
-They are designed to run many simulations in parallel and while their latencies are quite high, they achieve
-high throughput performance.
+They are designed to run many simulations in parallel and while their latencies are quite high, they can achieve
+high throughput.
 
 The simulators are not event-based and are not capable of simulating sequential circuits directly.
 
@@ -17,13 +17,16 @@ from bisect import bisect, insort_left
 
 import numpy as np
 
-from . import numba
-from . import cuda
+from . import numba, cuda, hr_bytes
 
 
-TMAX = np.float32(2 ** 127)  # almost np.PINF for 32-bit floating point values
-TMAX_OVL = np.float32(1.1 * 2 ** 127)  # almost np.PINF with overflow mark
-TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values
+TMAX = np.float32(2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform."""
+TMAX_OVL = np.float32(1.1 * 2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform that
+may be incomplete due to an overflow."""
+TMIN = np.float32(-2 ** 127)
+"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
 
 
 class Heap:
@@ -38,7 +41,7 @@ class Heap:
             if self.chunks[loc] == size:
                 del self.released[idx]
                 return loc
-            elif self.chunks[loc] > size:  # split chunk
+            if self.chunks[loc] > size:  # split chunk
                 chunksize = self.chunks[loc]
                 self.chunks[loc] = size
                 self.chunks[loc + size] = chunksize - size
@@ -93,7 +96,23 @@ class Heap:
 
 
 class WaveSim:
-    """A waveform-based combinational logic timing simulator."""
+    """A waveform-based combinational logic timing simulator running on CPU.
+
+    :param circuit: The circuit to simulate.
+    :param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details)
+    :param sims: The number of parallel simulations.
+    :param wavecaps: The number of floats available in each waveform. Waveforms encode the signal switching
+        history by storing transition times. The waveform capacity roughly corresponds to the number of transitions
+        that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are
+        generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow
+        flag is set. If an integer is given, all waveforms are set to that same capacity. With an array of length
+        ``len(circuit.lines)`` the capacity can be controlled for each intermediate waveform individually.
+    :param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time
+        by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes
+        are ignored.
+    :param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
+        memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
+    """
     def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
         self.circuit = circuit
         self.sims = sims
@@ -104,7 +123,7 @@ class WaveSim:
 
         self.cdata = np.zeros((len(self.interface), sims, 7), dtype='float32')
 
-        if type(wavecaps) is int:
+        if isinstance(wavecaps, int):
             wavecaps = [wavecaps] * len(circuit.lines)
 
         intf_wavecap = 4  # sufficient for storing only 1 transition.
@@ -118,7 +137,7 @@ class WaveSim:
 
         # translate circuit structure into self.ops
         ops = []
-        interface_dict = dict([(n, i) for i, n in enumerate(self.interface)])
+        interface_dict = dict((n, i) for i, n in enumerate(self.interface))
         for n in circuit.topological_order():
             if n in interface_dict:
                 inp_idx = self.ppi_offset + interface_dict[n]
@@ -152,7 +171,7 @@ class WaveSim:
                     ops.append((0b0110, o0_idx, i0_idx, i1_idx))
                 elif kind.startswith('xnor'):
                     ops.append((0b1001, o0_idx, i0_idx, i1_idx))
-                elif kind.startswith('not') or kind.startswith('inv'):
+                elif kind.startswith('not') or kind.startswith('inv') or kind.startswith('ibuf'):
                     ops.append((0b0101, o0_idx, i0_idx, i1_idx))
                 elif kind.startswith('buf') or kind.startswith('nbuf'):
                     ops.append((0b1010, o0_idx, i0_idx, i1_idx))
@@ -173,7 +192,7 @@ class WaveSim:
                     prev_line = prev_line.driver.ins[0]
                 stem_idx = prev_line.index
                 for ol in f.outs:
-                    stems[ol.index] = stem_idx
+                    stems[ol] = stem_idx
 
         # calculate level (distance from PI/PPI) and reference count for each line
         levels = np.zeros(self.sat_length, dtype='int32')
@@ -211,7 +230,7 @@ class WaveSim:
                 self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0
                 ref_count[self.ppi_offset + i] += 1
             if len(n.ins) > 0:
-                i0_idx = stems[n.ins[0].index] if stems[n.ins[0].index] >= 0 else n.ins[0].index
+                i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
                 ref_count[i0_idx] += 1
 
         # allocate memory for the rest of the circuit
@@ -240,7 +259,7 @@ class WaveSim:
         # copy memory location to PO/PPO area
         for i, n in enumerate(self.interface):
             if len(n.ins) > 0:
-                self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index]
+                self.sat[self.ppo_offset + i] = self.sat[n.ins[0]]
 
         # pad timing
         self.timing = np.zeros((self.sat_length, 2, 2))
@@ -253,15 +272,32 @@ class WaveSim:
         m0 = ~m1
         self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
 
+    def __repr__(self):
+        total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.cdata.nbytes
+        return f'<WaveSim {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
+
     def get_line_delay(self, line, polarity):
+        """Returns the current delay of the given ``line`` and ``polarity`` in the simulation model."""
         return self.timing[line, 0, polarity]
 
     def set_line_delay(self, line, polarity, delay):
+        """Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model."""
         self.timing[line, 0, polarity] = delay
 
     def assign(self, vectors, time=0.0, offset=0):
+        """Assigns new values to the primary inputs and state-elements.
+
+        :param vectors: The values to assign, preferably in 8-valued logic. The values are converted to
+            appropriate waveforms with either one transition (``RISE``, ``FALL``) or no transitions
+            (``ZERO``, ``ONE``, and others).
+        :type vectors: :py:class:`~kyupy.logic.BPArray`
+        :param time: The transition time of the generated waveforms.
+        :param offset: The offset into the vector set. The vector assigned to the first simulator is
+            ``vectors[offset]``.
+        """
         nvectors = min(len(vectors) - offset, self.sims)
-        for i, node in enumerate(self.interface):
+        for i in range(len(self.interface)):
             ppi_loc = self.sat[self.ppi_offset + i, 0]
             if ppi_loc < 0: continue
             for p in range(nvectors):
@@ -283,16 +319,21 @@ class WaveSim:
                 self.state[ppi_loc + toggle, p] = TMAX
 
     def propagate(self, sims=None, sd=0.0, seed=1):
-        if sims is None:
-            sims = self.sims
-        else:
-            sims = min(sims, self.sims)
+        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
+
+        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
+        :param sd: Standard deviation for injection of random delay variation. Active, if value is positive.
+        :param seed: Random seed for delay variations.
+        """
+        sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             self.overflows += level_eval(self.ops, op_start, op_stop, self.state, self.sat, 0, sims,
                                          self.timing, sd, seed)
         self.lst_eat_valid = False
 
     def wave(self, line, vector):
+        # """Returns the desired waveform from the simulation state. Only valid, if simulator was
+        # instantiated with ``keep_waveforms=True``."""
         if line < 0:
             return [TMAX]
         mem, wcap, _ = self.sat[line]
@@ -306,7 +347,34 @@ class WaveSim:
     def wave_ppo(self, o, vector):
         return self.wave(self.ppo_offset + o, vector)
 
-    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
+    def capture(self, time=TMAX, sd=0.0, seed=1, cdata=None, offset=0):
+        """Simulates a capture operation at all state-elements and primary outputs.
+
+        The capture analyzes the propagated waveforms at and around the given capture time and returns
+        various results for each capture operation.
+
+        :param time: The desired capture time. By default, a capture of the settled value is performed.
+        :param sd: A standard deviation for uncertainty in the actual capture time.
+        :param seed: The random seed for a capture with uncertainty.
+        :param cdata: An array to copy capture data into (optional). See the return value for details.
+        :param offset: An offset into the supplied capture data array.
+        :return: The capture data as numpy array.
+
+            The 3-dimensional capture data array contains for each interface node (axis 0),
+            and each test (axis 1), seven values:
+
+            0. Probability of capturing a 1 at the given capture time (same as next value, if no
+               standard deviation given).
+            1. A capture value decided by random sampling according to above probability and given seed.
+            2. The final value (assume a very late capture time).
+            3. True, if there was a premature capture (capture error), i.e. final value is different
+               from captured value.
+            4. Earliest arrival time. The time at which the output transitioned from its initial value.
+            5. Latest stabilization time. The time at which the output transitioned to its final value.
+            6. Overflow indicator. If non-zero, some signals in the input cone of this output had more
+               transitions than specified in ``wavecaps``. Some transitions have been discarded, the
+               final values in the waveforms are still valid.
+        """
         for i, node in enumerate(self.interface):
             if len(node.ins) == 0: continue
             for p in range(self.sims):
@@ -319,7 +387,15 @@ class WaveSim:
         return self.cdata
 
     def reassign(self, time=0.0):
-        for i, node in enumerate(self.interface):
+        """Re-assigns the last capture to the appropriate pseudo-primary inputs. Generates a new set of
+        waveforms at the PPIs that start with the previous final value of that PPI, and transitions at the
+        given time to the value captured in a previous simulation. :py:func:`~WaveSim.capture` must be called
+        prior to this function. The final value of each PPI is taken from the randomly sampled concrete logic
+        values in the capture data.
+
+        :param time: The transition time at the inputs (usually 0.0).
+        """
+        for i in range(len(self.interface)):
             ppi_loc = self.sat[self.ppi_offset + i, 0]
             ppo_loc = self.sat[self.ppo_offset + i, 0]
             if ppi_loc < 0 or ppo_loc < 0: continue
@@ -384,8 +460,7 @@ class WaveSim:
                 accs[idx] += 1
         if s_sqrt2 == 0:
             return values
-        else:
-            return accs
+        return accs
 
     def vals(self, line, vector, times, sd=0):
         return self._vals(line, vector, times, sd)
@@ -462,7 +537,7 @@ def rand_gauss(seed, sd):
         return 1.0
     while True:
         x = -6.0
-        for i in range(12):
+        for _ in range(12):
             seed = int(0xDEECE66D) * seed + 0xB
             x += float((seed >> 8) & 0xffffff) / float(1 << 24)
         x *= sd
@@ -539,12 +614,17 @@ def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0):
         state[z_mem + z_cur, st_idx] = TMAX_OVL
     else:
         state[z_mem + z_cur, st_idx] = a if a > b else b  # propagate overflow flags by storing biggest TMAX from input
-        
+
     return overflows
 
 
 class WaveSimCuda(WaveSim):
-    """A GPU-accelerated waveform-based combinational logic timing simulator."""
+    """A GPU-accelerated waveform-based combinational logic timing simulator.
+
+    The API is the same as for :py:class:`WaveSim`.
+    All internal memories are mirrored into GPU memory upon construction.
+    Some operations like access to single waveforms can involve large communication overheads.
+    """
     def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
         super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
 
@@ -559,6 +639,12 @@ class WaveSimCuda(WaveSim):
 
         self._block_dim = (32, 16)
 
+    def __repr__(self):
+        total_mem = self.state.nbytes + self.sat.nbytes + self.ops.nbytes + self.timing.nbytes + \
+                    self.tdata.nbytes + self.cdata.nbytes
+        return f'<WaveSimCuda {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
+
     def get_line_delay(self, line, polarity):
         return self.d_timing[line, 0, polarity]
 
@@ -586,10 +672,7 @@ class WaveSimCuda(WaveSim):
         return gx, gy
 
     def propagate(self, sims=None, sd=0.0, seed=1):
-        if sims is None:
-            sims = self.sims
-        else:
-            sims = min(sims, self.sims)
+        sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             grid_dim = self._grid_dim(sims, op_stop - op_start)
             wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.sat, int(0),
@@ -599,10 +682,10 @@ class WaveSimCuda(WaveSim):
 
     def wave(self, line, vector):
         if line < 0:
-            return None
+            return [TMAX]
         mem, wcap, _ = self.sat[line]
         if mem < 0:
-            return None
+            return [TMAX]
         return self.d_state[mem:mem + wcap, vector]
 
     def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
@@ -655,7 +738,7 @@ def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
     if vector >= state.shape[-1]: return
     if ppo_offset + y >= len(sat): return
 
-    ppo, ppo_cap, _ = sat[ppo_offset + y]
+    ppo, _, _ = sat[ppo_offset + y]
     ppi, ppi_cap, _ = sat[ppi_offset + y]
     if ppo < 0: return
     if ppi < 0: return
@@ -765,7 +848,7 @@ def rand_gauss_dev(seed, sd):
         return 1.0
     while True:
         x = -6.0
-        for i in range(12):
+        for _ in range(12):
             seed = int(0xDEECE66D) * seed + 0xB
             x += float((seed >> 8) & 0xffffff) / float(1 << 24)
         x *= sd
diff --git a/tests/test_bench.py b/tests/test_bench.py
index 25b9b1b..44ddf7c 100644
--- a/tests/test_bench.py
+++ b/tests/test_bench.py
@@ -4,9 +4,9 @@ from kyupy import bench
 def test_b01(mydir):
     with open(mydir / 'b01.bench', 'r') as f:
         c = bench.parse(f.read())
-        assert 92 == len(c.nodes)
+        assert len(c.nodes) == 92
     c = bench.load(mydir / 'b01.bench')
-    assert 92 == len(c.nodes)
+    assert len(c.nodes) == 92
 
 
 def test_simple():
diff --git a/tests/test_logic_sim.py b/tests/test_logic_sim.py
index 990eec7..76edb95 100644
--- a/tests/test_logic_sim.py
+++ b/tests/test_logic_sim.py
@@ -49,7 +49,7 @@ def test_4v():
     assert mva[14] == 'X-XXX'
     assert mva[15] == 'XXXXX'
 
-    
+
 def test_8v():
     c = bench.parse('input(x, y) output(a, o, n, xo) a=and(x,y) o=or(x,y) n=not(x) xo=xor(x,y)')
     s = LogicSim(c, 64, m=8)
@@ -71,7 +71,7 @@ def test_8v():
 
     for i in range(64):
         assert resp[i] == mva[i]
-        
+
 
 def test_b01(mydir):
     c = bench.load(mydir / 'b01.bench')
diff --git a/tests/test_sdf.py b/tests/test_sdf.py
index 8b30b68..b09469e 100644
--- a/tests/test_sdf.py
+++ b/tests/test_sdf.py
@@ -1,5 +1,4 @@
 from kyupy import sdf, verilog
-from kyupy.saed import pin_index
 
 
 def test_parse():
@@ -81,20 +80,20 @@ def test_b14(mydir):
 def test_gates(mydir):
     c = verilog.load(mydir / 'gates.v')
     df = sdf.load(mydir / 'gates.sdf')
-    lt = df.annotation(c, pin_index, dataset=1)
+    lt = df.annotation(c, dataset=1)
     nand_a = c.cells['nandgate'].ins[0]
     nand_b = c.cells['nandgate'].ins[1]
     and_a = c.cells['andgate'].ins[0]
     and_b = c.cells['andgate'].ins[1]
 
-    assert lt[nand_a.index, 0, 0] == 0.103
-    assert lt[nand_a.index, 0, 1] == 0.127
+    assert lt[nand_a, 0, 0] == 0.103
+    assert lt[nand_a, 0, 1] == 0.127
 
-    assert lt[nand_b.index, 0, 0] == 0.086
-    assert lt[nand_b.index, 0, 1] == 0.104
+    assert lt[nand_b, 0, 0] == 0.086
+    assert lt[nand_b, 0, 1] == 0.104
 
-    assert lt[and_a.index, 0, 0] == 0.378
-    assert lt[and_a.index, 0, 1] == 0.377
+    assert lt[and_a, 0, 0] == 0.378
+    assert lt[and_a, 0, 1] == 0.377
 
-    assert lt[and_b.index, 0, 0] == 0.375
-    assert lt[and_b.index, 0, 1] == 0.370
+    assert lt[and_b, 0, 0] == 0.375
+    assert lt[and_b, 0, 1] == 0.370
diff --git a/tests/test_stil.py b/tests/test_stil.py
index 1f0d89b..63f19e4 100644
--- a/tests/test_stil.py
+++ b/tests/test_stil.py
@@ -3,7 +3,6 @@ from kyupy import stil
 
 def test_b14(mydir):
     s = stil.load(mydir / 'b14.stuck.stil.gz')
-    assert 10 == len(s.signal_groups)
-    assert 1 == len(s.scan_chains)
-    assert 2163 == len(s.calls)
-
+    assert len(s.signal_groups) == 10
+    assert len(s.scan_chains) == 1
+    assert len(s.calls) == 2163
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
index bea26d3..8ddb94d 100644
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -3,7 +3,6 @@ import numpy as np
 from kyupy.wave_sim import WaveSim, WaveSimCuda, wave_eval, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
 from kyupy import verilog, sdf, logic
-from kyupy.saed import pin_index
 from kyupy.logic import MVArray, BPArray
 
 
@@ -19,7 +18,7 @@ def test_wave_eval():
     line_times[1, 0, 1] = 0.4
     line_times[1, 1, 0] = 0.3
     line_times[1, 1, 1] = 0.4
-    
+
     state = np.zeros((3*16, 1)) + TMAX  # 3 waveforms of capacity 16
     state[::16, 0] = 16  # first entry is capacity
     a = state[0:16, 0]
@@ -31,29 +30,29 @@ def test_wave_eval():
     sat[2] = 32, 16, 0
 
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMIN == z[0]
+    assert z[0] == TMIN
 
     a[0] = TMIN
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMIN == z[0]
-    
+    assert z[0] == TMIN
+
     b[0] = TMIN
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMAX == z[0]
+    assert z[0] == TMAX
 
     a[0] = 1  # A _/^^^
     b[0] = 2  # B __/^^
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMIN == z[0]  # ^^^\___ B -> Z fall delay
-    assert 2.4 == z[1]
-    assert TMAX == z[2]
+    assert z[0] == TMIN  # ^^^\___ B -> Z fall delay
+    assert z[1] == 2.4
+    assert z[2] == TMAX
 
     a[0] = TMIN  # A ^^^^^^
     b[0] = TMIN  # B ^^^\__
     b[1] = 2
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert 2.3 == z[0]  # ___/^^^ B -> Z rise delay
-    assert TMAX == z[1]
+    assert z[0] == 2.3  # ___/^^^ B -> Z rise delay
+    assert z[1] == TMAX
 
     # pos pulse of 0.35 at B -> 0.45 after delays
     a[0] = TMIN  # A ^^^^^^^^
@@ -61,9 +60,9 @@ def test_wave_eval():
     b[1] = 2     # B ^^\__/^^
     b[2] = 2.35
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert 2.3 == z[0]  # __/^^\__
-    assert 2.75 == z[1]
-    assert TMAX == z[2]
+    assert z[0] == 2.3  # __/^^\__
+    assert z[1] == 2.75
+    assert z[2] == TMAX
 
     # neg pulse of 0.45 at B -> 0.35 after delays
     a[0] = TMIN  # A ^^^^^^^^
@@ -71,10 +70,10 @@ def test_wave_eval():
     b[1] = 2.45
     b[2] = TMAX
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMIN == z[0]  # ^^\__/^^
-    assert 2.4 == z[1]
-    assert 2.75 == z[2]
-    assert TMAX == z[3]
+    assert z[0] == TMIN  # ^^\__/^^
+    assert z[1] == 2.4
+    assert z[2] == 2.75
+    assert z[3] == TMAX
 
     # neg pulse of 0.35 at B -> 0.25 after delays (filtered)
     a[0] = TMIN  # A ^^^^^^^^
@@ -82,8 +81,8 @@ def test_wave_eval():
     b[1] = 2.35
     b[2] = TMAX
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMIN == z[0]  # ^^^^^^
-    assert TMAX == z[1]
+    assert z[0] == TMIN  # ^^^^^^
+    assert z[1] == TMAX
 
     # pos pulse of 0.25 at B -> 0.35 after delays (filtered)
     a[0] = TMIN  # A ^^^^^^^^
@@ -91,7 +90,7 @@ def test_wave_eval():
     b[1] = 2  # B ^^\__/^^
     b[2] = 2.25
     wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
-    assert TMAX == z[0]  # ______
+    assert z[0] == TMAX  # ______
 
 
 def compare_to_logic_sim(wsim):
@@ -118,7 +117,7 @@ def compare_to_logic_sim(wsim):
     exp_bp = BPArray(tests_bp)
     lsim.capture(exp_bp)
     exp = MVArray(exp_bp)
-    
+
     for i in range(8):
         exp_str = exp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
         res_str = resp[i].replace('R', '1').replace('F', '0').replace('P', '0').replace('N', '1')
@@ -128,7 +127,7 @@ def compare_to_logic_sim(wsim):
 def test_b14(mydir):
     c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
     df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c, pin_index)
+    lt = df.annotation(c)
     wsim = WaveSim(c, lt, 8)
     compare_to_logic_sim(wsim)
 
@@ -136,7 +135,7 @@ def test_b14(mydir):
 def test_b14_strip_forks(mydir):
     c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
     df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c, pin_index)
+    lt = df.annotation(c)
     wsim = WaveSim(c, lt, 8, strip_forks=True)
     compare_to_logic_sim(wsim)
 
@@ -144,6 +143,6 @@ def test_b14_strip_forks(mydir):
 def test_b14_cuda(mydir):
     c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
     df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c, pin_index)
+    lt = df.annotation(c)
     wsim = WaveSimCuda(c, lt, 8)
     compare_to_logic_sim(wsim)