diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c563798
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+**/__pycache__
+**/.ipynb_checkpoints
+**/.pytest_cache
+**/.DS_Store
+**/*.pyc
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ccf43e7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+KyuPy - Processing VLSI Circuits With Ease
+==========================================
+
+KyuPy is a python package for high-performance processing and analysis of
+non-hierarchical VLSI designs. Its purpose is to provide a rapid prototyping
+platform to aid and accelerate research in the fields of VLSI test, diagnosis
+and reliability. KyuPy is freely available under the MIT license.
+
+Main Features
+-------------
+
+* Partial [lark](https://github.com/lark-parser/lark)-parsers for common files used with synthesized designs: bench, gate-level verilog, standard delay format (SDF), standard test interface language (STIL)
+* Bit-parallel gate-level 2-, 4-, and 8-valued logic simulation
+* GPU-accelerated high-throughput gate-level timing simulation
+* High-performance through the use of [numpy](https://numpy.org) and [numba](https://numba.pydata.org)
+
+
+Getting Started
+---------------
+
+KyuPy requires python 3.6+ and the following packages:
+* [lark-parser](https://pypi.org/project/lark-parser)
+* [numpy](https://pypi.org/project/numpy)
+* [numba](https://pypi.org/project/numba) (required only for GPU/CUDA support)
+
+GPU/CUDA support may [require some additional setup](https://numba.pydata.org/numba-doc/latest/cuda/index.html). If CUDA or numba is not available, the package will automatically fall back to pure python execution.
+
+This repository contains tests that can be run with:
+```
+pytest
+```
+
+Usage examples to get familiar with the API can be found in the Jupyter Notebook [UsageExamples.ipynb](UsageExamples.ipynb).
diff --git a/UsageExamples.ipynb b/UsageExamples.ipynb
new file mode 100644
index 0000000..75fbe06
--- /dev/null
+++ b/UsageExamples.ipynb
@@ -0,0 +1,645 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading and Exploring Gate-Level Circuits"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Example of parsing the bench data format to make simple gate-level circuits."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0000000.334 W Cuda unavailable. Falling back to pure python\n"
+     ]
+    }
+   ],
+   "source": [
+    "from kyupy import bench\n",
+    "\n",
+    "# parse a file\n",
+    "b01 = bench.parse('tests/b01.bench')\n",
+    "\n",
+    "# ... or specify the circuit as string \n",
+    "mycircuit = bench.parse('input(a,b) output(o1,o2,o3) x=buf(a) o1=not(x) o2=buf(x) o3=buf(x)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Circuits are objects of the class `Circuit`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Circuit 'tests/b01' with 92 nodes, 130 lines, 4 ports>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "b01"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Circuit with 10 nodes, 8 lines, 5 ports>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mycircuit"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Circuits are containers for two types of elements: nodes and lines.\n",
+    "* A `Node` is a named entity in a circuit (e.g. a gate, a standard cell, a named signal, or a fan-out point) that has connections to other nodes.\n",
+    "* A `Line` is a directional 1:1 connection between two Nodes.\n",
+    "\n",
+    "Use the `dump()` method to get a string representation of all nodes and their connections."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None(0,1,2,3,4)\n",
+      "0:__fork__\"a\"  >1\n",
+      "1:__fork__\"b\"  \n",
+      "2:__fork__\"o1\" <2 \n",
+      "3:__fork__\"o2\" <4 \n",
+      "4:__fork__\"o3\" <6 \n",
+      "5:buf\"x\" <1 >0\n",
+      "6:__fork__\"x\" <0 >3 >5 >7\n",
+      "7:not\"o1\" <3 >2\n",
+      "8:buf\"o2\" <5 >4\n",
+      "9:buf\"o3\" <7 >6\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(mycircuit.dump())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The first line of the dump starts with the circuit name (\"None\" for `mycircuit`), followed by the node-IDs of all the ports (inputs and outputs) of the circuit.\n",
+    "\n",
+    "Each of the following lines describes one node.\n",
+    "Each node in the circuit has a unique ID, a type, a name, and line-connections. This information is given on each line in that order.\n",
+    "\n",
+    "A line in the circuit has a unique ID, a driver node and a receiver node. The connections in the dump show the direction (\">\" for output, \"<\" for input) and the line-ID. For example in `mycircuit`: Node-0 has one output connected to Line-1, and this Line-1 is connected to the input of Node-5.\n",
+    "\n",
+    "The `interface` is the list of nodes forming the ports (inputs and outputs):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[0:__fork__\"a\"  >1,\n",
+       " 1:__fork__\"b\"  ,\n",
+       " 2:__fork__\"o1\" <2 ,\n",
+       " 3:__fork__\"o2\" <4 ,\n",
+       " 4:__fork__\"o3\" <6 ]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mycircuit.interface"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Nodes\n",
+    "\n",
+    "There are two types of nodes: __forks__ and __cells__.\n",
+    "\n",
+    "Forks have the special type `__fork__` while cells can be of various types (`buf`, `not`, `and`, `nor`, etc.).\n",
+    "Forks are used to label signals with names and to connect one cell to multiple other cells (fan-out).\n",
+    "The names among all forks and among all cells within a circuit are unique.\n",
+    "Thus, a fork and a cell are allowed to share the same name.\n",
+    "\n",
+    "Nodes in circuits can be accessed by ID or by name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7:not\"o1\" <3 >2"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mycircuit.nodes[7]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "6:__fork__\"x\" <0 >3 >5 >7"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mycircuit.forks['x']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "5:buf\"x\" <1 >0"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mycircuit.cells['x']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Nodes have an `index` (the node ID), a `kind` (the type), a `name`, as well as `ins` (input pins) and `outs` (output pins)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6, '__fork__', 'x', [0], [3, 5, 7])"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "n = mycircuit.nodes[6]\n",
+    "n.index, n.kind, n.name, n.ins, n.outs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The inputs and outputs of a node are lists containing `Line` objects."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "kyupy.circuit.Line"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(n.ins[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lines\n",
+    "\n",
+    "A line is a directional connection between one driving node (`driver`) and one reading node (`reader`).\n",
+    "\n",
+    "A line also knows which node pins it is connected to: `driver_pin`, `reader_pin`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(5, 6:__fork__\"x\" <0 >3 >5 >7, 8:buf\"o2\" <5 >4, 1, 0)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "l = mycircuit.nodes[6].outs[1]\n",
+    "l.index, l.driver, l.reader, l.driver_pin, l.reader_pin"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Basic Analysis Examples\n",
+    "### Cell type statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "defaultdict(<class 'int'>, {'DFF': 5, 'AND': 1, 'NAND': 28, 'OR': 1, 'NOT': 10})\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import defaultdict\n",
+    "\n",
+    "counts = defaultdict(int)\n",
+    "\n",
+    "for n in b01.cells.values():\n",
+    "    counts[n.kind] += 1\n",
+    "\n",
+    "print(counts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Tracing a scan chain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Circuit 'b14' with 15864 nodes, 23087 lines, 91 ports>"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from kyupy import verilog\n",
+    "\n",
+    "b14 = verilog.parse('tests/b14.v.gz')\n",
+    "b14"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "chain length 229\n",
+      "['Scan_Out', 'u04_opt1329', 'u04_opt1328', 'wr_reg', 'u04_opt11', 'state_reg_0_0', 'reg3_reg_28_0', 'reg3_reg_27_0', 'reg3_reg_26_0', 'reg3_reg_25_0', 'reg3_reg_24_0', 'u04_opt1123', 'reg3_reg_23_0', 'reg3_reg_22_0', 'reg3_reg_21_0', 'u04_opt1118', 'reg3_reg_20_0', 'reg3_reg_19_0', 'reg3_reg_18_0', 'reg3_reg_17_0', 'reg3_reg_16_0', 'reg3_reg_15_0', 'reg3_reg_14_0', 'reg3_reg_13_0', 'reg3_reg_12_0', 'reg3_reg_11_0', 'reg3_reg_10_0', 'reg3_reg_9_0', 'reg3_reg_8_0', 'reg3_reg_7_0', 'reg3_reg_6_0', 'reg3_reg_5_0', 'reg3_reg_4_0', 'reg3_reg_3_0', 'reg3_reg_2_0', 'reg3_reg_1_0', 'reg3_reg_0_0', 'reg2_reg_31_0', 'reg2_reg_30_0', 'reg2_reg_29_0', 'reg2_reg_28_0', 'reg2_reg_27_0', 'reg2_reg_26_0', 'reg2_reg_25_0', 'reg2_reg_24_0', 'reg2_reg_23_0', 'reg2_reg_22_0', 'reg2_reg_21_0', 'reg2_reg_20_0', 'reg2_reg_19_0', 'reg2_reg_18_0', 'reg2_reg_17_0', 'reg2_reg_16_0', 'reg2_reg_15_0', 'reg2_reg_14_0', 'reg2_reg_13_0', 'reg2_reg_12_0', 'reg2_reg_11_0', 'reg2_reg_10_0', 'reg2_reg_9_0', 'reg2_reg_8_0', 'reg2_reg_7_0', 'reg2_reg_6_0', 'reg2_reg_5_0', 'reg2_reg_4_0', 'reg2_reg_3_0', 'reg2_reg_2_0', 'reg2_reg_1_0', 'reg2_reg_0_0', 'reg1_reg_31_0', 'reg1_reg_30_0', 'reg1_reg_29_0', 'reg1_reg_28_0', 'reg1_reg_27_0', 'reg1_reg_26_0', 'reg1_reg_25_0', 'reg1_reg_24_0', 'reg1_reg_23_0', 'reg1_reg_22_0', 'reg1_reg_21_0', 'reg1_reg_20_0', 'reg1_reg_19_0', 'reg1_reg_18_0', 'reg1_reg_17_0', 'reg1_reg_16_0', 'reg1_reg_15_0', 'reg1_reg_14_0', 'reg1_reg_13_0', 'reg1_reg_12_0', 'reg1_reg_11_0', 'reg1_reg_10_0', 'reg1_reg_9_0', 'reg1_reg_8_0', 'reg1_reg_7_0', 'reg1_reg_6_0', 'reg1_reg_5_0', 'reg1_reg_4_0', 'reg1_reg_3_0', 'reg1_reg_2_0', 'reg1_reg_1_0', 'reg1_reg_0_0', 'reg0_reg_31_0', 'reg0_reg_30_0', 'reg0_reg_29_0', 'reg0_reg_28_0', 'reg0_reg_27_0', 'reg0_reg_26_0', 'reg0_reg_25_0', 'reg0_reg_24_0', 'reg0_reg_23_0', 'reg0_reg_22_0', 'reg0_reg_21_0', 'reg0_reg_20_0', 'reg0_reg_19_0', 'reg0_reg_18_0', 'reg0_reg_17_0', 'reg0_reg_16_0', 'reg0_reg_15_0', 'reg0_reg_14_0', 'reg0_reg_13_0', 
'reg0_reg_12_0', 'reg0_reg_11_0', 'reg0_reg_10_0', 'reg0_reg_9_0', 'reg0_reg_8_0', 'reg0_reg_7_0', 'reg0_reg_6_0', 'reg0_reg_5_0', 'reg0_reg_4_0', 'reg0_reg_3_0', 'reg0_reg_2_0', 'reg0_reg_1_0', 'reg0_reg_0_0', 'rd_reg', 'datao_reg_31_0', 'datao_reg_30_0', 'datao_reg_29_0', 'datao_reg_28_0', 'datao_reg_27_0', 'datao_reg_26_0', 'datao_reg_25_0', 'datao_reg_24_0', 'datao_reg_23_0', 'datao_reg_22_0', 'datao_reg_21_0', 'datao_reg_20_0', 'datao_reg_19_0', 'datao_reg_18_0', 'datao_reg_17_0', 'datao_reg_16_0', 'datao_reg_15_0', 'datao_reg_14_0', 'datao_reg_13_0', 'datao_reg_12_0', 'datao_reg_11_0', 'datao_reg_10_0', 'datao_reg_9_0', 'datao_reg_8_0', 'datao_reg_7_0', 'datao_reg_6_0', 'datao_reg_5_0', 'datao_reg_4_0', 'datao_reg_3_0', 'datao_reg_2_0', 'datao_reg_1_0', 'datao_reg_0_0', 'd_reg_1_0', 'd_reg_0_0', 'addr_reg_19_0', 'addr_reg_18_0', 'addr_reg_17_0', 'addr_reg_16_0', 'addr_reg_15_0', 'addr_reg_14_0', 'addr_reg_13_0', 'addr_reg_12_0', 'addr_reg_11_0', 'addr_reg_10_0', 'addr_reg_9_0', 'addr_reg_8_0', 'addr_reg_7_0', 'addr_reg_6_0', 'addr_reg_5_0', 'addr_reg_4_0', 'addr_reg_3_0', 'addr_reg_2_0', 'addr_reg_1_0', 'addr_reg_0_0', 'u04_opt1491', 'u04_opt1492', 'u04_opt1364', 'u04_opt1411', 'IR_reg_31_0', 'IR_reg_30_0', 'IR_reg_29_0', 'u04_opt1427', 'IR_reg_28_0', 'IR_reg_27_0', 'IR_reg_26_0', 'IR_reg_25_0', 'IR_reg_24_0', 'IR_reg_23_0', 'IR_reg_22_0', 'IR_reg_21_0', 'IR_reg_20_0', 'IR_reg_19_0', 'IR_reg_18_0', 'IR_reg_17_0', 'IR_reg_16_0', 'IR_reg_15_0', 'IR_reg_14_0', 'IR_reg_13_0', 'IR_reg_12_0', 'IR_reg_11_0', 'IR_reg_10_0', 'IR_reg_9_0', 'IR_reg_8_0', 'IR_reg_7_0', 'IR_reg_6_0', 'IR_reg_5_0', 'IR_reg_4_0', 'IR_reg_3_0', 'IR_reg_2_0', 'u04_opt1347', 'IR_reg_1_0', 'U14573', 'IR_reg_0_0', 'B_reg', 'Scan_In']\n"
+     ]
+    }
+   ],
+   "source": [
+    "chain = []\n",
+    "cell = b14.cells['Scan_Out']\n",
+    "chain.append(cell)\n",
+    "while len(cell.ins) > 0:\n",
+    "    cell = cell.ins[2 if 'SDFF' in cell.kind else 0].driver\n",
+    "    if '__fork__' not in cell.kind:\n",
+    "        chain.append(cell)\n",
+    "        \n",
+    "print('chain length', len(chain))\n",
+    "print([c.name for c in chain])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading SDFs and STILs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kyupy import verilog, sdf\n",
+    "from kyupy.saed import pin_index\n",
+    "from kyupy import stil\n",
+    "\n",
+    "b14 = verilog.parse('tests/b14.v.gz')\n",
+    "df = sdf.parse('tests/b14.sdf.gz')\n",
+    "lt = df.annotation(b14, pin_index, interconnect=False)\n",
+    "s = stil.parse('tests/b14.stil.gz')\n",
+    "t = s.tests8v(b14)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[[0., 0.],\n",
+       "        [0., 0.]],\n",
+       "\n",
+       "       [[0., 0.],\n",
+       "        [0., 0.]],\n",
+       "\n",
+       "       [[0., 0.],\n",
+       "        [0., 0.]],\n",
+       "\n",
+       "       ...,\n",
+       "\n",
+       "       [[0., 0.],\n",
+       "        [0., 0.]],\n",
+       "\n",
+       "       [[0., 0.],\n",
+       "        [0., 0.]],\n",
+       "\n",
+       "       [[0., 0.],\n",
+       "        [0., 0.]]])"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'00-RFRF01F10FFRFF1FR1F1RR010F0F1RRR-------F------------------------------------------------11110110011100110111111110111000010000001111010111001111110110010101100100001000101001101010010011010000001111110111101110110001011010100011010001111010011101001000011111011101111101010111001100100011111100000101110'"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 32 Parallel Time Simulations with Waveform Capacity 16\n",
+    "\n",
+    "This code will fall back to pure python if no CUDA card is available. This will be quite slow.\n",
+    "\n",
+    "Instantiate simulator:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kyupy.wave_sim_cuda import WaveSimCuda, TMAX\n",
+    "import numpy as np\n",
+    "\n",
+    "wsim = WaveSimCuda(b14, lt, sims=32, wavecaps=16)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Main Simulation Loop"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nvectors = 32 #len(t)\n",
+    "r = np.zeros((len(wsim.interface), nvectors, 1))\n",
+    "\n",
+    "for offset in range(0, nvectors, wsim.sims):\n",
+    "    wsim.assign(t, offset=offset)\n",
+    "    wsim.propagate(sims=nvectors-offset)\n",
+    "    cdata = wsim.capture(time=TMAX, offset=offset)\n",
+    "    r = cdata[...,0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Output some captured data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(306, 32, 6)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cdata.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0., 0., 0., ..., 0., 0., 0.],\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
+       "       ...,\n",
+       "       [1., 1., 1., ..., 1., 1., 1.],\n",
+       "       [0., 0., 0., ..., 0., 0., 0.],\n",
+       "       [0., 0., 0., ..., 1., 1., 1.]], dtype=float32)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "r"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Check for CUDA Support\n",
+    "\n",
+    "Try this code to check if CUDA is set up correctly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import cuda\n",
+    "\n",
+    "cuda.detect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/kyupy/__init__.py b/kyupy/__init__.py
new file mode 100644
index 0000000..087659c
--- /dev/null
+++ b/kyupy/__init__.py
@@ -0,0 +1,106 @@
+"""This package provides tools for high-performance processing and validation
+of non-hierarchical VLSI circuits to aid rapid prototyping of research code
+in the fields of VLSI test, diagnosis and reliability.
+"""
+
+import time
+import importlib.util
+
+
+class Log:
+    """Minimal logger that prefixes each message with the seconds elapsed since its creation."""
+    def __init__(self):
+        self.start = time.perf_counter()  # reference point for the time stamps
+        self.logfile = None  # assign an object with write()/flush() to redirect output from stdout
+
+    def log(self, level, message):
+        """Write `message`, tagged with `level`, to `self.logfile` if it is set and to stdout otherwise."""
+        stamp = f'{time.perf_counter() - self.start:011.3f}'
+        if self.logfile is not None:
+            self.logfile.write(f'{stamp} {level} {message}\n')
+            self.logfile.flush()
+        else:
+            print(f'{stamp} {level} {message}')
+
+    def info(self, message): self.log('-', message)
+    def warn(self, message): self.log('W', message)
+    def error(self, message): self.log('E', message)
+
+
+log = Log()
+
+
+class MockNumba:
+    """Pure-python stand-in for the `numba` module when numba is not installed."""
+    @staticmethod
+    def njit(func):
+        # Nothing is compiled; handing back `func` itself keeps its name and docstring intact.
+        return func
+
+
+class MockCuda:
+    """Pure-python stand-in for `numba.cuda` that runs the threads of a kernel one after another."""
+    def __init__(self):
+        self.x = 0  # x and y: coordinates of the emulated thread currently running, see `grid()`
+        self.y = 0
+
+    def jit(self, device=False):
+        """Return a decorator that wraps a function in a launcher; `device` is accepted but not used."""
+        mock = self
+
+        class Launcher(object):
+            def __init__(self, funcc):
+                self.func = funcc
+
+            def __call__(self, *args, **kwargs):
+                # Direct call (device function style): simply run the wrapped function.
+                return self.func(*args, **kwargs)
+
+            def __getitem__(self, item):
+                # Launch syntax `kernel[grid_dim, block_dim](...)`: run every thread of the 2D grid in turn.
+                grid_dim, block_dim = item
+                def launch(*args, **kwargs):
+                    for gx in range(grid_dim[0]):
+                        for gy in range(grid_dim[1]):
+                            for bx in range(block_dim[0]):
+                                for by in range(block_dim[1]):
+                                    mock.x = gx * block_dim[0] + bx
+                                    mock.y = gy * block_dim[1] + by
+                                    self.func(*args, **kwargs)
+                return launch
+
+        return Launcher  # calling `Launcher(func)` is the decoration step
+
+    @staticmethod
+    def to_device(array, to=None):
+        """Copy `array` into `to` and return `to`; without `to`, return a copy of `array`."""
+        if to is None:
+            return array.copy()
+        to[...] = array
+        return to
+
+    def synchronize(self):
+        """Do nothing; a mocked kernel has already finished when its launch call returns."""
+
+    def grid(self, dims):
+        """Return the (x, y) coordinates of the current mocked thread; `dims` is ignored."""
+        return self.x, self.y
+
+
+if importlib.util.find_spec('numba') is not None:  # numba is installed
+    import numba
+    import numba.cuda
+    from numba.cuda.cudadrv.error import CudaSupportError
+    try:
+        list(numba.cuda.gpus)  # probe for GPUs; expected to raise CudaSupportError if CUDA is unusable
+        from numba import cuda
+    except CudaSupportError:
+        log.warn('Cuda unavailable. Falling back to pure python')
+        cuda = MockCuda()  # keep the real numba, emulate only its CUDA part
+else:  # numba is not installed: both `numba` and `cuda` are replaced by the mocks
+    numba = MockNumba()
+    cuda = MockCuda()
+    log.warn('Numba unavailable. Falling back to pure python')
+
+
+
diff --git a/kyupy/bench.py b/kyupy/bench.py
new file mode 100644
index 0000000..cf0662d
--- /dev/null
+++ b/kyupy/bench.py
@@ -0,0 +1,43 @@
+from lark import Lark, Transformer
+from .circuit import Circuit, Node, Line
+
+
+class BenchTransformer(Transformer):
+    """Lark transformer that assembles a `Circuit` from the statements of a bench description."""
+    def __init__(self, name):
+        super().__init__()
+        self.c = Circuit(name)  # circuit under construction, returned by `start`
+
+    def start(self, _): return self.c
+
+    def parameters(self, args): return [self.c.get_or_add_fork(str(name)) for name in args]  # plain str names
+
+    def interface(self, args): self.c.interface.extend(args[0])  # forks of an INPUT(...) or OUTPUT(...)
+
+    def assignment(self, args):
+        name, cell_type, drivers = args
+        cell = Node(self.c, str(name), str(cell_type))
+        Line(self.c, cell, self.c.get_or_add_fork(str(name)))  # the cell drives the fork of the same name
+        for d in drivers: Line(self.c, d, cell)  # plain loop: the lines are created for their side effect
+        
+
+def parse(bench):
+    """Parse bench text, or the bench file named by `bench` if it contains no '(', into a `Circuit`."""
+    grammar = r"""
+    start: (statement)*
+    statement: input | output | assignment
+    input: ("INPUT" | "input") parameters -> interface
+    output: ("OUTPUT" | "output") parameters -> interface
+    assignment: NAME "=" NAME parameters
+    parameters: "(" [ NAME ( "," NAME )* ] ")"
+    NAME: /[-_a-z0-9]+/i
+    %ignore ( /\r?\n/ | "#" /[^\n]*/ | /[\t\f ]/ )+
+    """
+    if '(' in str(bench):  # Parentheses?: Assuming it is bench text, not a file name.
+        name, text = None, bench
+    else:  # the circuit is named after the file, minus a trailing '.bench' only
+        name = str(bench)[:-len('.bench')] if str(bench).endswith('.bench') else str(bench)
+        with open(bench, 'r') as f:
+            text = f.read()
+    return Lark(grammar, parser="lalr", transformer=BenchTransformer(name)).parse(text)
+
diff --git a/kyupy/bittools.py b/kyupy/bittools.py
new file mode 100644
index 0000000..df4c033
--- /dev/null
+++ b/kyupy/bittools.py
@@ -0,0 +1,23 @@
+import numpy as np
+import importlib.util
+if importlib.util.find_spec('numba') is not None:
+    import numba
+else:
+    from . import numba
+    print('Numba unavailable. Falling back to pure python')
+
+
+_pop_count_lut = np.asarray([f'{x:b}'.count('1') for x in range(256)])  # number of 1-bits per byte value
+
+
+def popcount(a):  # total number of 1-bits in the byte value(s) `a`, looked up in the 256-entry table
+    return _pop_count_lut[a].sum()
+
+
+_bit_in_lut = np.array([128 >> x for x in range(8)], dtype='uint8')  # single-bit masks, highest bit first
+
+
+@numba.njit
+def bit_in(a, pos):  # bit `pos` of `a`, counting from the highest bit of a[0]; non-zero iff the bit is set
+    return _bit_in_lut[pos & 7] & a[pos >> 3]
+
diff --git a/kyupy/circuit.py b/kyupy/circuit.py
new file mode 100644
index 0000000..bad9a67
--- /dev/null
+++ b/kyupy/circuit.py
@@ -0,0 +1,236 @@
+from collections import deque
+
+
+class GrowingList(list):
+    """A list that pads itself with `None` when an item is assigned beyond its current end."""
+    def __setitem__(self, index, value):
+        self.extend([None] * (index + 1 - len(self)))  # adds nothing unless index >= len(self)
+        super().__setitem__(index, value)
+
+
+class IndexList(list):
+    """A list whose `del` fills the gap with the last element, updating that element's `.index`."""
+    def __delitem__(self, index):
+        if not 0 <= index < len(self):  # check before mutating: a bad index used to drop the last entry
+            raise IndexError(f'index {index} out of range')
+        replacement = self.pop()
+        if index < len(self):  # the deleted entry was not the last one: move the former last entry there
+            replacement.index, self[index] = index, replacement
+
+
+class Node:
+    """A Node is a named entity in a circuit (e.g. a gate, a standard cell,
+    a named signal, or a fan-out point) that has connections to other nodes.
+    Each node contains:
+
+    * `self.index`: a circuit-unique integer index.
+    * `self.kind`: a type describing its function (e.g. 'AND', 'NOR').
+      The type '__fork__' is special. It signifies a named signal
+      or a fan-out in the circuit. Any other type is considered a physical cell.
+    * `self.name`: a name. Names must be unique among all forks and all cells
+      in the circuit. However, a fork (`self.kind=='__fork__'`) and a cell with
+      the same name may coexist.
+    * `self.ins`: a list of input connections (objects of class `Line`)
+    * `self.outs`: a list of output connections (objects of class `Line`).
+    * `self.circuit`: the circuit the node is registered in (`None` after `remove()`).
+    """
+    def __init__(self, circuit, name, kind='__fork__'):
+        """Create the node and register it in `circuit`; raise ValueError if the name is taken."""
+        is_fork = kind == '__fork__'
+        registry = circuit.forks if is_fork else circuit.cells  # forks and cells have separate name spaces
+        if name in registry:
+            raise ValueError(f'{"fork" if is_fork else "cell"} of name {name} already exists.')
+        registry[name] = self
+        self.index = len(circuit.nodes)  # the position this node gets in `circuit.nodes`
+        circuit.nodes.append(self)
+        self.circuit = circuit
+        self.name = name
+        self.kind = kind
+        self.ins = GrowingList()
+        self.outs = GrowingList()
+
+    def __repr__(self):
+        """Format as `index:kind"name" <input line ids >output line ids`; unconnected pins show as None."""
+        ins = ' '.join('<None' if line is None else f'<{line.index}' for line in self.ins)
+        outs = ' '.join('>None' if line is None else f'>{line.index}' for line in self.outs)
+        return f'{self.index}:{self.kind}"{self.name}" {ins} {outs}'
+
+    def remove(self):
+        """Unregister the node from its circuit; a second call does nothing. Lines are left untouched."""
+        if self.circuit is None:
+            return
+        del self.circuit.nodes[self.index]
+        registry = self.circuit.forks if self.kind == '__fork__' else self.circuit.cells
+        del registry[self.name]
+        self.circuit = None
+
+
+class Line:
+    """A Line is a directional 1:1 connection between two Nodes. It always
+    connects an output of a node (called `driver`) to an input of a node
+    (called `reader`) and has a circuit-unique index (`self.index`).
+
+    Furthermore, `self.driver_pin` and `self.reader_pin` are the
+    integer indices of the connected pins of the nodes. They always correspond
+    to the positions of the line in the connection lists of the nodes:
+
+    * `self.driver.outs[self.driver_pin] == self`
+    * `self.reader.ins[self.reader_pin] == self`
+
+    A Line always connects a single driver to a single reader. If a signal fans out to
+    multiple readers, a '__fork__' Node needs to be added.
+    """
+    def __init__(self, circuit, driver, reader):
+        """Add a line to `circuit`. `driver` and `reader` are each either a `Node` (its first
+        unconnected pin is used, or a new pin if all are connected) or a `(node, pin)` tuple."""
+        self.index = len(circuit.lines)
+        circuit.lines.append(self)
+        # isinstance() rather than an exact type check, so that subclasses of Node are accepted too.
+        if isinstance(driver, Node):
+            driver = (driver, self._free_pin(driver.outs))
+        if isinstance(reader, Node):
+            reader = (reader, self._free_pin(reader.ins))
+        self.driver, self.driver_pin = driver
+        self.reader, self.reader_pin = reader
+        self.driver.outs[self.driver_pin] = self
+        self.reader.ins[self.reader_pin] = self
+
+    @staticmethod
+    def _free_pin(pins):
+        """Return the index of the first `None` entry of `pins`, or `len(pins)` if there is none."""
+        return next((pin for pin, line in enumerate(pins) if line is None), len(pins))
+
+    def remove(self):
+        """Disconnect the line from both nodes and delete it from the circuit of its nodes."""
+        circuit = None
+        if self.driver is not None:
+            self.driver.outs[self.driver_pin] = None
+            circuit = self.driver.circuit
+        if self.reader is not None:
+            self.reader.ins[self.reader_pin] = None
+            circuit = self.reader.circuit
+        # If both ends were set, the reader's circuit is the one used here.
+        if circuit is not None:
+            del circuit.lines[self.index]
+        self.driver = None
+        self.reader = None
+
+    def __repr__(self):
+        """Represent the line by its index alone."""
+        return f'{self.index}'
+
+    def __lt__(self, other):
+        """Order lines by their index."""
+        return self.index < other.index
+
+
+class Circuit:
+    """A Circuit is a container for interconnected nodes and lines.
+
+    All contained lines have unique indices, so have all contained nodes.
+    These indices can be used to store additional data about nodes or lines
+    by allocating an array `my_data` of length `len(self.nodes)` and then
+    accessing it by `my_data[n.index]`. The indices may change iff lines or
+    nodes are removed from the circuit.
+
+    Nodes come in two flavors (cells and forks, see `Node`). The names of
+    these nodes are kept unique within these two flavors.
+    """
+    def __init__(self, name=None):
+        self.name = name
+        self.nodes = IndexList()  # all nodes of the circuit
+        self.lines = IndexList()  # all lines of the circuit
+        self.interface = GrowingList()  # the nodes forming the ports
+        self.cells = {}  # name -> node, for all nodes that are not forks
+        self.forks = {}  # name -> node, for all forks
+
+    def get_or_add_fork(self, name):  # the fork called `name`; it is created first if it does not exist
+        return self.forks[name] if name in self.forks else Node(self, name)
+
+    def copy(self):
+        """Return a new circuit with equally named nodes and with lines between the same pins."""
+        c = Circuit(self.name)
+
+        def twin(node):  # the node of `c` that corresponds to `node` of this circuit
+            return (c.forks if node.kind == '__fork__' else c.cells)[node.name]
+
+        # Nodes and lines are re-created in list order, so each gets the list position of its original.
+        for node in self.nodes:
+            Node(c, node.name, node.kind)
+        for line in self.lines:
+            Line(c, (twin(line.driver), line.driver_pin), (twin(line.reader), line.reader_pin))
+        # The interface of the copy refers to the copied nodes, not to the originals.
+        c.interface.extend(twin(node) for node in self.interface)
+        return c
+
+    def dump(self):  # text dump: a header `name(indices of the port nodes)`, then one line per node
+        header = f'{self.name}({",".join([str(n.index) for n in self.interface])})\n'
+        return header + '\n'.join([str(n) for n in self.nodes])
+
+    def __repr__(self):
+        name = f" '{self.name}'" if self.name else ''
+        return f'<Circuit{name} with {len(self.nodes)} nodes, {len(self.lines)} lines, {len(self.interface)} ports>'
+
+    def topological_order(self):
+        """Yield nodes from inputs towards outputs, starting at 'DFF' nodes and nodes without input pins."""
+        visit_count = [0] * len(self.nodes)
+        queue = deque(n for n in self.nodes if len(n.ins) == 0 or 'DFF' in n.kind)
+        while queue:
+            n = queue.popleft()
+            # A non-'DFF' reader is queued once it has been reached as often as it has input pins.
+            for succ in (line.reader for line in n.outs if line is not None):
+                visit_count[succ.index] += 1
+                if visit_count[succ.index] == len(succ.ins) and 'DFF' not in succ.kind:
+                    queue.append(succ)
+            yield n
+
+    def topological_line_order(self):
+        """Yield the connected output lines of all nodes, node by node in topological order."""
+        for n in self.topological_order():
+            for line in n.outs:
+                if line is not None: yield line
+
+    def reversed_topological_order(self):
+        """Yield nodes from outputs towards inputs, starting at 'DFF' nodes and nodes without output pins."""
+        visit_count = [0] * len(self.nodes)
+        queue = deque(n for n in self.nodes if len(n.outs) == 0 or 'DFF' in n.kind)
+        while queue:
+            n = queue.popleft()
+            for pred in (line.driver for line in n.ins if line is not None):  # skip unconnected input pins
+                visit_count[pred.index] += 1
+                if visit_count[pred.index] == len(pred.outs) and 'DFF' not in pred.kind:
+                    queue.append(pred)
+            yield n
+
+    def fanin(self, origin_nodes):
+        """Yield, in reversed topological order, `origin_nodes` and the nodes driving them (transitively)."""
+        marks = [False] * len(self.nodes)
+        for n in origin_nodes:
+            marks[n.index] = True
+        for n in self.reversed_topological_order():
+            if not marks[n.index]:
+                # A node is marked as soon as one of its readers is marked.
+                marks[n.index] = any(marks[line.reader.index] for line in n.outs if line is not None)
+            if marks[n.index]:
+                yield n
+
+    def fanout_free_regions(self):
+        """Yield `(stem, region)` pairs in reversed topological order.
+
+        A stem is a 'DFF' node or a node whose number of output pins is not 1. Its region lists the
+        non-'DFF' nodes with exactly one output pin that are found by walking backwards from the stem
+        through such nodes only."""
+        def inner_drivers(lines):  # drivers of `lines` that belong to a region; unconnected pins skipped
+            return [ln.driver for ln in lines
+                    if ln is not None and len(ln.driver.outs) == 1 and 'DFF' not in ln.driver.kind]
+
+        for stem in self.reversed_topological_order():
+            if len(stem.outs) == 1 and 'DFF' not in stem.kind: continue
+            region = []
+            # For a 'DFF' stem, only the line at input pin 0 is followed.
+            queue = deque(inner_drivers(stem.ins[:1] if 'DFF' in stem.kind else stem.ins))
+            while queue:
+                n = queue.popleft()
+                queue.extend(inner_drivers(n.ins))
+                region.append(n)
+            yield stem, region
diff --git a/kyupy/logic_sim.py b/kyupy/logic_sim.py
new file mode 100644
index 0000000..1ede8ee
--- /dev/null
+++ b/kyupy/logic_sim.py
@@ -0,0 +1,418 @@
+import numpy as np
+from . import packed_vectors
+
+
+class LogicSim:
+    """A bit-parallel naive combinational logic simulator supporting 2, 4, or 8-valued logics.
+    """
+    def __init__(self, circuit, nvectors=1, vdim=1):
+        self.circuit = circuit
+        self.nvectors = nvectors
+        nbytes = (nvectors - 1) // 8 + 1
+        self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+        self.state = np.zeros((len(circuit.lines), vdim, nbytes), dtype='uint8')
+        self.state_epoch = np.zeros(len(circuit.nodes), dtype='int8') - 1
+        self.tmp = np.zeros((5, vdim, nbytes), dtype='uint8')
+        self.zero = np.zeros((vdim, nbytes), dtype='uint8')
+        if vdim > 1:
+            self.zero[1] = 255
+        self.epoch = 0
+
+        self.fork_vd1 = self.fork_vdx
+        self.const0_vd1 = self.const0_vdx
+        self.input_vd1 = self.fork_vd1
+        self.output_vd1 = self.fork_vd1
+        self.inv_vd1 = self.not_vd1
+        self.nbuff_vd1 = self.fork_vd1
+        self.xor2_vd1 = self.xor_vd1
+        
+        self.fork_vd2 = self.fork_vdx
+        self.const0_vd2 = self.const0_vdx
+        self.input_vd2 = self.fork_vd2
+        self.output_vd2 = self.fork_vd2
+        self.inv_vd2 = self.not_vd2
+        self.nbuff_vd2 = self.fork_vd2
+        self.xor2_vd2 = self.xor_vd2
+        
+        self.fork_vd3 = self.fork_vdx
+        self.const0_vd3 = self.const0_vdx
+        self.input_vd3 = self.fork_vd3
+        self.output_vd3 = self.fork_vd3
+        self.inv_vd3 = self.not_vd3
+        self.nbuff_vd3 = self.fork_vd3
+        self.xor2_vd3 = self.xor_vd3
+        
+        known_fct = [(f[:-4], getattr(self, f)) for f in dir(self) if f.endswith(f'_vd{vdim}')]
+        self.node_fct = []
+        for n in circuit.nodes:
+            t = n.kind.lower().replace('__fork__', 'fork')
+            t = t.replace('__const0__', 'const0')
+            t = t.replace('__const1__', 'const1')
+            t = t.replace('tieh', 'const1')
+            # t = t.replace('xor', 'or').replace('xnor', 'nor')
+            fcts = [f for n, f in known_fct if t.startswith(n)]
+            if len(fcts) < 1:
+                raise ValueError(f'Unknown node kind {n.kind}')
+            self.node_fct.append(fcts[0])
+
+    def assign(self, stimuli):
+        if isinstance(stimuli, packed_vectors.PackedVectors):
+            stimuli = stimuli.bits
+        for (stim, node) in zip(stimuli, self.interface):
+            if len(node.outs) == 0: continue
+            outputs = [self.state[line.index] if line else self.tmp[3] for line in node.outs]
+            self.node_fct[node.index]([stim], outputs)
+            for line in node.outs:
+                if line:
+                    self.state_epoch[line.reader.index] = self.epoch
+        for n in self.circuit.nodes:
+            if (n.kind == '__const1__') or (n.kind == '__const0__'):
+                outputs = [self.state[line.index] if line else self.tmp[3] for line in n.outs]
+                self.node_fct[n.index]([], outputs)
+                # print('assign const')
+                for line in n.outs:
+                    if line:
+                        self.state_epoch[line.reader.index] = self.epoch
+
+    def capture(self, responses):
+        if isinstance(responses, packed_vectors.PackedVectors):
+            responses = responses.bits
+        for (resp, node) in zip(responses, self.interface):
+            if len(node.ins) == 0: continue
+            resp[...] = self.state[node.ins[0].index]
+
+    def propagate(self):
+        """Evaluate every node marked for the current epoch in topological order, then advance the epoch."""
+        for node in self.circuit.topological_order():
+            if self.state_epoch[node.index] != self.epoch: continue
+            inputs = [self.state[line.index] if line else self.zero for line in node.ins]
+            outputs = [self.state[line.index] if line else self.tmp[3] for line in node.outs]
+            self.node_fct[node.index](inputs, outputs)
+            for line in node.outs:  # unconnected outputs are falsy (None) and have no reader to mark
+                if line: self.state_epoch[line.reader.index] = self.epoch
+        self.epoch = (self.epoch + 1) % 128
+
+    @staticmethod
+    def fork_vdx(inputs, outputs):
+        for o in outputs: o[...] = inputs[0]
+    
+    def const0_vdx(self, _, outputs):
+        for o in outputs: o[...] = self.zero
+
+    # 2-valued simulation
+
+    @staticmethod
+    def not_vd1(inputs, outputs):
+        outputs[0][0] = ~inputs[0][0]
+
+    def const1_vd1(self, _, outputs):
+        for o in outputs: o[...] = self.zero
+        for o in outputs: self.not_vd1([o], [o])  # invert each output; not_vd1 alone only touches outputs[0]
+
+    @staticmethod
+    def and_vd1(inputs, outputs):
+        o = outputs[0]
+        o[0] = inputs[0][0]
+        for i in inputs[1:]: o[0] &= i[0]
+
+    @staticmethod
+    def or_vd1(inputs, outputs):
+        o = outputs[0]
+        o[0] = inputs[0][0]
+        for i in inputs[1:]: o[0] |= i[0]
+
+    @staticmethod
+    def xor_vd1(inputs, outputs):
+        o = outputs[0]
+        o[0] = inputs[0][0]
+        for i in inputs[1:]: o[0] ^= i[0]
+
+    @staticmethod
+    def sdff_vd1(inputs, outputs):
+        outputs[0][0] = inputs[0][0]
+        if len(outputs) > 1:
+            outputs[1][0] = ~inputs[0][0]
+
+    @staticmethod
+    def dff_vd1(inputs, outputs):
+        outputs[0][0] = inputs[0][0]
+        if len(outputs) > 1:
+            outputs[1][0] = ~inputs[0][0]
+
+    def nand_vd1(self, inputs, outputs):
+        self.and_vd1(inputs, outputs)
+        self.not_vd1(outputs, outputs)
+
+    def nor_vd1(self, inputs, outputs):
+        self.or_vd1(inputs, outputs)
+        self.not_vd1(outputs, outputs)
+
+    def xnor_vd1(self, inputs, outputs):
+        self.xor_vd1(inputs, outputs)
+        self.not_vd1(outputs, outputs)
+
+    # 4-valued simulation
+    # sym [0] [1] (value, care)
+    #  0   0   1
+    #  1   1   1
+    #  -   0   0
+    #  X   1   0
+
+    @staticmethod
+    def not_vd2(inputs, outputs):
+        # 4-valued not:
+        # i: 0 1 - X
+        # o: 1 0 X X
+        # o0 1 0 1 1
+        # o1 1 1 0 0
+
+        outputs[0][0] = ~inputs[0][0] | ~inputs[0][1]  # value = 0 or DC
+        outputs[0][1] = inputs[0][1]  # care = C
+
+    def and_vd2(self, inputs, outputs):
+        # 4-valued:    o[0]:     o[1]:
+        #    0 1 - X   0 1 - X   0 1 - X
+        # 0  0 0 0 0   0 0 0 0   1 1 1 1
+        # 1  0 1 X X   0 1 1 1   1 1 0 0
+        # -  0 X X X   0 1 1 1   1 0 0 0
+        # X  0 X X X   0 1 1 1   1 0 0 0
+
+        i = inputs[0]
+        any0 = self.tmp[0]
+        anyd = self.tmp[1]
+        any0[0] = ~i[0] & i[1]
+        anyd[0] = ~i[1]
+        for i in inputs[1:]:
+            any0[0] |= ~i[0] & i[1]
+            anyd[0] |= ~i[1]
+        o = outputs[0]
+        o[0] = ~any0[0]  # value = no0
+        o[1] = any0[0] | ~anyd[0]  # care = any0 or noDC
+
+    def or_vd2(self, inputs, outputs):
+        # 4-valued:    o[0]:     o[1]:
+        #    0 1 - X   0 1 - X   0 1 - X
+        # 0  0 1 X X   0 1 1 1   1 1 0 0
+        # 1  1 1 1 1   1 1 1 1   1 1 1 1
+        # -  X 1 X X   1 1 1 1   0 1 0 0
+        # X  X 1 X X   1 1 1 1   0 1 0 0
+
+        i = inputs[0]
+        any1 = self.tmp[0]
+        anyd = self.tmp[1]
+        any1[0] = i[0] & i[1]
+        anyd[0] = ~i[1]
+        for i in inputs[1:]:
+            any1[0] |= i[0] & i[1]
+            anyd[0] |= ~i[1]
+        o = outputs[0]
+        o[0] = any1[0] | anyd[0]  # value = any1 or anyDC
+        o[1] = any1[0] | ~anyd[0]  # care = any1 or noDC
+
+    def xor_vd2(self, inputs, outputs):
+        # 4-valued:    o[0]:     o[1]:
+        #    0 1 - X   0 1 - X   0 1 - X
+        # 0  0 1 X X   0 1 1 1   1 1 0 0
+        # 1  1 0 X X   1 0 1 1   1 1 0 0
+        # -  X X X X   1 1 1 1   0 0 0 0
+        # X  X X X X   1 1 1 1   0 0 0 0
+
+        i = inputs[0]
+        odd1 = self.tmp[0]
+        anyd = self.tmp[1]
+        odd1[0] = i[0] & i[1]
+        anyd[0] = ~i[1]
+        for i in inputs[1:]:
+            odd1[0] ^= i[0] & i[1]
+            anyd[0] |= ~i[1]
+        o = outputs[0]
+        o[0] = odd1[0] | anyd[0]  # value = odd1 or anyDC
+        o[1] = ~anyd[0]  # care = noDC
+
+    def sdff_vd2(self, inputs, outputs):
+        self.dff_vd2(inputs, outputs)
+        if len(outputs) > 1:
+            outputs[1][0] = ~inputs[0][0] | ~inputs[0][1]  # value = 0 or DC
+            outputs[1][1] = inputs[0][1]  # care = C
+
+    @staticmethod
+    def dff_vd2(inputs, outputs):
+        outputs[0][0] = inputs[0][0] | ~inputs[0][1]  # value = 1 or DC
+        outputs[0][1] = inputs[0][1]  # care = C
+
+    def nand_vd2(self, inputs, outputs):
+        self.and_vd2(inputs, outputs)
+        self.not_vd2(outputs, outputs)
+
+    def nor_vd2(self, inputs, outputs):
+        self.or_vd2(inputs, outputs)
+        self.not_vd2(outputs, outputs)
+
+    def xnor_vd2(self, inputs, outputs):
+        self.xor_vd2(inputs, outputs)
+        self.not_vd2(outputs, outputs)
+    
+    def const1_vd2(self, _, outputs):
+        for o in outputs: o[...] = self.zero
+        for o in outputs: self.not_vd2([o], [o])  # invert each output; not_vd2 alone only touches outputs[0]
+
+    # 8-valued simulation
+    # sym [0] [1] [2] (initial value, ~final value, toggles present?)
+    #  0   0   1   0
+    #  1   1   0   0
+    #  -   0   0   0
+    #  X   1   1   0
+    #  R   0   0   1  _/"
+    #  F   1   1   1  "\_
+    #  P   0   1   1  _/\_
+    #  N   1   0   1  "\/"
+
+    def not_vd3(self, inputs, outputs):
+        # 8-valued not:
+        # i: 0 1 - X R F P N
+        # i0 0 1 0 1 0 1 0 1
+        # i1 1 0 0 1 0 1 1 0
+        # i2 0 0 0 0 1 1 1 1
+        # o: 1 0 X X F R N P
+        # o0 1 0 1 1 1 0 1 0
+        # o1 0 1 1 1 1 0 0 1
+        # o2 0 0 0 0 1 1 1 1
+        i = inputs[0]
+        dc = self.tmp[0]
+        dc[0] = ~(i[0] ^ i[1]) & ~i[2]
+        # dc lives in scratch storage and is computed first, so outputs[0] may be the same array as inputs[0]
+        outputs[0][0] = ~i[0] | dc[0]  # initial = ~i0 or DC
+        outputs[0][1] = ~i[1] | dc[0]  # ~final = ~i1 or DC
+        outputs[0][2] = i[2]  # toggles = i2
+
+    def and_vd3(self, inputs, outputs):
+        # 8-valued:           o[0]:            o[1]:            o[2]:
+        #    0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N
+        # 0  0 0 0 0 0 0 0 0  0 0 0 0 0 0 0 0  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # 1  0 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 0 0 0 1 1 1 1
+        # -  0 X X X X X X X  0 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # X  0 X X X X X X X  0 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # R  0 R X X R R P R  0 0 1 1 0 0 0 0  1 0 1 1 0 0 1 0  0 1 0 0 1 1 1 1
+        # F  0 F X X R F P F  0 1 1 1 0 1 0 1  1 1 1 1 0 1 1 1  0 1 0 0 1 1 1 1
+        # P  0 P X X P P P P  0 0 1 1 0 0 0 0  1 1 1 1 1 1 1 1  0 1 0 0 1 1 1 1
+        # N  0 N X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 1 0 0 1 1 1 1
+        i = inputs[0]
+        anyi0 = self.tmp[0]
+        anyf0 = self.tmp[1]
+        anyd = self.tmp[2]
+        any0 = self.tmp[3]
+        any_t = self.tmp[4]
+        anyd[0] = ~(i[0] ^ i[1]) & ~i[2]
+        anyi0[0] = ~i[0] & ~anyd[0]
+        anyf0[0] = i[1] & ~anyd[0]
+        any_t[0] = i[2]
+        any0[0] = anyi0[0] & anyf0[0] & ~i[2]
+        for i in inputs[1:]:
+            dc = ~(i[0] ^ i[1]) & ~i[2]
+            anyd[0] |= dc
+            anyi0[0] |= ~i[0] & ~dc
+            anyf0[0] |= i[1] & ~dc
+            any_t[0] |= i[2]
+            any0[0] |= ~i[0] & ~dc & i[1] & ~i[2]
+        o = outputs[0]
+        o[0] = (~anyi0[0] | anyd[0]) & ~any0[0]  # initial = no_i0 or DC
+        o[1] = anyf0[0] | anyd[0]  # ~final = ~no_f0 or DC
+        o[2] = any_t[0] & ~(anyd[0] | any0[0])  # toggle = anyT and noDC and no0
+
+    def or_vd3(self, inputs, outputs):
+        # 8-valued:           o[0]:            o[1]:            o[2]:
+        #    0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N
+        # 0  0 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 0 0 0 1 1 1 1
+        # 1  1 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0  0 0 0 0 0 0 0 0
+        # -  X 1 X X X X X X  1 1 1 1 1 1 1 1  1 0 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # X  X 1 X X X X X X  1 1 1 1 1 1 1 1  1 0 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # R  R 1 X X R N R R  0 1 1 1 0 1 0 0  0 0 1 1 0 0 0 0  1 0 0 0 1 1 1 1
+        # F  F 1 X X N F F F  1 1 1 1 1 1 1 1  1 0 1 1 0 1 1 1  1 0 0 0 1 1 1 1
+        # P  P 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  1 0 0 0 1 1 1 1
+        # N  N 1 X X R F N N  1 1 1 1 0 1 1 1  0 0 1 1 0 1 0 0  1 0 0 0 1 1 1 1
+        i = inputs[0]
+        anyi1 = self.tmp[0]
+        anyf1 = self.tmp[1]
+        anyd = self.tmp[2]
+        any1 = self.tmp[3]
+        any_t = self.tmp[4]
+        anyd[0] = ~(i[0] ^ i[1]) & ~i[2]
+        anyi1[0] = i[0] & ~anyd[0]
+        anyf1[0] = ~i[1] & ~anyd[0]
+        any_t[0] = i[2]
+        any1[0] = (anyi1[0] & anyf1[0]) & ~i[2]
+        for i in inputs[1:]:
+            dc = ~(i[0] ^ i[1]) & ~i[2]
+            anyd[0] |= dc
+            anyi1[0] |= i[0] & ~dc
+            anyf1[0] |= ~i[1] & ~dc
+            any_t[0] |= i[2]
+            any1[0] |= i[0] & ~dc & ~i[1] & ~i[2]
+        o = outputs[0]
+        o[0] = anyi1[0] | anyd[0]  # initial = i1 or DC
+        o[1] = (~anyf1[0] | anyd[0]) & ~any1[0]  # ~final = f1 or DC
+        o[2] = any_t[0] & ~(anyd[0] | any1[0])  # toggle = anyT and no(DC or 1)
+
+    def xor_vd3(self, inputs, outputs):
+        # 8-valued:           o[0]:            o[1]:            o[2]:
+        #    0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N  0 1 - X R F P N
+        # 0  0 1 X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  0 0 0 0 1 1 1 1
+        # 1  1 0 X X F R N P  1 0 1 1 1 0 1 0  0 1 1 1 1 0 0 1  0 0 0 0 1 1 1 1
+        # -  X X X X X X X X  1 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # X  X X X X X X X X  1 1 1 1 1 1 1 1  1 1 1 1 1 1 1 1  0 0 0 0 0 0 0 0
+        # R  R F X X P N R F  0 1 1 1 0 1 0 1  0 1 1 1 1 0 0 1  1 1 0 0 1 1 1 1
+        # F  F R X X N P F R  1 0 1 1 1 0 1 0  1 0 1 1 0 1 1 0  1 1 0 0 1 1 1 1
+        # P  P N X X R F P N  0 1 1 1 0 1 0 1  1 0 1 1 0 1 1 0  1 1 0 0 1 1 1 1
+        # N  N P X X F R N P  1 0 1 1 1 0 1 0  0 1 1 1 1 0 0 1  1 1 0 0 1 1 1 1
+        i = inputs[0]
+        odd0 = self.tmp[0]
+        odd1 = self.tmp[1]
+        anyd = self.tmp[2]
+        anyt = self.tmp[3]
+        odd0[0] = i[0]
+        odd1[0] = i[1]
+        anyd[0] = ~(i[0] ^ i[1]) & ~i[2]
+        anyt[0] = i[2]
+        for i in inputs[1:]:
+            odd0[0] ^= i[0]
+            odd1[0] ^= i[1]
+            anyd[0] |= ~(i[0] ^ i[1]) & ~i[2]
+            anyt[0] |= i[2]
+        o = outputs[0]
+        o[0] = odd0[0] | anyd[0]
+        o[1] = ~odd1[0] | anyd[0]
+        o[2] = anyt[0] & ~anyd[0]
+        
+    def sdff_vd3(self, inputs, outputs):
+        self.dff_vd3(inputs, outputs)
+        if len(outputs) > 1:  # optional second output receives the inverted value
+            i = inputs[0]
+            dc = self.tmp[0]
+            dc[0] = ~(i[0] ^ i[1]) & ~i[2]
+            outputs[1][0] = ~i[0] | dc[0]  # initial = ~i0 or DC
+            outputs[1][1] = ~i[1] | dc[0]  # ~final = ~i1 or DC
+            outputs[1][2] = i[2]  # toggle = T
+
+    def dff_vd3(self, inputs, outputs):
+        i = inputs[0]
+        dc = self.tmp[0]
+        dc[0] = ~(i[0] ^ i[1]) & ~i[2]  # DC where [0] == [1] and no toggle, i.e. '-' or 'X'
+        outputs[0][0] = i[0] | dc[0]  # initial = i0 or DC
+        outputs[0][1] = i[1] | dc[0]  # ~final = i1 or DC
+        outputs[0][2] = i[2]  # toggle = T
+
+    def nand_vd3(self, inputs, outputs):
+        self.and_vd3(inputs, outputs)
+        self.not_vd3(outputs, outputs)
+
+    def nor_vd3(self, inputs, outputs):
+        self.or_vd3(inputs, outputs)
+        self.not_vd3(outputs, outputs)
+
+    def xnor_vd3(self, inputs, outputs):
+        self.xor_vd3(inputs, outputs)
+        self.not_vd3(outputs, outputs)
+        
+    def const1_vd3(self, _, outputs):
+        for o in outputs: o[...] = self.zero
+        for o in outputs: self.not_vd3([o], [o])  # invert each output; not_vd3 alone only touches outputs[0]
diff --git a/kyupy/packed_vectors.py b/kyupy/packed_vectors.py
new file mode 100644
index 0000000..ffd3dda
--- /dev/null
+++ b/kyupy/packed_vectors.py
@@ -0,0 +1,299 @@
+import numpy as np
+from .bittools import popcount, bit_in
+
+
+class PackedVectors:
+    def __init__(self, nvectors=8, width=1, vdim=1, from_cache=None):
+        if from_cache is not None:
+            self.bits = np.array(from_cache)
+            self.width, self.vdim, nbytes = self.bits.shape
+        else:
+            self.bits = np.zeros((width, vdim, (nvectors - 1) // 8 + 1), dtype='uint8')
+            self.vdim = vdim
+            self.width = width
+        self.nvectors = nvectors
+        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
+        m0 = ~m1
+        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
+
+    @classmethod
+    def from_pair(cls, init, final):
+        assert init.nvectors == final.nvectors
+        assert len(init.bits) == len(final.bits)
+        init_v = init.bits[:, 0]
+        if init.vdim == 3:
+            init_c = (init.bits[:, 0] ^ init.bits[:, 1]) | init.bits[:, 2]
+        elif init.vdim == 2:
+            init_c = init.bits[:, 1]
+        else:
+            init_c = ~np.zeros_like(init.bits[:, 0])
+        final_v = final.bits[:, 0]
+        if final.vdim == 3:
+            final_c = (final.bits[:, 0] ^ final.bits[:, 1]) | final.bits[:, 2]
+            final_v = ~final.bits[:, 1]
+        elif final.vdim == 2:
+            final_c = final.bits[:, 1]
+        else:
+            final_c = ~np.zeros_like(final.bits[:, 0])
+        c = init_c & final_c
+        a0 = init_v & c
+        a1 = ~final_v & c
+        a2 = (init_v ^ final_v) & c
+        p = PackedVectors(init.nvectors, len(init.bits), 3)
+        p.bits[:, 0] = a0
+        p.bits[:, 1] = a1
+        p.bits[:, 2] = a2
+        return p
+        
+    def transition_vectors(self):
+        a = PackedVectors(self.nvectors-1, self.width, 3)
+        for pos in range(self.width):
+            for vidx in range(self.nvectors-1):
+                tr = self.get_value(vidx, pos) + self.get_value(vidx+1, pos)
+                if tr == '00':
+                    a.set_value(vidx, pos, '0')
+                elif tr == '11':
+                    a.set_value(vidx, pos, '1')
+                elif tr == '01':
+                    a.set_value(vidx, pos, 'R')
+                elif tr == '10':
+                    a.set_value(vidx, pos, 'F')
+                elif tr == '--':
+                    a.set_value(vidx, pos, '-')
+                else:
+                    a.set_value(vidx, pos, 'X')
+        return a
+        
+    def __add__(self, other):
+        a = PackedVectors(self.nvectors + other.nvectors, self.width, max(self.vdim, other.vdim))
+        # a.bits[:self.bits.shape[0], 0] = self.bits[:, 0]
+        # if self.vdim == 2:
+        #    a.bits[:self.bits.shape[0], 1] = self.care_bits
+        # elif self.vdim == 3:
+        #    a.bits[:self.bits.shape[0], 1] = ~self.value_bits
+        #    a.bits[:self.bits.shape[0], 2] = self.toggle_bits
+        for i in range(self.nvectors):
+            a[i] = self[i]
+        for i in range(len(other)):
+            a[self.nvectors+i] = other[i]
+        return a
+
+    def __len__(self):
+        return self.nvectors
+    
+    def randomize(self, one_probability=0.5):
+        for data in self.bits:
+            data[0] = np.packbits((np.random.rand(self.nvectors) < one_probability).astype(int))
+            if self.vdim == 2:
+                data[1] = 255
+            elif self.vdim == 3:
+                data[1] = ~np.packbits((np.random.rand(self.nvectors) < one_probability).astype(int))
+                data[2] = data[0] ^ ~data[1]
+            
+    def copy(self, selection_mask=None):
+        if selection_mask is not None:
+            cpy = PackedVectors(popcount(selection_mask), len(self.bits), self.vdim)
+            cur = 0
+            for vidx in range(self.nvectors):
+                if bit_in(selection_mask, vidx):
+                    cpy[cur] = self[vidx]
+                    cur += 1
+        else:
+            cpy = PackedVectors(self.nvectors, len(self.bits), self.vdim)
+            np.copyto(cpy.bits, self.bits)
+        return cpy
+
+    @property
+    def care_bits(self):
+        if self.vdim == 1:
+            return self.bits[:, 0] | 255
+        elif self.vdim == 2:
+            return self.bits[:, 1]
+        elif self.vdim == 3:
+            return (self.bits[:, 0] ^ self.bits[:, 1]) | self.bits[:, 2]
+
+    @property
+    def initial_bits(self):
+        return self.bits[:, 0]
+
+    @property
+    def value_bits(self):
+        if self.vdim == 3:
+            return ~self.bits[:, 1]
+        else:
+            return self.bits[:, 0]
+
+    @property
+    def toggle_bits(self):
+        if self.vdim == 3:
+            return self.bits[:, 2]
+        else:
+            return self.bits[:, 0] & 0
+
+    def get_value(self, vector, position):
+        if vector >= self.nvectors:
+            raise IndexError(f'vector out of range: {vector} >= {self.nvectors}')
+        a = self.bits[position, :, vector // 8]
+        m = self.mask[vector % 8]
+        if self.vdim == 1:
+            return '1' if a[0] & m[1] else '0'
+        elif self.vdim == 2:
+            if a[0] & m[1]:
+                return '1' if a[1] & m[1] else 'X'
+            else:
+                return '0' if a[1] & m[1] else '-'
+        elif self.vdim == 3:
+            if a[2] & m[1]:
+                if a[0] & m[1]:
+                    return 'F' if a[1] & m[1] else 'N'
+                else:
+                    return 'P' if a[1] & m[1] else 'R'
+            else:
+                if a[0] & m[1]:
+                    return 'X' if a[1] & m[1] else '1'
+                else:
+                    return '0' if a[1] & m[1] else '-'                
+
+    def get_values_for_position(self, position):
+        return ''.join(self.get_value(x, position) for x in range(self.nvectors))
+
+    def set_value(self, vector, position, v):
+        if vector >= self.nvectors:
+            raise IndexError(f'vector out of range: {vector} >= {self.nvectors}')
+        a = self.bits[position, :, vector // 8]
+        m = self.mask[vector % 8]
+        if self.vdim == 1:
+            self._set_value_vd1(a, m, v)
+        elif self.vdim == 2:
+            self._set_value_vd2(a, m, v)
+        elif self.vdim == 3:
+            self._set_value_vd3(a, m, v)
+    
+    def set_values(self, vector, v, mapping=None, inversions=None):
+        """Store the characters of ``v`` into ``vector``; character ``i`` goes to position ``mapping[i]``.
+
+        Without a mapping, character ``i`` goes to position ``i``. Where ``inversions[i]`` is truthy
+        the character is swapped with its opposite (0/1, H/L, R/F); other characters stay unchanged.
+        """
+        if vector >= self.nvectors:
+            raise IndexError(f'vector out of range: {vector} >= {self.nvectors}')
+        if not mapping:
+            mapping = list(range(len(v)))
+        if inversions is None:
+            inversions = [False] * len(v)
+        opposite = {
+            '1': '0', '0': '1',
+            'H': 'L', 'L': 'H',
+            'R': 'F', 'F': 'R',
+        }
+        for idx, char in enumerate(v):
+            if inversions[idx]:
+                # characters without a listed opposite pass through as-is
+                char = opposite.get(char, char)
+            self.set_value(vector, mapping[idx], char)
+    
+    def set_values_for_position(self, position, values):
+        for i, v in enumerate(values):
+            self.set_value(i, position, v)
+            
+    def __setitem__(self, vector, value):
+        for i, c in enumerate(value):
+            self.set_value(vector, i, c)
+
+    def __getitem__(self, vector):
+        if isinstance(vector, slice):
+            first = self.get_values_for_position(0)[vector]
+            ret = PackedVectors(len(first), self.width, self.vdim)
+            ret.set_values_for_position(0, first)
+            for pos in range(1, self.width):
+                ret.set_values_for_position(pos, self.get_values_for_position(pos)[vector])
+            return ret
+        return ''.join(self.get_value(vector, pos) for pos in range(len(self.bits)))
+
+    @staticmethod
+    def _set_value_vd1(a, m, v):
+        if v in [True, 1, '1', 'H', 'h']:
+            a[0] |= m[1]
+        else:
+            a[0] &= m[0]
+    
+    @staticmethod
+    def _set_value_vd2(a, m, v):
+        if v in [True, 1, '1', 'H', 'h']:
+            a[0] |= m[1]
+            a[1] |= m[1]
+        elif v in [False, 0, '0', 'L', 'l']:
+            a[0] &= m[0]
+            a[1] |= m[1]
+        elif v in ['X', 'x']:
+            a[0] |= m[1]
+            a[1] &= m[0]
+        else:
+            a[0] &= m[0]
+            a[1] &= m[0]
+
+    #   i fb act
+    # a 0 1 2
+    # - 0 0 0  None, '-'
+    # 0 0 1 0  False, 0, '0', 'l', 'L'
+    # 1 1 0 0  True, 1, '1', 'h', 'H'
+    # X 1 1 0  'x', 'X'
+    # / 0 0 1  '/', 'r', 'R'
+    # ^ 0 1 1  '^', 'p', 'P'
+    # v 1 0 1  'v', 'n', 'N'
+    # \ 1 1 1  '\', 'f', 'F'
+    @staticmethod
+    def _set_value_vd3(a, m, v):
+        if v in [False, 0, '0', 'L', 'l']:
+            a[0] &= m[0]
+            a[1] |= m[1]
+            a[2] &= m[0]
+        elif v in [True, 1, '1', 'H', 'h']:
+            a[0] |= m[1]
+            a[1] &= m[0]
+            a[2] &= m[0]
+        elif v in ['X', 'x']:
+            a[0] |= m[1]
+            a[1] |= m[1]
+            a[2] &= m[0]
+        elif v in ['/', 'r', 'R']:
+            a[0] &= m[0]
+            a[1] &= m[0]
+            a[2] |= m[1]
+        elif v in ['^', 'p', 'P']:
+            a[0] &= m[0]
+            a[1] |= m[1]
+            a[2] |= m[1]
+        elif v in ['v', 'n', 'N']:
+            a[0] |= m[1]
+            a[1] &= m[0]
+            a[2] |= m[1]
+        elif v in ['\\', 'f', 'F']:
+            a[0] |= m[1]
+            a[1] |= m[1]
+            a[2] |= m[1]
+        else:
+            a[0] &= m[0]
+            a[1] &= m[0]
+            a[2] &= m[0]
+                                    
+    def __repr__(self):
+        return f'<PackedVectors nvectors={self.nvectors}, width={self.width}, vdim={self.vdim}>'
+
+    def __str__(self):
+        lst = []
+        for p in range(self.nvectors):
+            lst.append(''.join(self.get_value(p, w) for w in range(len(self.bits))))
+        if len(lst) == 0: return ''
+        if len(lst[0]) > 64:
+            lst = [s[:32] + '...' + s[-32:] for s in lst]
+        if len(lst) <= 16:
+            return '\n'.join(lst)
+        else:
+            return '\n'.join(lst[:8]) + '\n...\n' + '\n'.join(lst[-8:])
+            
+    def diff(self, other, out=None):
+        if out is None:
+            out = np.zeros((self.width, self.bits.shape[-1]), dtype='uint8')
+        out[...] = (self.value_bits ^ other.value_bits) & self.care_bits & other.care_bits
+        return out
diff --git a/kyupy/saed.py b/kyupy/saed.py
new file mode 100644
index 0000000..10fca85
--- /dev/null
+++ b/kyupy/saed.py
@@ -0,0 +1,185 @@
+from kyupy.circuit import Node, Line
+
+
+def pin_index(cell_type, pin):
+    """Return the index assigned to ``pin`` for cells of type ``cell_type``; unlisted pins give 0."""
+    if cell_type.startswith('MUX21') and pin == 'S': return 2
+    if cell_type.startswith(('SDFF', 'DFF')) and pin == 'QN': return 1
+    if cell_type.startswith('DFF') and pin == 'CLK': return 1
+    generic = {'A2': 1, 'IN2': 1, 'SE': 1, 'B': 1, 'CO': 1,
+               'A3': 2, 'IN3': 2, 'SI': 2, 'CI': 2,
+               'A4': 3, 'IN4': 3, 'CLK': 3,  # CLK for scan cells SDFF
+               'A5': 4, 'IN5': 4, 'RSTB': 4,
+               'A6': 5, 'IN6': 5, 'SETB': 5}
+    return generic.get(pin, 0)
+
+
+def pin_is_output(kind, pin):
+    """Tell whether ``pin`` is one of the listed output pin names; ``S`` on a kind containing MUX is not."""
+    is_mux_select = 'MUX' in kind and pin == 'S'
+    return not is_mux_select and pin in ('Q', 'QN', 'Z', 'ZN', 'Y', 'CO', 'S')
+
+
+def add_and_connect(circuit, name, kind, in1=None, in2=None, out=None):
+    """Create a ``kind`` node called ``name`` in ``circuit`` and attach the given lines.
+
+    ``in1``/``in2`` become inputs 0/1 and ``out`` becomes output 0; ``None`` arguments are skipped.
+    Returns the new node.
+    """
+    node = Node(circuit, name, kind)
+    for pin, line in enumerate((in1, in2)):
+        if line is not None:
+            node.ins[pin] = line
+            line.reader, line.reader_pin = node, pin
+    if out is not None:
+        node.outs[0] = out
+        out.driver, out.driver_pin = node, 0
+    return node
+
+
+def split_complex_gates(circuit):
+    node_list = circuit.nodes
+    for n in node_list:
+        name = n.name
+        ins = n.ins
+        outs = n.outs
+        if n.kind.startswith('AO21X'):
+            n.remove()
+            n_and = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
+            n_or = add_and_connect(circuit, name+'~or', 'OR2', None, ins[2], outs[0])
+            Line(circuit, n_and, n_or)
+        elif n.kind.startswith('AOI21X'):
+            n.remove()
+            n_and = add_and_connect(circuit, name+'~and', 'AND2', ins[0], ins[1], None)
+            n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[2], outs[0])
+            Line(circuit, n_and, n_nor)
+        elif n.kind.startswith('OA21X'):
+            n.remove()
+            n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
+            n_and = add_and_connect(circuit, name+'~and', 'AND2', None, ins[2], outs[0])
+            Line(circuit, n_or, n_and)
+        elif n.kind.startswith('OAI21'):
+            n.remove()
+            n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
+            n_nand = add_and_connect(circuit, name+'~nand', 'NAND2', None, ins[2], outs[0])
+            Line(circuit, n_or, n_nand)
+        elif n.kind.startswith('OA22X'):
+            n.remove()
+            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+            n_and = add_and_connect(circuit, name+'~and', 'AND2', None, None, outs[0])
+            Line(circuit, n_or0, n_and)
+            Line(circuit, n_or1, n_and)
+        elif n.kind.startswith('AO22X'):
+            n.remove()
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+            n_or = add_and_connect(circuit, name+'~or', 'OR2', None, None, outs[0])
+            Line(circuit, n_and0, n_or)
+            Line(circuit, n_and1, n_or)
+        elif n.kind.startswith('AO221X'):
+            n.remove()
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
+            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', None, ins[4], outs[0])
+            Line(circuit, n_and0, n_or0)
+            Line(circuit, n_and1, n_or0)
+            Line(circuit, n_or0, n_or1)     
+        elif n.kind.startswith('AOI221X'):
+            n.remove()
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+            n_or = add_and_connect(circuit, name+'~or', 'OR2', None, None, None)
+            n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[4], outs[0])
+            Line(circuit, n_and0, n_or)
+            Line(circuit, n_and1, n_or)
+            Line(circuit, n_or, n_nor)     
+        elif n.kind.startswith('OA221X'):
+            n.remove()
+            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', None, ins[4], outs[0])
+            Line(circuit, n_or0, n_and0)
+            Line(circuit, n_or1, n_and0)
+            Line(circuit, n_and0, n_and1)    
+        elif n.kind.startswith('AO222X'):
+            n.remove()
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+            n_and2 = add_and_connect(circuit, name+'~and2', 'AND2', ins[4], ins[5], None)
+            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
+            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', None, None, outs[0])
+            Line(circuit, n_and0, n_or0)
+            Line(circuit, n_and1, n_or0)
+            Line(circuit, n_and2, n_or1)
+            Line(circuit, n_or0, n_or1)
+        elif n.kind.startswith('AOI222X'):
+            n.remove()
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', ins[0], ins[1], None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', ins[2], ins[3], None)
+            n_and2 = add_and_connect(circuit, name+'~and2', 'AND2', ins[4], ins[5], None)
+            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', None, None, None)
+            n_nor1 = add_and_connect(circuit, name+'~nor1', 'NOR2', None, None, outs[0])
+            Line(circuit, n_and0, n_or0)
+            Line(circuit, n_and1, n_or0)
+            Line(circuit, n_and2, n_nor1)
+            Line(circuit, n_or0, n_nor1)
+        elif n.kind.startswith('OA222X'):
+            n.remove()
+            n_or0 = add_and_connect(circuit, name+'~or0', 'OR2', ins[0], ins[1], None)
+            n_or1 = add_and_connect(circuit, name+'~or1', 'OR2', ins[2], ins[3], None)
+            n_or2 = add_and_connect(circuit, name+'~or2', 'OR2', ins[4], ins[5], None)
+            n_and0 = add_and_connect(circuit, name+'~and0', 'AND2', None, None, None)
+            n_and1 = add_and_connect(circuit, name+'~and1', 'AND2', None, None, outs[0])
+            Line(circuit, n_or0, n_and0)
+            Line(circuit, n_or1, n_and0)
+            Line(circuit, n_or2, n_and1)
+            Line(circuit, n_and0, n_and1)
+        elif n.kind.startswith('NOR3X'):
+            n.remove()
+            n_or = add_and_connect(circuit, name+'~or', 'OR2', ins[0], ins[1], None)
+            n_nor = add_and_connect(circuit, name+'~nor', 'NOR2', None, ins[2], outs[0])
+            Line(circuit, n_or, n_nor)
+        elif n.kind.startswith('FADDX'):
+            n.remove()
+            # forks for fan-outs
+            f_a = add_and_connect(circuit, name + '~fork0', '__fork__', ins[0])
+            f_b = add_and_connect(circuit, name + '~fork1', '__fork__', ins[1])
+            f_ci = add_and_connect(circuit, name + '~fork2', '__fork__', ins[2])
+            f_ab = Node(circuit, name + '~fork3')
+            # sum-block
+            n_xor0 = Node(circuit, name + '~xor0', 'XOR2')
+            Line(circuit, f_a, n_xor0)
+            Line(circuit, f_b, n_xor0)
+            Line(circuit, n_xor0, f_ab)
+            if len(outs) > 0 and outs[0] is not None:
+                n_xor1 = add_and_connect(circuit, name + '~xor1', 'XOR2', None, None, outs[0])
+                Line(circuit, f_ab, n_xor1)
+                Line(circuit, f_ci, n_xor1)
+            # carry-block
+            if len(outs) > 1 and outs[1] is not None:
+                n_and0 = Node(circuit, name + '~and0', 'AND2')
+                Line(circuit, f_ab, n_and0)
+                Line(circuit, f_ci, n_and0)
+                n_and1 = Node(circuit, name + '~and1', 'AND2')
+                Line(circuit, f_a, n_and1)
+                Line(circuit, f_b, n_and1)
+                n_or = add_and_connect(circuit, name + '~or0', 'OR2', None, None, outs[1])
+                Line(circuit, n_and0, n_or)
+                Line(circuit, n_and1, n_or)
+        elif n.kind.startswith('MUX21X'):
+            n.remove()
+            f_s = add_and_connect(circuit, name + '~fork0', '__fork__', ins[2])
+            n_not = Node(circuit, name + '~not', 'INV')
+            Line(circuit, f_s, n_not)
+            n_and0 = add_and_connect(circuit, name + '~and0', 'AND2', ins[0])
+            n_and1 = add_and_connect(circuit, name + '~and1', 'AND2', ins[1])
+            n_or0 = add_and_connect(circuit, name + '~or0', 'OR2', None, None, outs[0])
+            Line(circuit, n_not, n_and0)
+            Line(circuit, f_s, n_and1)
+            Line(circuit, n_and0, n_or0)
+            Line(circuit, n_and1, n_or0)
+
+
diff --git a/kyupy/sdf.py b/kyupy/sdf.py
new file mode 100644
index 0000000..e6db7f7
--- /dev/null
+++ b/kyupy/sdf.py
@@ -0,0 +1,213 @@
+import numpy as np
+from lark import Lark, Transformer
+from collections import namedtuple
+from . import log
+import gzip
+
+Interconnect = namedtuple('Interconnect', 'orig dest r f')  # INTERCONNECT entry: origin, destination, rising and falling delay triples
+IOPath = namedtuple('IOPath', 'ipin opin r f')  # IOPATH entry: input pin, output pin, rising and falling delay triples
+
+
+class DelayFile:
+    """Delay data of one SDF file: IOPATH lists per cell instance plus INTERCONNECT entries."""
+
+    def __init__(self, name, cells):
+        self.name = name
+        # The entries stored under the key None (if any) are used as the interconnect list.
+        self.interconnects = cells.get(None)
+        self.cells = {n: entries for n, entries in cells.items() if n}
+
+    def __repr__(self):
+        return '\n'.join(f'{n}: {l}' for n, l in self.cells.items()) + '\n' + \
+               '\n'.join(str(i) for i in (self.interconnects or []))  # interconnects may be None
+
+    def annotation(self, circuit, pin_index_f, dataset=1, interconnect=True, ffdelays=True):
+        """
+        Constructs a 3-dimensional array with timing data for each line in `circuit`.
+        Dimension 1 of the returned array is the line index.
+        Dimension 2 is the type of timing data: 0:`delay`, 1:`pulse rejection limit`.
+        Dimension 3 is the polarity at the output of the reading node: 0:`rising`, 1:`falling`.
+
+        The polarity for pulse rejection is determined by the latter transition of the pulse.
+        E.g., timing[42,1,0] is the rejection limit of a negative pulse at the output of the reader of line 42.
+
+        An IOPATH delay for a node is annotated to the line connected to the input pin specified in the IOPATH.
+
+        Currently, only ABSOLUTE IOPATH and INTERCONNECT delays are supported.
+        Pulse rejection limits are derived from absolute delays, explicit declarations (PATHPULSE etc.) are ignored.
+
+        :param circuit: Circuit to annotate; its `cells` and `lines` are used to locate the annotated lines.
+        :param pin_index_f: Callable `(cell_kind, pin_name)` returning the pin index for that cell kind.
+        :param dataset: Index into each delay triple, or a tuple of indices whose values are averaged.
+        :param interconnect: If False, INTERCONNECT delays are not annotated.
+        :param ffdelays: If False, delays of cells whose kind starts with `sdff` are not annotated.
+        :return: Array of shape `(len(circuit.lines), 2, 2)`.
+        """
+        def select_del(_delvals, idx):
+            # Pick the `dataset` value (or the average over a tuple of indices) from triple `idx`.
+            if isinstance(dataset, tuple):
+                return sum(_delvals[idx][d] for d in dataset) / len(dataset)
+            return _delvals[idx][dataset]
+
+        def all_zero(_delvals):
+            # Look at every value: max() of the nested list alone would compare the triples lexicographically.
+            return max(max(d) for d in _delvals) == 0
+
+        def find_cell(name):
+            # Try the SDF name as-is, then without backslashes, then with brackets replaced by underscores.
+            if name not in circuit.cells:
+                name = name.replace('\\', '')
+            if name not in circuit.cells:
+                name = name.replace('[', '_').replace(']', '_')
+            return circuit.cells[name] if name in circuit.cells else None
+
+        timing = np.zeros((len(circuit.lines), 2, 2))
+        for cn, iopaths in self.cells.items():
+            for ipn, opn, *delvals in iopaths:
+                delvals = [d if len(d) > 0 else [0, 0, 0] for d in delvals]  # an empty triple counts as zeros
+                if all_zero(delvals):
+                    continue
+                cell = find_cell(cn)
+                if cell is None:
+                    log.warn(f'Cell from SDF not found in circuit: {cn}')
+                    continue
+                ipin = pin_index_f(cell.kind, ipn)
+                opin = pin_index_f(cell.kind, opn)
+                kind = cell.kind.lower()
+
+                ipn2 = ipn.replace('(posedge A1)', 'A1').replace('(negedge A1)', 'A1')\
+                    .replace('(posedge A2)', 'A2').replace('(negedge A2)', 'A2')  # edge-qualified A1/A2 -> plain pin name
+
+                def add_delays(_line):
+                    if _line is not None:
+                        timing[_line.index, :, 0] += select_del(delvals, 0)
+                        timing[_line.index, :, 1] += select_del(delvals, 1)
+
+                take_avg = False
+                if kind.startswith('sdff'):  # only clock-edge IOPATHs are used, annotated to the output line
+                    if not ipn.startswith('(posedge CLK'):
+                        continue
+                    if ffdelays and (len(cell.outs) > opin):
+                        add_delays(cell.outs[opin])
+                else:
+                    if kind.startswith(('xor', 'xnor')):
+                        ipin = pin_index_f(cell.kind, ipn2)
+                        # If this input line already carries delays, the sum is halved below (average).
+                        take_avg = timing[cell.ins[ipin].index].sum() > 0
+                    add_delays(cell.ins[ipin])
+                    if take_avg:
+                        timing[cell.ins[ipin].index] /= 2
+
+        if not interconnect or self.interconnects is None:
+            return timing
+
+        # Each INTERCONNECT delay is added to the input line of the node driving the destination pin.
+        for n1, n2, *delvals in self.interconnects:
+            delvals = [d if len(d) > 0 else [0, 0, 0] for d in delvals]
+            if all_zero(delvals):
+                continue
+            # Names have the form "instance/pin" (split at the last '/'); a name without '/' is
+            # used as-is with the default pin 'Z' (origin) or 'IN' (destination).
+            if '/' in n1:
+                cn1, _, pn1 = n1.rpartition('/')
+            else:
+                cn1, pn1 = (n1, 'Z')
+            if '/' in n2:
+                cn2, _, pn2 = n2.rpartition('/')
+            else:
+                cn2, pn2 = (n2, 'IN')
+            c1 = find_cell(cn1)
+            if c1 is None:
+                log.warn(f'Cell from SDF not found in circuit: {cn1}')
+                continue
+            c2 = find_cell(cn2)
+            if c2 is None:
+                log.warn(f'Cell from SDF not found in circuit: {cn2}')
+                continue
+            p1, p2 = pin_index_f(c1.kind, pn1), pin_index_f(c2.kind, pn2)
+            line = None
+            # f1: node reading the origin pin's output line; f2: node driving the destination pin's input line
+            f1, f2 = c1.outs[p1].reader, c2.ins[p2].driver
+            if f1 != f2:  # possible branchfork
+                assert len(f2.ins) == 1
+                line = f2.ins[0]
+                assert f1.outs[f2.ins[0].driver_pin] == line
+            elif len(f2.outs) == 1:  # no fanout?
+                line = f2.ins[0]
+            if line is not None:
+                timing[line.index, :, 0] += select_del(delvals, 0)
+                timing[line.index, :, 1] += select_del(delvals, 1)
+            else:
+                log.warn(f'No branchfork for annotating interconnect delay {c1.name}/{p1}->{c2.name}/{p2}')
+        return timing
+
+
+def sanitize(args):
+    args += args[2:] if len(args) == 3 else []  # a single delay triple is appended again (in place)
+    return [str(name) for name in args[:2]] + args[2:]
+
+
+class SdfTransformer(Transformer):  # builds a DelayFile from the SDF parse tree; one method per grammar rule
+    @staticmethod
+    def triple(args):  # strip the trailing ':' or ')' of each field; an empty field becomes 0.0
+        return [0.0 if len(a.value) <= 1 else float(a.value[:-1]) for a in args]
+
+    @staticmethod
+    def interconnect(args): return Interconnect(*sanitize(args))
+
+    @staticmethod
+    def iopath(args): return IOPath(*sanitize(args))
+
+    @staticmethod
+    def cell(args):  # -> (first string child or None, all children of the tree-valued children)
+        names = [a for a in args if isinstance(a, str)]
+        subtrees = [a for a in args if hasattr(a, 'children')]
+        return (names[0] if names else None), [e for tree in subtrees for e in tree.children]
+
+    @staticmethod
+    def start(args):  # -> DelayFile(first string child or None, dict of the tuple-valued children)
+        names = [a for a in args if isinstance(a, str)]
+        return DelayFile(names[0] if names else None, dict(t for t in args if isinstance(t, tuple)))
+
+
+def parse(sdf) -> DelayFile:
+    """Parses SDF text, or the (optionally gzip-compressed) file named by `sdf`, into a `DelayFile`."""
+    grammar = r"""
+    start: "(DELAYFILE" ( "(SDFVERSION" _NOB ")"
+        | "(DESIGN" "\"" NAME "\"" ")"
+        | "(DATE" _NOB ")"
+        | "(VENDOR" _NOB ")"
+        | "(PROGRAM" _NOB ")"
+        | "(VERSION" _NOB ")"
+        | "(DIVIDER" _NOB ")"
+        | "(VOLTAGE" _NOB ")"
+        | "(PROCESS" _NOB? ")"
+        | "(TEMPERATURE" _NOB ")"
+        | "(TIMESCALE" _NOB ")"
+        | cell )* ")"
+    cell: "(CELL" ( "(CELLTYPE" _NOB ")"
+        | "(INSTANCE" ID? ")"
+        | "(TIMINGCHECK" _ignore* ")"
+        | delay )* ")"
+    delay: "(DELAY" "(ABSOLUTE" (interconnect | iopath)* ")" ")"
+    interconnect: "(INTERCONNECT" ID ID triple* ")"
+    iopath: "(IOPATH" ID_OR_EDGE ID_OR_EDGE triple* ")"
+    NAME: /[^"]+/
+    ID_OR_EDGE: ( /[^() ]+/ | "(" /[^)]+/ ")" )
+    ID: ( /[^"() ]+/ | "\"" /[^"]+/ "\"" )
+    triple: "(" ( /[-.0-9]*:/ /[-.0-9]*:/ /[-.0-9]*\)/ | ")" )
+    _ignore: "(" _NOB? _ignore* ")" _NOB?
+    _NOB: /[^()]+/
+    COMMENT: "//" /[^\n]*/
+    %ignore ( /\r?\n/ | COMMENT )+
+    %ignore /[\t\f ]+/
+    """
+    source = str(sdf)
+    if '\n' in source:
+        text = source
+    else:  # One line?: Assuming it is a file name.
+        # gzip.open and open both return text in 'rt' mode.
+        opener = gzip.open if source.endswith('.gz') else open
+        with opener(sdf, 'rt') as f:
+            text = f.read()
+    return Lark(grammar, parser="lalr", transformer=SdfTransformer()).parse(text)
diff --git a/kyupy/stil.py b/kyupy/stil.py
new file mode 100644
index 0000000..a0081b4
--- /dev/null
+++ b/kyupy/stil.py
@@ -0,0 +1,249 @@
+from lark import Lark, Transformer
+from collections import namedtuple
+import re
+import gzip
+from .packed_vectors import PackedVectors
+from .logic_sim import LogicSim
+
+
+Call = namedtuple('Call', 'name parameters')  # one `Call` statement: procedure name and its parameter dict
+ScanPattern = namedtuple('ScanPattern', 'load launch capture unload')  # the four parts of one scan pattern
+
+
+class StilFile:
+    """Content of a STIL file: signal groups, scan chains and the scan patterns extracted from its calls."""
+    def __init__(self, version, signal_groups, scan_chains, calls):
+        self.version = version
+        self.signal_groups = signal_groups
+        self.scan_chains = scan_chains
+        # A chain is a list [scan-in port, cells and '!' markers..., scan-out port].
+        self.si_ports = {chain[0]: name for name, chain in scan_chains.items()}
+        self.so_ports = {chain[-1]: name for name, chain in scan_chains.items()}
+        self.calls = calls
+        self.patterns = []
+        load, launch, capture = {}, {}, {}
+        for call in self.calls:
+            params = call.parameters
+            if call.name == 'load_unload':
+                # The unload data completes the pending pattern, the load data starts the next one.
+                unload = {p: params[p].replace('\n', '') for p in self.so_ports if p in params}
+                if capture:
+                    self.patterns.append(ScanPattern(load, launch, capture, unload))
+                    launch, capture = {}, {}
+                load = {p: params[p].replace('\n', '') for p in self.si_ports if p in params}
+            if call.name.endswith(('_launch', '_capture')):
+                # The first such call of a pattern is stored as launch, any later one as capture.
+                values = {k: v.replace('\n', '') for k, v in params.items()}
+                if launch:
+                    capture = values
+                else:
+                    launch = values
+
+    def _maps(self, c):
+        """Returns (interface, pi_map, po_map, scan_maps, scan_inversions) for circuit `c`."""
+        interface = list(c.interface) + [n for n in c.nodes if 'DFF' in n.kind]
+        intf_pos = {node.name: pos for pos, node in enumerate(interface)}
+        pi_map = [intf_pos[n] for n in self.signal_groups['_pi']]
+        po_map = [intf_pos[n] for n in self.signal_groups['_po']]
+        scan_maps = {}
+        scan_inversions = {}
+        for chain in self.scan_chains.values():
+            cells = chain[1:-1]  # entries between the scan-in and the scan-out port
+            # Walk from the scan-in side: each '!' toggles the inversion flag of all later cells.
+            inverted, scan_in_inversion = False, []
+            for n in cells:
+                if n == '!':
+                    inverted = not inverted
+                else:
+                    scan_in_inversion.append(inverted)
+            scan_in_inversion.reverse()
+            # Walking from the scan-out side yields the cell positions and the scan-out inversions.
+            inverted, scan_map, scan_out_inversion = False, [], []
+            for n in reversed(cells):
+                if n == '!':
+                    inverted = not inverted
+                else:
+                    scan_map.append(intf_pos[n])
+                    scan_out_inversion.append(inverted)
+            scan_maps[chain[0]] = scan_maps[chain[-1]] = scan_map
+            scan_inversions[chain[0]] = scan_in_inversion
+            scan_inversions[chain[-1]] = scan_out_inversion
+        return interface, pi_map, po_map, scan_maps, scan_inversions
+
+    def tests(self, c):
+        """Returns a `PackedVectors` holding, per pattern, the scan load data and the launch `_pi` values."""
+        interface, pi_map, _, scan_maps, scan_inversions = self._maps(c)
+        tests = PackedVectors(len(self.patterns), len(interface), 2)
+        for i, p in enumerate(self.patterns):
+            for si_port in self.si_ports:
+                tests.set_values(i, p.load[si_port], scan_maps[si_port], scan_inversions[si_port])
+            tests.set_values(i, p.launch['_pi'], pi_map)
+        return tests
+
+    def tests8v(self, c):
+        """Returns `PackedVectors.from_pair` of the initial and the launch vectors of all patterns."""
+        interface, pi_map, _, scan_maps, scan_inversions = self._maps(c)
+        init = PackedVectors(len(self.patterns), len(interface), 2)
+        for i, p in enumerate(self.patterns):
+            for si_port in self.si_ports:
+                init.set_values(i, p.load[si_port], scan_maps[si_port], scan_inversions[si_port])
+            init.set_values(i, p.launch['_pi'], pi_map)
+        # Simulate the initial vectors and capture into a copy that becomes the launch vectors.
+        sim4v = LogicSim(c, len(init), 2)
+        sim4v.assign(init)
+        sim4v.propagate()
+        launch = init.copy()
+        sim4v.capture(launch)
+        for i, p in enumerate(self.patterns):
+            launch_pi, capture_pi = p.launch['_pi'], p.capture['_pi']
+            # if there was no launch clock, then init = launch
+            if not ('P' in launch_pi and 'P' in capture_pi):
+                for si_port in self.si_ports:
+                    launch.set_values(i, p.load[si_port], scan_maps[si_port], scan_inversions[si_port])
+            if 'P' in capture_pi:
+                launch.set_values(i, capture_pi, pi_map)
+        return PackedVectors.from_pair(init, launch)
+
+    def responses(self, c):
+        """Returns a `PackedVectors` holding, per pattern, the captured `_po` values and the scan unload data."""
+        interface, _, po_map, scan_maps, scan_inversions = self._maps(c)
+        resp = PackedVectors(len(self.patterns), len(interface), 2)
+        for i, p in enumerate(self.patterns):
+            resp.set_values(i, p.capture['_po'], po_map)
+            for so_port in self.so_ports:
+                resp.set_values(i, p.unload[so_port], scan_maps[so_port], scan_inversions[so_port])
+        return resp
+        
+        
+class StilTransformer(Transformer):  # builds a StilFile from the STIL parse tree; one method per grammar rule
+    def __init__(self):
+        super().__init__()
+        self._signal_groups = None
+        self._calls = []  # stays empty if the file has no Pattern block
+        self._scan_chains = {}  # stays empty if the file has no ScanStructures block
+
+    @staticmethod
+    def quoted(args): return args[0][1:-1]  # drop the surrounding double quotes
+
+    @staticmethod
+    def call(args): return Call(args[0], dict(args[1:]))
+
+    @staticmethod
+    def call_parameter(args): return args[0], args[1].value
+
+    @staticmethod
+    def signal_group(args): return args[0], args[1:]
+
+    @staticmethod
+    def scan_chain(args):
+        """Returns (chain name, [scan-in port, scan cells and '!' markers..., scan-out port])."""
+        scan_in = scan_out = None
+        scan_cells = []  # a chain without a ScanCells statement simply has no cells
+        for t in args[1:]:
+            if t.data == 'scan_in':
+                scan_in = t.children[0]
+            elif t.data == 'scan_out':
+                scan_out = t.children[0]
+            elif t.data == 'scan_cells':
+                # Remove '.SI', then everything up to the last '.'; names without '.' stay unchanged.
+                scan_cells = [re.sub(r'.*\.', '', n.replace('.SI', '')) for n in t.children]
+        return args[0], ([scan_in] + scan_cells + [scan_out])
+
+    def signal_groups(self, args): self._signal_groups = dict(args)
+
+    def pattern(self, args): self._calls = [c for c in args if isinstance(c, Call)]
+
+    def scan_structures(self, args): self._scan_chains = dict(args)
+
+    def start(self, args):
+        return StilFile(float(args[0]), self._signal_groups, self._scan_chains, self._calls)
+        
+
+def parse(stil):
+    """Parses STIL text, or the (optionally gzip-compressed) file named by `stil`, into a `StilFile`."""
+    grammar = r"""
+    start: "STIL" FLOAT _ignore _block*
+    _block: signal_groups | scan_structures | pattern
+        | "Header" _ignore
+        | "Signals" _ignore
+        | "Timing" _ignore
+        | "PatternBurst" quoted _ignore
+        | "PatternExec" _ignore
+        | "Procedures" _ignore
+        | "MacroDefs" _ignore
+    
+    signal_groups: "SignalGroups" "{" signal_group* "}"
+    signal_group: quoted "=" "'" quoted ( "+" quoted)* "'" _ignore? ";"?
+    
+    scan_structures: "ScanStructures" "{" scan_chain* "}"
+    scan_chain: "ScanChain" quoted "{" ( scan_length
+        | scan_in | scan_out | scan_inversion | scan_cells | scan_master_clock )* "}"
+    scan_length: "ScanLength" /[0-9]+/ ";"
+    scan_in: "ScanIn" quoted ";"
+    scan_out: "ScanOut" quoted ";"
+    scan_inversion: "ScanInversion" /[0-9]+/ ";"
+    scan_cells: "ScanCells" (quoted | /!/)* ";"
+    scan_master_clock: "ScanMasterClock" quoted ";"
+    
+    pattern: "Pattern" quoted "{" ( label | w | c | macro | ann | call )* "}"
+    label: quoted ":"
+    w: "W" quoted ";"
+    c: "C" _ignore
+    macro: "Macro" quoted ";"
+    ann: "Ann" _ignore
+    call: "Call" quoted "{" call_parameter* "}"
+    call_parameter: quoted "=" /[^;]+/ ";"
+        
+    quoted: /"[^"]*"/
+    FLOAT: /[-0-9.]+/
+    _ignore: "{" _NOB? _ignore_inner* "}"
+    _ignore_inner: "{" _NOB? _ignore_inner* "}" _NOB?
+    _NOB: /[^{}]+/
+    %ignore ( /\r?\n/ | "//" /[^\n]*/ | /[\t\f ]/ )+
+    """
+    source = str(stil)
+    if '\n' in source:
+        text = source
+    else:  # One line?: Assuming it is a file name.
+        # gzip.open and open both return text in 'rt' mode.
+        opener = gzip.open if source.endswith('.gz') else open
+        with opener(stil, 'rt') as f:
+            text = f.read()
+    return Lark(grammar, parser="lalr", transformer=StilTransformer()).parse(text)
+
+
+def extract_scan_pattens(stil_calls):
+    """Builds a `ScanPattern(scan_in, pi, None, scan_out)` per `load_unload` call reached with a non-empty `pi`."""
+    def strip_newlines(value):
+        return value if value is None else value.replace('\n', '')
+
+    patterns = []
+    pi = scan_in = None
+    for call in stil_calls:
+        if call.name == 'load_unload':
+            # 'Scan_Out' goes into the pattern appended now, 'Scan_In' is kept for the next one.
+            scan_out = strip_newlines(call.parameters.get('Scan_Out'))
+            if pi: patterns.append(ScanPattern(scan_in, pi, None, scan_out))
+            scan_in = strip_newlines(call.parameters.get('Scan_In'))
+        elif call.name == 'allclock_capture':
+            pi = call.parameters['_pi'].replace('\n', '')
+    return patterns
+
+
+def match_patterns(stil_file, pats, interface):
+    """Returns `(tests, resp)` as `PackedVectors` built from the patterns of `extract_scan_pattens`."""
+    intf_pos = {n.name: i for i, n in enumerate(interface)}
+    pi_map = [intf_pos[n] for n in stil_file.signal_groups['_pi']]
+    # Only the scan chain named '1' is used; a 'b' + two characters + '.' sequence is removed from its entries.
+    scan_map = [intf_pos[re.sub(r'b..\.', '', n)] for n in reversed(stil_file.scan_chains['1'])]
+    tests = PackedVectors(len(pats), len(interface), 2)
+    resp = PackedVectors(len(pats), len(interface), 2)
+    for i, (scan_in, pi, _, scan_out) in enumerate(pats):
+        # Unpack by position: `ScanPattern` has no `scan_in`/`pi`/`scan_out` attributes.
+        tests.set_values(i, scan_in, scan_map)
+        tests.set_values(i, pi, pi_map)
+        resp.set_values(i, pi, pi_map)
+        resp.set_values(i, scan_out, scan_map)
+
+    return tests, resp
+
diff --git a/kyupy/verilog.py b/kyupy/verilog.py
new file mode 100644
index 0000000..2516a81
--- /dev/null
+++ b/kyupy/verilog.py
@@ -0,0 +1,161 @@
+from lark import Lark, Transformer
+from collections import namedtuple
+import gzip
+from .circuit import Circuit, Node, Line
+from .saed import pin_index, pin_is_output
+
+Instantiation = namedtuple('Instantiation', 'type name pins')  # cell type, instance name, {pin name: signal name}
+
+
+class SignalDeclaration:
+    """A declared signal: its kind, base name and, for vectors, the `[left:right]` index range."""
+
+    def __init__(self, kind, tokens):
+        self.left = self.right = None  # both stay None for a scalar signal
+        self.kind = kind
+        children = tokens.children
+        if len(children) == 1:
+            self.basename = children[0]
+        else:  # vector declaration: left index, right index, name
+            self.basename = children[2]
+            self.left = int(children[0].value)
+            self.right = int(children[1].value)
+
+    @property
+    def names(self):
+        """All signal names: the base name itself, or `base[i]` for every index from left to right."""
+        if self.left is None:
+            return [self.basename]
+        step = 1 if self.left <= self.right else -1
+        return [f'{self.basename}[{i}]' for i in range(self.left, self.right + step, step)]
+
+    def __repr__(self):
+        return f"{self.kind}:{self.basename}[{self.left}:{self.right}]"
+
+
+class VerilogTransformer(Transformer):
+    """Builds a `Circuit` per module from the parse tree; the public methods are named after grammar rules."""
+    def __init__(self, branchforks=False):
+        super().__init__()
+        self._signal_declarations = {}
+        self.branchforks = branchforks
+
+    @staticmethod
+    def name(args):
+        s = args[0].value
+        if s[0] == '\\':
+            # Escaped identifier: drop the backslash and the terminating whitespace, keep a trailing index.
+            s = ''.join(s[1:].split())
+        return s
+
+    @staticmethod
+    def instantiation(args):
+        # args: cell type, instance name, then one `pin` tree (pin name, signal name) per connection
+        return Instantiation(args[0], args[1], {pin.children[0]: pin.children[1] for pin in args[2:]})
+
+    def _declare(self, kind, args):
+        """Records a `SignalDeclaration` of `kind` for every signal in `args`, keyed by its base name."""
+        for signal in args:
+            sd = SignalDeclaration(kind, signal)
+            self._signal_declarations[sd.basename] = sd
+
+    def input(self, args): self._declare('input', args)
+
+    def inout(self, args): self._declare('input', args)  # just treat as input
+
+    def output(self, args): self._declare('output', args)
+
+    def wire(self, args): self._declare('wire', args)
+
+    def module(self, args):
+        """Builds the `Circuit` of one module from its name, its port list and its statements."""
+        c = Circuit(args[0])
+        port_names = (name for sig in args[1].children for name in self._signal_declarations[sig].names)
+        positions = {name: pos for pos, name in enumerate(port_names)}  # interface position of each port bit
+        assignments = []
+        for stmt in args[2:]:  # pass 1: instantiate cells and driven signals
+            if isinstance(stmt, Instantiation):
+                n = Node(c, stmt.name, kind=stmt.type)
+                for p, s in stmt.pins.items():
+                    if pin_is_output(n.kind, p):
+                        Line(c, (n, pin_index(stmt.type, p)), Node(c, s))
+            elif stmt is not None and stmt.data == 'assign':
+                assignments.append((stmt.children[0], stmt.children[1]))
+        # Create the interface nodes; every input additionally gets a line to a second node of the same name.
+        for sd in self._signal_declarations.values():
+            if sd.kind in ('output', 'input'):
+                for name in sd.names:
+                    n = Node(c, name, kind=sd.kind)
+                    if name in positions:
+                        c.interface[positions[name]] = n
+                    if sd.kind == 'input':
+                        Line(c, n, Node(c, name))
+        for s1, s2 in assignments:  # pass 1.5: process signal assignments
+            if s1 in c.forks:
+                assert s2 not in c.forks, 'assignment between two driven signals'
+                Line(c, c.forks[s1], Node(c, s2))
+            elif s2 in c.forks:
+                assert s1 not in c.forks, 'assignment between two driven signals'
+                Line(c, c.forks[s2], Node(c, s1))
+        for stmt in args[2:]:  # pass 2: connect signals to readers
+            if isinstance(stmt, Instantiation):
+                for p, s in stmt.pins.items():
+                    n = c.cells[stmt.name]
+                    if pin_is_output(n.kind, p): continue
+                    if s.startswith("1'b"):
+                        const = f'__const{s[3]}__'
+                        if const not in c.cells:  # create each constant driver only once
+                            Line(c, Node(c, const, const), Node(c, s))
+                    fork = c.forks[s]
+                    if self.branchforks:
+                        branchfork = Node(c, fork.name + "~" + n.name)
+                        Line(c, fork, branchfork)
+                        fork = branchfork
+                    Line(c, fork, (n, pin_index(stmt.type, p)))
+        # Finally connect every output signal to the output node of the same name.
+        for sd in self._signal_declarations.values():
+            if sd.kind == 'output':
+                for name in sd.names:
+                    Line(c, c.forks[name], c.cells[name])
+        self._signal_declarations = {}  # do not carry declarations over into the next module
+        return c
+
+    @staticmethod
+    def start(args):
+        if len(args) == 1:
+            return args[0]  # exactly one module: return its circuit directly
+        else:
+            return args  # otherwise the list of all module results
+        
+
+def parse(verilog, branchforks=False) -> Circuit:
+    """Parses Verilog text, or the (optionally gzip-compressed) file named by `verilog`, into a circuit."""
+    grammar = """
+    start: (module)*
+    module: "module" name parameters ";" (_statement)* "endmodule"
+    parameters: "(" [ name ( "," name )* ] ")"
+    _statement: input | output | inout | tri | wire | assign | instantiation
+    input: "input" signal ( "," signal )* ";"
+    output: "output" signal ( "," signal )* ";"
+    inout: "inout" signal ( "," signal )* ";"
+    tri: "tri" name ";"
+    wire: "wire" signal ( "," signal )* ";"
+    assign: "assign" name "=" name ";"
+    instantiation: name name "(" [ pin ( "," pin )* ] ")" ";"
+    pin: "." name "(" name ")"
+    signal: ( name | "[" /[0-9]+/ ":" /[0-9]+/ "]" name )
+    
+    name: ( /[a-z_][a-z0-9_\\[\\]]*/i | /\\\\[^\\t \\r\\n]+[\\t \\r\\n](\\[[0-9]+\\])?/i | /1'b0/i | /1'b1/i )
+    COMMENT: "//" /[^\\n]*/
+    %ignore ( /\\r?\\n/ | COMMENT )+
+    %ignore /[\\t \\f]+/
+    """
+    source = str(verilog)
+    if '\n' in source:
+        text = source
+    else:  # One line?: Assuming it is a file name.
+        # gzip.open and open both return text in 'rt' mode.
+        opener = gzip.open if source.endswith('.gz') else open
+        with opener(verilog, 'rt') as f:
+            text = f.read()
+    return Lark(grammar, parser="lalr", transformer=VerilogTransformer(branchforks)).parse(text)
diff --git a/kyupy/wave_sim.py b/kyupy/wave_sim.py
new file mode 100644
index 0000000..153d431
--- /dev/null
+++ b/kyupy/wave_sim.py
@@ -0,0 +1,401 @@
+import numpy as np
+import math
+from . import numba
+
+
+TMAX = np.float32(2 ** 127)  # almost np.PINF for 32-bit floating point values; terminates every waveform
+TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values; first entry of a waveform starting at 1
+
+
+class WaveSim:
+    def __init__(self, circuit, timing, sims=8, wavecaps=16):
+        """Allocate waveform memory for `sims` simulations and compile `circuit` into leveled ops."""
+        self.circuit = circuit
+        self.sims = sims
+        self.overflows = 0
+        self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+        # True while self.cdata is up to date: set by capture(), cleared by propagate().
+        self.lst_eat_valid = False
+        # Per interface node and simulation: the six values produced by capture_wave().
+        self.cdata = np.zeros((len(self.interface), sims, 6), dtype='float32')
+
+        if type(wavecaps) is int:
+            wavecaps = [wavecaps] * len(circuit.lines)
+
+        intf_wavecap = 4  # sufficient for storing only 1 transition.
+
+        # state allocation table. maps line and interface indices to self.state memory locations
+        self.sat = np.zeros((len(circuit.lines) + 2 + 2 * len(self.interface), 2), dtype='int')
+        self.sat[:, 0] = -1
+        filled = 0
+        for lidx, cap in enumerate(wavecaps):
+            self.sat[lidx] = filled, cap
+            filled += cap
+        # two extra entries after the lines: zero_idx feeds unconnected inputs, tmp_idx takes unconnected outputs
+        self.zero_idx = len(circuit.lines)
+        self.sat[self.zero_idx] = filled, intf_wavecap
+        filled += intf_wavecap
+        self.tmp_idx = self.zero_idx + 1
+        self.sat[self.tmp_idx] = filled, intf_wavecap
+        filled += intf_wavecap
+        # ppi entries get memory of their own; ppo entries alias the entry of the node's first input line
+        self.ppi_offset = self.tmp_idx + 1
+        self.ppo_offset = self.ppi_offset + len(self.interface)
+        for i, n in enumerate(self.interface):
+            if len(n.outs) > 0:
+                self.sat[self.ppi_offset + i] = filled, intf_wavecap
+                filled += intf_wavecap
+            if len(n.ins) > 0:
+                self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index]
+
+        # pad timing to one entry per allocation table row; rows beyond len(timing) stay zero
+        self.timing = np.zeros((len(self.sat), 2, 2))
+        self.timing[:len(timing)] = timing
+
+        # allocate self.state; every slot starts as TMAX, i.e. all waveforms are empty
+        self.state = np.zeros((filled, sims), dtype='float32') + TMAX
+
+        # generate self.ops: rows of (4-bit truth table, output index, input a index, input b index)
+        ops = []
+        interface_dict = dict([(n, i) for i, n in enumerate(self.interface)])
+        for n in circuit.topological_order():
+            if n in interface_dict:
+                inp_idx = self.ppi_offset + interface_dict[n]
+                if len(n.outs) > 0 and n.outs[0] is not None:
+                    ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx))  # copy of the ppi waveform
+                if 'dff' in n.kind.lower():
+                    if len(n.outs) > 1 and n.outs[1] is not None:
+                        ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx))  # inverted second output
+                else:
+                    for o_line in n.outs[1:]:
+                        if o_line is not None:
+                            ops.append((0b1010, o_line.index, inp_idx, self.zero_idx))
+            else:
+                o0_idx = self.tmp_idx
+                i0_idx = self.zero_idx
+                i1_idx = self.zero_idx
+                if len(n.outs) > 0 and n.outs[0] is not None:
+                    o0_idx = n.outs[0].index
+                else:
+                    print(f'no outputs for {n}')
+                if len(n.ins) > 0 and n.ins[0] is not None: i0_idx = n.ins[0].index  # only the first two
+                if len(n.ins) > 1 and n.ins[1] is not None: i1_idx = n.ins[1].index  # inputs are wired up
+                kind = n.kind.lower()
+                if kind == '__fork__':
+                    for o_line in n.outs:
+                        ops.append((0b1010, o_line.index, i0_idx, i1_idx))
+                elif kind.startswith('nand'):
+                    ops.append((0b0111, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('nor'):
+                    ops.append((0b0001, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('and'):
+                    ops.append((0b1000, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('or'):
+                    ops.append((0b1110, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('xor'):
+                    ops.append((0b0110, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('xnor'):
+                    ops.append((0b1001, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('not') or kind.startswith('inv'):
+                    ops.append((0b0101, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('buf') or kind.startswith('nbuf'):
+                    ops.append((0b1010, o0_idx, i0_idx, i1_idx))
+                elif kind.startswith('__const1__') or kind.startswith('tieh'):
+                    ops.append((0b0101, o0_idx, i0_idx, i1_idx))  # inverter on the constant-zero entry
+                elif kind.startswith('__const0__') or kind.startswith('tiel'):
+                    ops.append((0b1010, o0_idx, i0_idx, i1_idx))  # buffer on the constant-zero entry
+                else:
+                    print('unknown gate type', kind)  # the node is skipped: no op is generated for it
+        self.ops = np.asarray(ops, dtype='int32')
+
+        # generate level data: a new level starts whenever an op reads an entry written in the current level
+        levels = np.zeros(len(self.sat), dtype='int32')
+        level_starts = [0]
+        current_level = 1
+        for i, op in enumerate(self.ops):
+            if levels[op[2]] >= current_level or levels[op[3]] >= current_level:
+                current_level += 1
+                level_starts.append(i)
+            levels[op[1]] = current_level
+        self.level_starts = np.asarray(level_starts, dtype='int32')
+        self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
+        # self.mask[k] is (~bit, bit) for bit position k of a byte, most significant bit first
+        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
+        m0 = ~m1
+        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
+
+    def get_line_delay(self, line, polarity):
+        return self.timing[line, 0, polarity]  # entry [line, 0, polarity] of the padded timing array
+
+    def set_line_delay(self, line, polarity, delay):
+        self.timing[line, 0, polarity] = delay  # overwrites entry [line, 0, polarity] of the timing array
+
+    def assign(self, vectors, time=0.0, offset=0):
+        """Write initial waveforms for vectors[offset:] (at most self.sims of them) into the ppi memory."""
+        count = min(vectors.nvectors - offset, self.sims)
+        for intf_idx, _ in enumerate(self.interface):
+            base = self.sat[self.ppi_offset + intf_idx, 0]
+            if base < 0: continue  # no ppi memory was allocated for this node
+            for sim in range(count):
+                byte_idx, bit_idx = divmod(sim + offset, 8)
+                bits, bit = vectors.bits[intf_idx, :, byte_idx], self.mask[bit_idx][1]
+                pos = base
+                if bits[0] & bit:
+                    self.state[pos, sim] = TMIN
+                    pos += 1
+                if (len(bits) > 2) and (bits[2] & bit) and ((bits[0] & bit) == (bits[1] & bit)):
+                    self.state[pos, sim] = time
+                    pos += 1
+                self.state[pos, sim] = TMAX
+
+    def propagate(self, sims=None, sd=0.0, seed=1):
+        """Evaluate all ops level by level for the first `sims` simulations and add up the overflows."""
+        sims = self.sims if sims is None else min(sims, self.sims)
+        for level_range in zip(self.level_starts, self.level_stops):
+            first_op, end_op = level_range
+            overflowed = level_eval(self.ops, first_op, end_op, self.state, self.sat, 0, sims,
+                                    self.timing, sd, seed)
+            self.overflows += overflowed
+        self.lst_eat_valid = False  # the data in self.cdata no longer matches self.state
+
+    def wave(self, line, vector):
+        """Return the waveform slots of entry `line` for simulation `vector`, or [TMAX] if there are none."""
+        # A negative index and an unallocated table entry (location -1) both yield an empty waveform.
+        start, capacity = self.sat[line] if line >= 0 else (-1, 0)
+        if start < 0:
+            return [TMAX]
+        return self.state[start:start + capacity, vector]
+
+    def wave_ppi(self, i, vector):
+        return self.wave(self.ppi_offset + i, vector)  # waveform stored for interface node i in the ppi range
+
+    def wave_ppo(self, o, vector):
+        return self.wave(self.ppo_offset + o, vector)  # waveform referenced by interface node o in the ppo range
+
+    def capture(self, time=TMAX, sd=0, seed=1, probabilities=None, offset=0):
+        for idx, node in enumerate(self.interface):  # refresh self.cdata from the ppo waveforms
+            if not len(node.ins): continue  # nodes without inputs keep their previous cdata rows
+            for sim in range(self.sims):
+                self.cdata[idx, sim] = self.capture_wave(self.ppo_offset + idx, sim, time, sd, seed)
+        if probabilities is not None:  # also export slot 0 of the captured data from column `offset` on
+            assert offset < probabilities.shape[1]
+            width = min(probabilities.shape[1] - offset, self.sims)
+            probabilities[:, offset:offset + width] = self.cdata[:, :width, 0]
+        self.lst_eat_valid = True
+        return self.cdata
+
+    def reassign(self, time=0.0):
+        """Restart every ppi waveform at its final value, toggling at `time` if the captured value differs."""
+        for i, node in enumerate(self.interface):
+            ppi_loc, ppo_loc = self.sat[self.ppi_offset + i, 0], self.sat[self.ppo_offset + i, 0]  # column 0 only
+            if ppi_loc < 0 or ppo_loc < 0: continue  # location -1: nothing allocated on one of the sides
+            for sidx in range(self.sims):
+                ival = self.val(self.ppi_offset + i, sidx, TMAX) > 0.5  # final value of the current ppi waveform
+                oval = self.cdata[i, sidx, 1] > 0.5  # value stored by the last capture()
+                toggle = 0
+                if ival:
+                    self.state[ppi_loc, sidx] = TMIN
+                    toggle += 1
+                if ival != oval:
+                    self.state[ppi_loc + toggle, sidx] = time
+                    toggle += 1
+                self.state[ppi_loc + toggle, sidx] = TMAX
+
+    def eat(self, line, vector):
+        """Return the earliest toggle time on `line` in simulation `vector`, TMAX if there is none."""
+        earliest = TMAX
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX: break  # end of waveform
+            if not stamp <= TMIN: earliest = min(earliest, stamp)  # a TMIN entry is skipped
+        return earliest
+
+    def lst(self, line, vector):
+        """Return the latest toggle time on `line` in simulation `vector`, TMIN if there is none."""
+        latest = TMIN
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX: break  # end of waveform
+            if not stamp <= TMIN: latest = max(latest, stamp)  # a TMIN entry is skipped
+        return latest
+
+    def lst_ppo(self, o, vector):
+        """Return slot 5 of self.cdata (the latest toggle time) captured for interface node `o`."""
+        if not self.lst_eat_valid: self.capture()  # refresh stale capture data first
+        return self.cdata[o, vector, 5]
+
+    def toggles(self, line, vector):
+        """Count the toggles on `line` in simulation `vector`; a TMIN entry is not counted."""
+        count = 0
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX: break  # end of waveform
+            if not stamp <= TMIN: count += 1
+        return count
+
+    def _vals(self, idx, vector, times, sd=0.0):
+        """Sample the waveform at table entry `idx` at each of `times`.
+
+        Returns 0/1 values if sd == 0, otherwise one accumulated erf-weighted value per sample time.
+        """
+        spread = sd * math.sqrt(2)
+        sign = 0.5
+        accs = [0.0] * len(times)
+        values = [0] * len(times)
+        for stamp in self.wave(idx, vector):
+            if stamp >= TMAX: break
+            for k, sample_time in enumerate(times):
+                if stamp < sample_time:
+                    values[k] ^= 1  # a toggle before the sample time flips the sampled value
+            sign = -sign
+            if stamp <= TMIN or not spread > 0: continue
+            for k, sample_time in enumerate(times):
+                accs[k] += sign * (1 + math.erf((stamp - sample_time) / spread))
+        if (sign < 0) and (spread > 0):
+            for k, _ in enumerate(times):
+                accs[k] += 1
+        return values if spread == 0 else accs
+
+    def vals(self, line, vector, times, sd=0):
+        return self._vals(line, vector, times, sd)  # public entry point; `line` is used as the table index
+
+    def val(self, line, vector, time=TMAX, sd=0):
+        return self.capture_wave(line, vector, time, sd)[0]  # only the first captured value (acc)
+
+    def vals_ppo(self, o, vector, times, sd=0):
+        return self._vals(self.ppo_offset + o, vector, times, sd)  # same as vals() on the ppo entry of node o
+
+    def val_ppo(self, o, vector, time=TMAX, sd=0):
+        """Return slot 0 of self.cdata for node `o`; `time` and `sd` only matter if a capture is due."""
+        if not self.lst_eat_valid: self.capture(time, sd)  # refresh stale capture data first
+        return self.cdata[o, vector, 0]
+
+    def capture_wave(self, line, vector, time=TMAX, sd=0.0, seed=1):
+        """Sample the waveform at table entry `line` in simulation `vector` at `time`.
+
+        Returns the tuple (acc, val, final, val != final, eat, lst): val is the value at `time`, final
+        the value after the last toggle, eat and lst the earliest and latest toggle time. With sd == 0
+        acc equals val; with sd > 0 acc accumulates an erf-weighted term per toggle and val is set
+        from acc, using a seeded pseudo-random draw when 0.01 < acc < 0.99.
+        """
+        spread = sd * math.sqrt(2)
+        sign, acc = 0.5, 0.0
+        earliest, latest = TMAX, TMIN
+        val = final = int(0)
+        for stamp in self.wave(line, vector):
+            if stamp >= TMAX: break
+            sign = -sign
+            final ^= 1
+            if stamp < time: val ^= 1
+            if stamp <= TMIN: continue  # a TMIN entry sets the value but is no timed toggle
+            if spread > 0:
+                acc += sign * (1 + math.erf((stamp - time) / spread))
+            earliest = min(earliest, stamp)
+            latest = max(latest, stamp)
+        if spread > 0:
+            if sign < 0: acc += 1
+            if acc >= 0.99:
+                val = 1
+            elif acc > 0.01:
+                # undecided range: draw from a generator seeded by `seed`, the vector and the ppo position
+                rnd = (seed << 4) + (vector << 20) + (line - self.ppo_offset << 1)
+                for _ in range(2):
+                    rnd = int(0xDEECE66D) * rnd + 0xB
+                val = (float((rnd >> 8) & 0xffffff) / float(1 << 24)) < acc
+            else:
+                val = 0
+        else:
+            acc = val
+
+        return acc, val, final, (val != final), earliest, latest
+
+
+@numba.njit
+def level_eval(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sd, seed):
+    """Evaluate ops[op_start:op_stop] for simulations st_start..st_stop-1; return the overflow count."""
+    total = 0
+    for k in range(op_start, op_stop):
+        for sim in range(st_start, st_stop):
+            total += wave_eval(ops[k], state, sat, sim, line_times, sd, seed)
+    return total
+
+
+@numba.njit
+def rand_gauss(seed, sd):  # returns a factor in [0.5, 1.5] derived from `seed`; exactly 1.0 when sd <= 0
+    clamp = 0.5  # deviations larger than this are rejected and drawn again
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for i in range(12):  # 12 pseudo-random values in [0, 1) minus 6: zero-centered, roughly bell-shaped
+            seed = int(0xDEECE66D) * seed + 0xB  # linear congruential step
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@numba.njit
+def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0):
+    """Compute the output waveform of one op from its two input waveforms; return the overflow count."""
+    lut, z_idx, a_idx, b_idx = op  # 4-bit truth table, then sat indices of output z and inputs a, b
+    overflows = int(0)
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+    a_mem = sat[a_idx, 0]
+    b_mem = sat[b_idx, 0]
+    z_mem, z_cap = sat[z_idx]
+
+    a_cur = int(0)
+    b_cur = int(0)
+    z_cur = lut & 1  # output for inputs (0, 0); also the write position in the output waveform
+    if z_cur == 1:
+        state[z_mem, st_idx] = TMIN  # an output that starts at 1 begins with a TMIN entry
+
+    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd)
+    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd)
+
+    previous_t = TMIN
+
+    current_t = min(a, b)
+    inputs = int(0)  # bit 0 tracks the value of input a, bit 1 the value of input b
+
+    while current_t < TMAX:
+        z_val = z_cur & 1  # current output value: parity of the entries written so far
+        if b < a:
+            b_cur += 1
+            b = state[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd)
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd)  # minimum pulse width
+            inputs ^= 2
+            next_t = b
+        else:
+            a_cur += 1
+            a = state[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd)
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd)  # minimum pulse width
+            inputs ^= 1
+            next_t = a
+
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):  # the last slot stays free for the TMAX terminator
+                    state[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    overflows += 1  # no room: count it and drop the previous toggle instead
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                z_cur -= 1  # pulse too narrow: remove the previous toggle as well
+                if z_cur > 0:
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                else:
+                    previous_t = TMIN
+        current_t = min(a, b)
+
+    state[z_mem + z_cur, st_idx] = TMAX
+    return overflows
diff --git a/kyupy/wave_sim_cuda.py b/kyupy/wave_sim_cuda.py
new file mode 100644
index 0000000..9e92901
--- /dev/null
+++ b/kyupy/wave_sim_cuda.py
@@ -0,0 +1,282 @@
+import numpy as np
+import math
+from .wave_sim import WaveSim
+from . import cuda
+
+TMAX = np.float32(2 ** 127)  # almost np.PINF for 32-bit floating point values; terminates every waveform
+TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values; first entry of a waveform starting at 1
+
+
+class WaveSimCuda(WaveSim):
+    def __init__(self, circuit, timing, sims=8, wavecaps=16):
+        super().__init__(circuit, timing, sims, wavecaps)
+        # host staging buffer for assign(): 3 bit planes per interface node, one bit per simulation
+        self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
+        # device copies (made with cuda.to_device) of the arrays built by WaveSim.__init__ and of tdata
+        self.d_state = cuda.to_device(self.state)
+        self.d_sat = cuda.to_device(self.sat)
+        self.d_ops = cuda.to_device(self.ops)
+        self.d_timing = cuda.to_device(self.timing)
+        self.d_tdata = cuda.to_device(self.tdata)
+        self.d_cdata = cuda.to_device(self.cdata)
+        # thread block shape; _grid_dim() is always called with the simulation count as first dimension
+        self._block_dim = (32, 16)
+
+    def get_line_delay(self, line, polarity):
+        return self.d_timing[line, 0, polarity]  # read from the device copy of the timing array
+
+    def set_line_delay(self, line, polarity, delay):
+        self.d_timing[line, 0, polarity] = delay  # written to the device copy only; self.timing is not touched
+
+    def assign(self, vectors, time=0.0, offset=0):
+        assert (offset % 8) == 0  # only whole bytes of the packed bits can be selected
+        byte_offset = offset // 8
+        assert byte_offset < vectors.bits.shape[-1]
+        pdim = min(vectors.bits.shape[-1] - byte_offset, self.tdata.shape[-1])
+        # stage the selected bytes in the host buffer, then copy the buffer to the device
+        self.tdata[..., 0:pdim] = vectors.bits[..., byte_offset:pdim + byte_offset]
+        if vectors.vdim == 1:
+            self.tdata[:, 1, 0:pdim] = ~self.tdata[:, 1, 0:pdim]
+            self.tdata[:, 2, 0:pdim] = 0
+        cuda.to_device(self.tdata, to=self.d_tdata)
+        # one thread per (simulation, interface node) writes the initial waveform on the device
+        grid_dim = self._grid_dim(self.sims, len(self.interface))
+        assign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset,
+                                                 len(self.interface), self.d_tdata, time)
+
+    def _grid_dim(self, x, y):
+        """Return how many blocks of shape self._block_dim are needed to cover x by y threads."""
+        block_x, block_y = self._block_dim
+        return math.ceil(x / block_x), math.ceil(y / block_y)
+
+    def propagate(self, sims=None, sd=0.0, seed=1):
+        """Run all ops on the device, one kernel launch per level, for the first `sims` simulations."""
+        sims = self.sims if sims is None else min(sims, self.sims)
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            grid_dim = self._grid_dim(sims, op_stop - op_start)
+            # Hand over the device copy of the allocation table, like all other kernel launches here do;
+            # with numba.cuda a host array argument is transferred to the device on every launch.
+            wave_kernel[grid_dim, self._block_dim](self.d_ops, op_start, op_stop, self.d_state, self.d_sat,
+                                                   int(0), sims, self.d_timing, sd, seed)
+        cuda.synchronize()
+        self.lst_eat_valid = False
+
+    def wave(self, line, vector):
+        """Return the device waveform of entry `line` for simulation `vector`, or [TMAX] if there is none."""
+        # [TMAX] (an empty waveform) matches WaveSim.wave(); the inherited methods iterate over the result.
+        mem, wcap = self.sat[line] if line >= 0 else (-1, 0)
+        if mem < 0:
+            return [TMAX]
+        return self.d_state[mem:mem + wcap, vector]
+    
+    def capture(self, time=TMAX, sd=0, seed=1, probabilities=None, offset=0):
+        grid_dim = self._grid_dim(self.sims, len(self.interface))  # one thread per (simulation, interface node)
+        capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
+                                                  self.d_cdata, time, sd * math.sqrt(2), seed)
+        self.cdata[...] = self.d_cdata  # bring the captured data back into the host array
+        if probabilities is not None:  # also export slot 0 of the captured data from column `offset` on
+            assert offset < probabilities.shape[1]
+            cap_dim = min(probabilities.shape[1] - offset, self.sims)
+            probabilities[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim, 0]
+        self.lst_eat_valid = True
+        return self.cdata
+
+    def reassign(self, time=0.0):
+        grid_dim = self._grid_dim(self.sims, len(self.interface))  # one thread per (simulation, interface node)
+        reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset,
+                                                   self.d_cdata, time)  # uses the values of the last capture()
+        cuda.synchronize()
+
+
+@cuda.jit()
+def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time):
+    """Rewrite one ppi waveform: keep its final value, toggle at ppi_time if cdata[..., 1] differs."""
+    vector, y = cuda.grid(2)
+    if vector >= state.shape[-1]: return
+    if ppo_offset + y >= len(sat): return
+
+    ppo, ppo_cap = sat[ppo_offset + y]
+    ppi, ppi_cap = sat[ppi_offset + y]
+    if ppo < 0: return  # location -1: nothing allocated
+    if ppi < 0: return
+
+    ppo_val = int(cdata[y, vector, 1])
+    ppi_val = int(0)
+    for tidx in range(ppi_cap):  # the parity of the entries before TMAX is the final ppi value
+        t = state[ppi + tidx, vector]
+        if t >= TMAX: break
+        ppi_val ^= 1
+    # make new waveform at PPI
+    toggle = 0
+    if ppi_val:
+        state[ppi + toggle, vector] = TMIN
+        toggle += 1
+    if ppi_val != ppo_val:
+        state[ppi + toggle, vector] = ppi_time
+        toggle += 1
+    state[ppi + toggle, vector] = TMAX
+
+
+@cuda.jit()
+def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed):
+    """Sample one ppo waveform at `time` and store six values in cdata[y, vector]."""
+    x, y = cuda.grid(2)
+    if ppo_offset + y >= len(sat): return
+    line, tdim = sat[ppo_offset + y]  # memory location and capacity of the waveform
+    if line < 0: return
+    if x >= state.shape[-1]: return
+    vector = x
+    m = 0.5
+    acc = 0.0
+    eat = TMAX
+    lst = TMIN
+    tog = 0
+    val = int(0)
+    final = int(0)
+    for tidx in range(tdim):
+        t = state[line + tidx, vector]
+        if t >= TMAX: break  # end of waveform
+        m = -m
+        final ^= 1
+        if t < time:
+            val ^= 1
+        if t <= TMIN: continue  # a TMIN entry sets the value but is no timed toggle
+        if s_sqrt2 > 0:
+            acc += m * (1 + math.erf((t - time) / s_sqrt2))
+        eat = min(eat, t)
+        lst = max(lst, t)
+        tog += 1
+    if s_sqrt2 > 0:
+        if m < 0:
+            acc += 1
+        if acc >= 0.99:
+            val = 1
+        elif acc > 0.01:  # undecided range: draw from a generator seeded by seed, vector and y
+            seed = (seed << 4) + (vector << 20) + (y << 1)
+            seed = int(0xDEECE66D) * seed + 0xB
+            seed = int(0xDEECE66D) * seed + 0xB
+            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
+            val = rnd < acc
+        else:
+            val = 0
+    else:
+        acc = val
+    cdata[y, vector, 0] = acc
+    cdata[y, vector, 1] = val
+    cdata[y, vector, 2] = final
+    cdata[y, vector, 3] = (val != final)
+    cdata[y, vector, 4] = eat
+    cdata[y, vector, 5] = lst
+
+
+@cuda.jit()
+def assign_kernel(state, sat, ppi_offset, intf_len, tdata, time):
+    x, y = cuda.grid(2)  # x: simulation, y: interface node; each thread writes one initial waveform
+    if y >= intf_len: return
+    line = sat[ppi_offset + y, 0]
+    if line < 0: return  # location -1: nothing allocated
+    sdim = state.shape[-1]
+    if x >= sdim: return
+    vector = x
+    a0 = tdata[y, 0, vector // 8]  # the three bit planes of the byte holding this simulation's bit
+    a1 = tdata[y, 1, vector // 8]
+    a2 = tdata[y, 2, vector // 8]
+    m = np.uint8(1 << (7 - (vector % 8)))  # bit mask, most significant bit first
+    toggle = 0
+    if a0 & m:
+        state[line + toggle, x] = TMIN
+        toggle += 1
+    if (a2 & m) and ((a0 & m) == (a1 & m)):
+        state[line + toggle, x] = time
+        toggle += 1
+    state[line + toggle, x] = TMAX
+
+
+@cuda.jit(device=True)
+def rand_gauss(seed, sd):  # returns a factor in [0.5, 1.5] derived from `seed`; exactly 1.0 when sd <= 0
+    clamp = 0.5  # deviations larger than this are rejected and drawn again
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for i in range(12):  # 12 pseudo-random values in [0, 1) minus 6: zero-centered, roughly bell-shaped
+            seed = int(0xDEECE66D) * seed + 0xB  # linear congruential step
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@cuda.jit()
+def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sd, seed):
+    """Compute the output waveform of one op for one simulation; overflows are not counted here."""
+    x, y = cuda.grid(2)
+    st_idx = st_start + x
+    op_idx = op_start + y
+    if st_idx >= st_stop: return
+    if op_idx >= op_stop: return
+    lut = ops[op_idx, 0]  # 4-bit truth table
+    z_idx = ops[op_idx, 1]  # sat index of the output
+    a_idx = ops[op_idx, 2]  # sat indices of the two inputs
+    b_idx = ops[op_idx, 3]
+
+    z_mem, z_cap = sat[z_idx]
+    a_mem = sat[a_idx, 0]
+    b_mem = sat[b_idx, 0]
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_cur = int(0)
+    b_cur = int(0)
+    z_cur = lut & 1  # output for inputs (0, 0); also the write position in the output waveform
+    if z_cur == 1:
+        state[z_mem, st_idx] = TMIN  # an output that starts at 1 begins with a TMIN entry
+
+    a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd)
+    b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd)
+
+    previous_t = TMIN
+
+    current_t = min(a, b)
+    inputs = int(0)  # bit 0 tracks the value of input a, bit 1 the value of input b
+
+    while current_t < TMAX:
+        z_val = z_cur & 1  # current output value: parity of the entries written so far
+        if b < a:
+            b_cur += 1
+            b = state[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd)
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd)  # minimum pulse width
+            inputs ^= 2
+            next_t = b
+        else:
+            a_cur += 1
+            a = state[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd)
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd)  # minimum pulse width
+            inputs ^= 1
+            next_t = a
+
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):  # the last slot stays free for the TMAX terminator
+                    state[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    # overflows += 1
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1  # no room: the previous toggle is dropped instead
+            else:
+                z_cur -= 1  # pulse too narrow: remove the previous toggle as well
+                if z_cur > 0:
+                    previous_t = state[z_mem + z_cur - 1, st_idx]
+                else:
+                    previous_t = TMIN
+        current_t = min(a, b)
+
+    state[z_mem + z_cur, st_idx] = TMAX
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/b01.bench b/tests/b01.bench
new file mode 100644
index 0000000..48ae6ca
--- /dev/null
+++ b/tests/b01.bench
@@ -0,0 +1,64 @@
+# edf2bench v0.8
+#   (!) 1997-2003 Giovanni Squillero <giovanni.squillero@polito.it>
+#
+# Edf source: "b01/b01.edf"
+# 2 inputs
+# 2 outputs
+# 5 D-type flipflops
+# 10 inverters
+# 39 gates (1 and, 28 nand, 1 or, 10 not)
+
+
+INPUT(LINE1)
+INPUT(LINE2)
+
+OUTPUT(OUTP_REG)
+OUTPUT(OVERFLW_REG)
+
+OVERFLW_REG = DFF(U34)
+STATO_REG_2_ = DFF(U45)
+STATO_REG_1_ = DFF(U36)
+STATO_REG_0_ = DFF(U35)
+OUTP_REG = DFF(U44)
+
+U34 = AND(STATO_REG_1_, U38, STATO_REG_0_)
+U35 = NAND(U68, U67, U66, U65)
+U36 = NAND(U57, U55, U56)
+U37 = OR(LINE2, LINE1)
+U38 = NOT(STATO_REG_2_)
+U39 = NOT(STATO_REG_1_)
+U40 = NOT(LINE2)
+U41 = NOT(LINE1)
+U42 = NOT(STATO_REG_0_)
+U43 = NAND(STATO_REG_1_, U42)
+U44 = NAND(U73, U72)
+U45 = NAND(U60, U59)
+U46 = NAND(U70, U69)
+U47 = NAND(LINE1, LINE2)
+U48 = NAND(STATO_REG_2_, U43)
+U49 = NOT(U37)
+U50 = NAND(U49, U42)
+U51 = NOT(U47)
+U52 = NOT(U43)
+U53 = NAND(U47, U43)
+U54 = NAND(STATO_REG_2_, U47)
+U55 = NAND(STATO_REG_0_, U39, U47)
+U56 = NAND(U52, U54)
+U57 = NAND(U62, U61, STATO_REG_2_)
+U58 = NOT(U48)
+U59 = NAND(U53, U38)
+U60 = NAND(U50, U39, STATO_REG_2_)
+U61 = NAND(STATO_REG_1_, U49)
+U62 = NAND(U37, U42)
+U63 = NAND(STATO_REG_0_, U47)
+U64 = NAND(U51, U42)
+U65 = NAND(U64, U63, U39, U38)
+U66 = NAND(U43, U37, STATO_REG_2_)
+U67 = NAND(U34, U47)
+U68 = NAND(U51, U52)
+U69 = NAND(LINE1, U40)
+U70 = NAND(LINE2, U41)
+U71 = NOT(U46)
+U72 = NAND(U58, U71)
+U73 = NAND(U46, U48)
+
diff --git a/tests/b01.v b/tests/b01.v
new file mode 100644
index 0000000..aa809ad
--- /dev/null
+++ b/tests/b01.v
@@ -0,0 +1,96 @@
+//
+// Milkyway Hierarchical Verilog Dump:
+// Generated on 04/25/2017 at 20:29:24
+// Design Generated by Consolidated Verilog Reader
+// File produced by Consolidated Verilog Writer
+// Library Name :lib_mw
+// Cell Name    :b01
+// Hierarchy delimiter:'/'
+// Write Command : write_verilog b01.v -no_physical_only_cells -no_corner_pad_cells  
+//	-no_pad_filler_cells
+//
+
+
+module b01 (line1 , Scan_Out , overflw , outp , Scan_In , Scan_Enable , 
+    clock , reset , line2 );
+input  line1 ;
+output Scan_Out ;
+output overflw ;
+output outp ;
+input  Scan_In ;
+input  Scan_Enable ;
+input  clock ;
+input  reset ;
+input  line2 ;
+
+
+
+assign Scan_Out = overflw ;
+SDFFARX1 stato_reg_1_0 (.QN ( n137 ) , .Q ( stato_1_N ) , .CLK ( clock ) 
+    , .RSTB ( n86 ) , .SE ( Scan_Enable ) , .SI ( stato_0_N ) , .D ( n53 ) ) ;
+INVX0 U1 (.ZN ( n1 ) , .INP ( n105 ) ) ;
+SDFFARX1 outp_reg (.Q ( outp ) , .CLK ( clock ) , .RSTB ( n86 ) 
+    , .SE ( Scan_Enable ) , .SI ( Scan_In ) , .D ( n54 ) ) ;
+SDFFARX1 stato_reg_2_0 (.QN ( n148 ) , .Q ( stato_2_N ) , .CLK ( clock ) 
+    , .RSTB ( n86 ) , .SE ( Scan_Enable ) , .SI ( stato_1_N ) , .D ( n55 ) ) ;
+SDFFARX1 stato_reg_0_0 (.QN ( n149 ) , .Q ( stato_0_N ) , .CLK ( clock ) 
+    , .RSTB ( n86 ) , .SE ( Scan_Enable ) , .SI ( outp ) , .D ( n56 ) ) ;
+SDFFARX1 overflw_reg (.Q ( overflw ) , .CLK ( clock ) , .RSTB ( n86 ) 
+    , .SE ( Scan_Enable ) , .SI ( stato_2_N ) , .D ( n61 ) ) ;
+AND2X1 U108 (.IN1 ( n1 ) , .IN2 ( stato_0_N ) , .Q ( n133 ) ) ;
+AND2X1 U109 (.IN1 ( n142 ) , .IN2 ( n136 ) , .Q ( n100 ) ) ;
+NAND2X0 U110 (.IN1 ( n129 ) , .IN2 ( n128 ) , .QN ( n131 ) ) ;
+NAND2X0 U111 (.IN1 ( n144 ) , .IN2 ( n132 ) , .QN ( n92 ) ) ;
+NOR2X0 U112 (.QN ( n132 ) , .IN1 ( n133 ) , .IN2 ( n147 ) ) ;
+OR2X1 U113 (.IN2 ( n1 ) , .IN1 ( n96 ) , .Q ( n144 ) ) ;
+NAND2X0 U114 (.IN1 ( line1 ) , .IN2 ( n134 ) , .QN ( n128 ) ) ;
+INVX0 U115 (.ZN ( n134 ) , .INP ( line2 ) ) ;
+AND2X1 U116 (.IN1 ( n135 ) , .IN2 ( n148 ) , .Q ( n61 ) ) ;
+NOR2X0 U117 (.QN ( n135 ) , .IN1 ( n149 ) , .IN2 ( n137 ) ) ;
+NOR2X0 U118 (.QN ( n136 ) , .IN1 ( n137 ) , .IN2 ( n149 ) ) ;
+NAND2X0 U119 (.IN1 ( n92 ) , .IN2 ( n137 ) , .QN ( n91 ) ) ;
+AND2X1 U120 (.IN1 ( stato_2_N ) , .IN2 ( n131 ) , .Q ( n147 ) ) ;
+NAND2X1 U121 (.IN2 ( n148 ) , .IN1 ( n109 ) , .QN ( n108 ) ) ;
+NOR2X0 U122 (.QN ( n130 ) , .IN1 ( n1 ) , .IN2 ( n148 ) ) ;
+NOR2X0 U123 (.QN ( n97 ) , .IN1 ( n104 ) , .IN2 ( n105 ) ) ;
+AND2X1 U124 (.IN1 ( n106 ) , .IN2 ( n148 ) , .Q ( n104 ) ) ;
+INVX0 U125 (.ZN ( n86 ) , .INP ( reset ) ) ;
+NAND2X1 U127 (.IN2 ( n115 ) , .IN1 ( n114 ) , .QN ( n54 ) ) ;
+NAND2X0 U128 (.IN1 ( stato_1_N ) , .IN2 ( n149 ) , .QN ( n140 ) ) ;
+NAND2X0 U130 (.IN1 ( n129 ) , .IN2 ( n128 ) , .QN ( n142 ) ) ;
+NAND2X0 U131 (.IN1 ( n129 ) , .IN2 ( n128 ) , .QN ( n103 ) ) ;
+NAND2X0 U134 (.IN1 ( line2 ) , .IN2 ( line1 ) , .QN ( n105 ) ) ;
+NAND2X0 U135 (.IN1 ( line2 ) , .IN2 ( n84 ) , .QN ( n129 ) ) ;
+INVX0 U136 (.ZN ( n84 ) , .INP ( line1 ) ) ;
+NAND2X0 U137 (.IN1 ( stato_1_N ) , .IN2 ( n149 ) , .QN ( n106 ) ) ;
+NAND2X0 U138 (.IN1 ( n149 ) , .IN2 ( n148 ) , .QN ( n96 ) ) ;
+NOR2X0 U139 (.QN ( n125 ) , .IN1 ( stato_1_N ) , .IN2 ( n1 ) ) ;
+NAND2X0 U140 (.IN1 ( n105 ) , .IN2 ( n140 ) , .QN ( n109 ) ) ;
+NOR2X0 U141 (.QN ( n122 ) , .IN1 ( n130 ) , .IN2 ( n140 ) ) ;
+NOR2X0 U142 (.QN ( n114 ) , .IN1 ( n117 ) , .IN2 ( n118 ) ) ;
+NAND2X0 U143 (.IN1 ( n117 ) , .IN2 ( n105 ) , .QN ( n120 ) ) ;
+NAND2X0 U144 (.IN1 ( n107 ) , .IN2 ( n108 ) , .QN ( n55 ) ) ;
+NAND2X0 U145 (.IN1 ( n120 ) , .IN2 ( n119 ) , .QN ( n53 ) ) ;
+NAND2X0 U146 (.IN1 ( n91 ) , .IN2 ( n90 ) , .QN ( n56 ) ) ;
+NAND2X0 U147 (.IN1 ( n110 ) , .IN2 ( n137 ) , .QN ( n107 ) ) ;
+NOR2X0 U148 (.QN ( n119 ) , .IN1 ( n123 ) , .IN2 ( n122 ) ) ;
+NOR2X0 U149 (.QN ( n90 ) , .IN1 ( n98 ) , .IN2 ( n97 ) ) ;
+INVX0 U150 (.ZN ( n145 ) , .INP ( n112 ) ) ;
+NAND2X0 U151 (.IN1 ( n105 ) , .IN2 ( n146 ) , .QN ( n110 ) ) ;
+NOR2X0 U152 (.QN ( n146 ) , .IN1 ( n147 ) , .IN2 ( n145 ) ) ;
+NOR2X0 U153 (.QN ( n123 ) , .IN1 ( n124 ) , .IN2 ( n149 ) ) ;
+NOR2X0 U154 (.QN ( n98 ) , .IN1 ( n99 ) , .IN2 ( n1 ) ) ;
+NOR2X0 U155 (.QN ( n124 ) , .IN1 ( n126 ) , .IN2 ( n125 ) ) ;
+NOR2X0 U156 (.QN ( n99 ) , .IN1 ( n100 ) , .IN2 ( n61 ) ) ;
+NAND2X0 U157 (.IN1 ( stato_2_N ) , .IN2 ( n140 ) , .QN ( n116 ) ) ;
+NAND2X0 U158 (.IN1 ( stato_2_N ) , .IN2 ( stato_0_N ) , .QN ( n112 ) ) ;
+NAND2X0 U159 (.IN1 ( n137 ) , .IN2 ( stato_2_N ) , .QN ( n121 ) ) ;
+NAND2X0 U160 (.IN1 ( stato_2_N ) , .IN2 ( n1 ) , .QN ( n127 ) ) ;
+NAND2X0 U161 (.IN1 ( n142 ) , .IN2 ( n116 ) , .QN ( n115 ) ) ;
+NOR2X0 U162 (.QN ( n118 ) , .IN1 ( n131 ) , .IN2 ( n112 ) ) ;
+NOR2X0 U163 (.QN ( n117 ) , .IN1 ( n121 ) , .IN2 ( n142 ) ) ;
+NAND2X0 U164 (.IN1 ( n127 ) , .IN2 ( n113 ) , .QN ( n126 ) ) ;
+NAND2X0 U165 (.IN1 ( stato_2_N ) , .IN2 ( n103 ) , .QN ( n113 ) ) ;
+endmodule
+
+
diff --git a/tests/b14.sdf.gz b/tests/b14.sdf.gz
new file mode 100644
index 0000000..c0d6c7b
Binary files /dev/null and b/tests/b14.sdf.gz differ
diff --git a/tests/b14.stil.gz b/tests/b14.stil.gz
new file mode 100644
index 0000000..a8cea1b
Binary files /dev/null and b/tests/b14.stil.gz differ
diff --git a/tests/b14.v.gz b/tests/b14.v.gz
new file mode 100644
index 0000000..59c9911
Binary files /dev/null and b/tests/b14.v.gz differ
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..e82b2fc
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,8 @@
+import pytest
+
+
+@pytest.fixture
+def mydir():
+    """Return the absolute, symlink-resolved directory containing this file."""
+    from pathlib import Path
+    return Path(__file__).parent.resolve()
diff --git a/tests/gates.sdf b/tests/gates.sdf
new file mode 100644
index 0000000..1652ff0
--- /dev/null
+++ b/tests/gates.sdf
@@ -0,0 +1,29 @@
+(DELAYFILE
+(SDFVERSION "OVI 2.1")
+(DESIGN "gates")
+(DIVIDER /)
+(VOLTAGE 1.20:1.20:1.20)
+(PROCESS "TYPICAL")
+(TEMPERATURE 25.00:25.00:25.00)
+(TIMESCALE 1ns)
+(CELL
+  (CELLTYPE "NAND2X1")
+  (INSTANCE nandgate)
+  (DELAY
+    (ABSOLUTE
+    (IOPATH IN1 QN (0.099:0.103:0.103) (0.122:0.127:0.127))
+    (IOPATH IN2 QN (0.083:0.086:0.086) (0.100:0.104:0.104))
+    )
+  )
+)
+(CELL
+  (CELLTYPE "AND2X1")
+  (INSTANCE andgate)
+  (DELAY
+    (ABSOLUTE
+    (IOPATH IN1 Q (0.367:0.378:0.378) (0.351:0.377:0.377))
+    (IOPATH IN2 Q (0.366:0.375:0.375) (0.359:0.370:0.370))
+    )
+  )
+)
+)
\ No newline at end of file
diff --git a/tests/gates.v b/tests/gates.v
new file mode 100644
index 0000000..2fa07cd
--- /dev/null
+++ b/tests/gates.v
@@ -0,0 +1,11 @@
+module gates (a, b, o0, o1 );
+input a;
+input b;
+output o0;
+output o1;
+
+AND2X1 andgate (.IN1 ( a ) , .IN2 ( b ) , .Q ( o0 ) ) ;
+NAND2X1 nandgate (.IN1 ( a ) , .IN2 ( b ) , .QN ( o1 ) ) ;
+
+
+endmodule
\ No newline at end of file
diff --git a/tests/test_bench.py b/tests/test_bench.py
new file mode 100644
index 0000000..800f9be
--- /dev/null
+++ b/tests/test_bench.py
@@ -0,0 +1,15 @@
+from kyupy import bench
+
+
+def test_b01(mydir):  # b01 must yield 92 nodes from both text and path input
+    bench_file = mydir / 'b01.bench'
+    from_text = bench.parse(bench_file.read_text())
+    assert len(from_text.nodes) == 92
+    from_path = bench.parse(bench_file)
+    assert len(from_path.nodes) == 92
+
+
+def test_simple():  # netlist is passed inline as a string
+    circuit = bench.parse('input(a, b) output(z) z=and(a,b)')
+    node_count, port_count = len(circuit.nodes), len(circuit.interface)
+    assert (node_count, port_count) == (4, 3)
diff --git a/tests/test_circuit.py b/tests/test_circuit.py
new file mode 100644
index 0000000..d61e8aa
--- /dev/null
+++ b/tests/test_circuit.py
@@ -0,0 +1,56 @@
+from kyupy.circuit import Circuit, Node, Line
+
+
+def test_circuit():  # builds a small circuit by hand and checks its bookkeeping
+    c = Circuit()
+    in1 = Node(c, 'in1', 'buf')
+    in2 = Node(c, 'in2', 'buf')
+    out1 = Node(c, 'out1', 'buf')
+    # nodes are expected to be registered in c.cells under their names
+    assert 'in1' in c.cells
+    assert 'and1' not in c.cells
+    # assign the interface nodes by position
+    c.interface[0] = in1
+    c.interface[1] = in2
+    c.interface[2] = out1
+    # add an AND node and wire it up: in1, in2 -> and1 -> out1
+    and1 = Node(c, 'and1', kind='and')
+    Line(c, in1, and1)
+    Line(c, in2, and1)
+    Line(c, and1, out1)
+    # each line shows up in outs of its driver and in ins of its reader
+    assert len(in1.ins) == 0
+    assert len(in1.outs) == 1
+    assert len(in2.outs) == 1
+
+    assert in1.outs[0].reader == and1
+    assert in1.outs[0].driver == in1
+
+    assert len(and1.ins) == 2
+    assert len(and1.outs) == 1
+    # the reader may also be given as a tuple, presumably (node, pin index)
+    or1 = Node(c, 'or1', 'or')
+    Line(c, and1, (or1, 1))
+
+    or2 = Node(c, 'or2', 'or')
+    or3 = Node(c, 'or3', 'or')
+    # or2 and or3 are the sixth and seventh node created in this circuit
+    assert or2.index == 5
+    assert or3.index == 6
+    # removing or2 is expected to move or3 down into the freed index
+    assert len(c.nodes) == 7
+    or2.remove()
+    or3 = c.cells['or3']
+    assert or3.index == 5
+    assert 'or2' not in c.cells
+    assert len(c.nodes) == 6
+
+    c.cells['or3'].remove()
+    assert 'or3' not in c.cells
+    assert len(c.nodes) == 5
+    # string conversions must not raise
+    repr(c)
+    str(c)
+    # iterating in topological order must work, as must repr() of each node
+    for n in c.topological_order():
+        repr(n)
diff --git a/tests/test_logic_sim.py b/tests/test_logic_sim.py
new file mode 100644
index 0000000..df50546
--- /dev/null
+++ b/tests/test_logic_sim.py
@@ -0,0 +1,161 @@
+from kyupy.logic_sim import LogicSim
+from kyupy import bench
+from kyupy.packed_vectors import PackedVectors
+
+
+def test_vd1():  # AND, OR and NOT over all four 0/1 input pairs
+    circuit = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
+    sim = LogicSim(circuit, 4)
+    assert len(sim.interface) == 5
+    cases = [('00000', '00001'),  # (assigned vector, expected after capture)
+             ('01000', '01011'),
+             ('10000', '10010'),
+             ('11000', '11110')]
+    vectors = PackedVectors(4, len(sim.interface))
+    for idx, (stimulus, _) in enumerate(cases):
+        vectors[idx] = stimulus
+    sim.assign(vectors)
+    sim.propagate()
+    sim.capture(vectors)
+    for idx, (_, response) in enumerate(cases):
+        assert vectors[idx] == response
+
+
+def test_vd2():
+    """Simulate AND, OR and NOT for every pairing of the values 0, 1, - and X.
+
+    Each table row holds the assigned vector and the vector expected after
+    capture. Judging by the rows, the positions are x, y, a, o, n; the
+    three output positions are assigned '0' and overwritten by capture.
+    """
+    circuit = bench.parse('input(x, y) output(a, o, n) a=and(x,y) o=or(x,y) n=not(x)')
+    sim = LogicSim(circuit, 16, 2)
+    assert len(sim.interface) == 5
+
+    # (assigned vector, expected vector after capture)
+    cases = [
+        ('00000', '00001'),
+        ('01000', '01011'),
+        ('0-000', '0-0X1'),
+        ('0X000', '0X0X1'),
+        ('10000', '10010'),
+        ('11000', '11110'),
+        ('1-000', '1-X10'),
+        ('1X000', '1XX10'),
+        ('-0000', '-00XX'),
+        ('-1000', '-1X1X'),
+        ('--000', '--XXX'),
+        ('-X000', '-XXXX'),
+        ('X0000', 'X00XX'),
+        ('X1000', 'X1X1X'),
+        ('X-000', 'X-XXX'),
+        ('XX000', 'XXXXX'),
+    ]
+
+    vectors = PackedVectors(16, len(sim.interface), 2)
+    for idx, (stimulus, _) in enumerate(cases):
+        vectors[idx] = stimulus
+    sim.assign(vectors)
+    sim.propagate()
+    sim.capture(vectors)
+    # inputs must read back unchanged, outputs must show the simulated values
+    for idx, (_, response) in enumerate(cases):
+        assert vectors[idx] == response
+
+    
+def test_vd3():  # every row below is both the stimulus and its own expected response
+    c = bench.parse('input(x, y) output(a, o, n, xo) a=and(x,y) o=or(x,y) n=not(x) xo=xor(x,y)')
+    s = LogicSim(c, 64, 3)
+    assert len(s.interface) == 6
+    p = PackedVectors(64, len(s.interface), 3)
+    p[0] = '000010'  # rows 0-7: first position is '0'
+    p[1] = '010111'
+    p[2] = '0-0X1X'
+    p[3] = '0X0X1X'
+    p[4] = '0R0R1R'
+    p[5] = '0F0F1F'
+    p[6] = '0P0P1P'
+    p[7] = '0N0N1N'
+    p[8] = '100101'  # rows 8-15: first position is '1'
+    p[9] = '111100'
+    p[10] = '1-X10X'
+    p[11] = '1XX10X'
+    p[12] = '1RR10F'
+    p[13] = '1FF10R'
+    p[14] = '1PP10N'
+    p[15] = '1NN10P'
+    p[16] = '-00XXX'  # rows 16-23: first position is '-'
+    p[17] = '-1X1XX'
+    p[18] = '--XXXX'
+    p[19] = '-XXXXX'
+    p[20] = '-RXXXX'
+    p[21] = '-FXXXX'
+    p[22] = '-PXXXX'
+    p[23] = '-NXXXX'
+    p[24] = 'X00XXX'  # rows 24-31: first position is 'X'
+    p[25] = 'X1X1XX'
+    p[26] = 'X-XXXX'
+    p[27] = 'XXXXXX'
+    p[28] = 'XRXXXX'
+    p[29] = 'XFXXXX'
+    p[30] = 'XPXXXX'
+    p[31] = 'XNXXXX'
+    p[32] = 'R00RFR'  # rows 32-39: first position is 'R'
+    p[33] = 'R1R1FF'
+    p[34] = 'R-XXFX'
+    p[35] = 'RXXXFX'
+    p[36] = 'RRRRFP'
+    p[37] = 'RFPNFN'
+    p[38] = 'RPPRFR'
+    p[39] = 'RNRNFF'
+    p[40] = 'F00FRF'  # rows 40-47: first position is 'F'
+    p[41] = 'F1F1RR'
+    p[42] = 'F-XXRX'
+    p[43] = 'FXXXRX'
+    p[44] = 'FRPNRN'
+    p[45] = 'FFFFRP'
+    p[46] = 'FPPFRF'
+    p[47] = 'FNFNRR'
+    p[48] = 'P00PNP'  # rows 48-55: first position is 'P'
+    p[49] = 'P1P1NN'
+    p[50] = 'P-XXNX'
+    p[51] = 'PXXXNX'
+    p[52] = 'PRPRNR'
+    p[53] = 'PFPFNF'
+    p[54] = 'PPPPNP'
+    p[55] = 'PNPNNN'
+    p[56] = 'N00NPN'  # rows 56-63: first position is 'N'
+    p[57] = 'N1N1PP'
+    p[58] = 'N-XXPX'
+    p[59] = 'NXXXPX'
+    p[60] = 'NRRNPF'
+    p[61] = 'NFFNPR'
+    p[62] = 'NPPNPN'
+    p[63] = 'NNNNPP'
+    expect = p.copy()  # snapshot of the table taken before simulation
+    s.assign(p)
+    s.propagate()
+    s.capture(p)  # the simulation results are read back into p
+    for i in range(64):
+        assert p[i] == expect[i]  # simulation must reproduce each row unchanged
+        
+
+def test_b01(mydir):
+    """Smoke-test simulation of b01 with random vectors (no value checks)."""
+    circuit = bench.parse(mydir / 'b01.bench')
+    # 2-valued
+    sim2 = LogicSim(circuit, 8)
+    assert len(sim2.interface) == 9
+    vectors2 = PackedVectors(8, len(sim2.interface))
+    vectors2.randomize()
+    sim2.assign(vectors2)
+    sim2.propagate()
+    sim2.capture(vectors2)
+
+    # 8-valued
+    sim8 = LogicSim(circuit, 8, 3)
+    vectors8 = PackedVectors(8, len(sim8.interface), 3)
+    vectors8.randomize()
+    sim8.assign(vectors8)
+    sim8.propagate()
+    sim8.capture(vectors8)
diff --git a/tests/test_packed_vectors.py b/tests/test_packed_vectors.py
new file mode 100644
index 0000000..2f2a4a0
--- /dev/null
+++ b/tests/test_packed_vectors.py
@@ -0,0 +1,88 @@
+from kyupy.packed_vectors import PackedVectors
+
+
+def test_basic():
+    # third constructor argument 1: vectors start out as all '0'
+    vecs = PackedVectors(8, 1, 1)
+    assert str(vecs) == '0\n0\n0\n0\n0\n0\n0\n0'
+    # several spellings of logic high and logic low are accepted
+    for row, value in enumerate([1, 'H', 'h', True, 0, 'L', 'l', False]):
+        vecs.set_value(row, 0, value)
+    assert str(vecs) == '1\n1\n1\n1\n0\n0\n0\n0'
+    vecs.set_value(1, 0, '0')
+    vecs.set_value(5, 0, '1')
+    assert str(vecs) == '1\n0\n1\n1\n0\n1\n0\n0'
+
+    # third constructor argument 2: vectors start out as all '-'
+    vecs = PackedVectors(8, 1, 2)
+    assert str(vecs) == '-\n-\n-\n-\n-\n-\n-\n-'
+    vecs.set_value(0, 0, 1)
+    vecs.set_value(7, 0, 0)
+    vecs.set_value(4, 0, 'X')
+    assert str(vecs) == '1\n-\n-\n-\nX\n-\n-\n0'
+    vecs.set_value(4, 0, '-')
+    assert str(vecs) == '1\n-\n-\n-\n-\n-\n-\n0'
+
+    # two positions per vector; set_values writes a whole vector at once
+    vecs = PackedVectors(8, 2, 2)
+    assert str(vecs) == '--\n--\n--\n--\n--\n--\n--\n--'
+    vecs.set_value(0, 0, '1')
+    vecs.set_value(7, 1, '0')
+    vecs.set_values(1, 'XX')
+    assert str(vecs) == '1-\nXX\n--\n--\n--\n--\n--\n-0'
+
+
+def test_8v():
+    # a fresh vector holds '-' at every position
+    vec = PackedVectors(1, 8, 3)
+    assert str(vec) == '--------'
+    # alternative spellings all normalize to the same canonical symbols
+    for spelling in (r'-x01^v\/', '-XLHPNFR', '-xlhpnfr'):
+        vec.set_values(0, spelling)
+        assert str(vec) == '-X01PNFR'
+
+    # pairing two vectors yields one symbol per position (0, F, R or 1 here)
+    first = PackedVectors(1, 8, 1)
+    second = PackedVectors(1, 8, 1)
+    first.set_values(0, '01010101')
+    second.set_values(0, '00110011')
+    assert str(PackedVectors.from_pair(first, second)) == '0FR10FR1'
+
+    # a '-' or 'X' on either side of the pair must result in '-'
+    first = PackedVectors(1, 8, 2)
+    second = PackedVectors(1, 8, 2)
+    first.set_values(0, '0101-X-X')
+    second.set_values(0, '00110011')
+    assert str(PackedVectors.from_pair(first, second)) == '0FR1----'
+    first.set_values(0, '0101-X-X')
+    second.set_values(0, '-X-X--XX')
+    assert str(PackedVectors.from_pair(first, second)) == '--------'
+
+
+def test_slicing():  # an int index gives one vector, a slice gives a sub-range
+    vectors = PackedVectors(3, 2, 1)
+    assert str(vectors) == '00\n00\n00'
+    for position in (0, 1):
+        vectors.set_value(1, position, '1')
+    assert len(vectors) == 3
+    assert vectors[0] == '00'
+    assert vectors[1] == '11'
+    tail = vectors[1:3]
+    assert len(tail) == 2
+    assert tail[0] == '11'
+    assert tail[1] == '00'
+
+
+def test_copy():  # masked copies split the vectors; '+' joins them again
+    base = PackedVectors(8, 1, 1)
+    base.set_values_for_position(0, '01010101')
+    other = PackedVectors(8, 1, 1)
+    other.set_values_for_position(0, '00100101')
+    mask = base.diff(other)  # the two value strings differ at vectors 1, 2 and 3
+    differing = base.copy(selection_mask=mask)
+    matching = base.copy(selection_mask=~mask)
+    assert str(differing) == '1\n0\n1'
+    assert str(matching) == '0\n0\n1\n0\n1'
+    joined = differing + matching
+    assert str(joined) == '1\n0\n1\n0\n0\n1\n0\n1'
+
diff --git a/tests/test_sdf.py b/tests/test_sdf.py
new file mode 100644
index 0000000..61932fa
--- /dev/null
+++ b/tests/test_sdf.py
@@ -0,0 +1,100 @@
+from kyupy import sdf, verilog
+from kyupy.saed import pin_index
+
+
+def test_parse():  # sample covers INTERCONNECT, IOPATH (incl. empty delays) and TIMINGCHECK
+    sdf_source = '''
+    (DELAYFILE
+    (SDFVERSION "OVI 2.1")
+    (DESIGN "test")
+    (DATE "Wed May 31 14:46:06 2017")
+    (VENDOR "saed90nm_max")
+    (PROGRAM "Synopsys Design Compiler cmos-annotated")
+    (VERSION "I-2013.12-ICC-SP3")
+    (DIVIDER /)
+    (VOLTAGE 1.20:1.20:1.20)
+    (PROCESS "TYPICAL")
+    (TEMPERATURE 25.00:25.00:25.00)
+    (TIMESCALE 1ns)
+    (CELL
+      (CELLTYPE "b14")
+      (INSTANCE)
+      (DELAY
+        (ABSOLUTE
+        (INTERCONNECT U621/ZN U19246/IN1 (0.000:0.000:0.000))
+        (INTERCONNECT U13292/QN U19246/IN2 (0.001:0.001:0.001))
+        (INTERCONNECT U15050/QN U19247/IN1 (0.000:0.000:0.000))
+        (INTERCONNECT U13293/QN U19247/IN2 (0.000:0.000:0.000) (0.000:0.000:0.000))
+        )
+      )
+    )
+    (CELL
+      (CELLTYPE "INVX2")
+      (INSTANCE U78)
+      (DELAY
+        (ABSOLUTE
+        (IOPATH INP ZN (0.201:0.227:0.227) (0.250:0.271:0.271))
+        )
+      )
+    )
+    (CELL
+      (CELLTYPE "SDFFARX1")
+      (INSTANCE reg3_reg_1_0)
+      (DELAY
+        (ABSOLUTE
+        (IOPATH (posedge CLK) Q (0.707:0.710:0.710) (0.737:0.740:0.740))
+        (IOPATH (negedge RSTB) Q () (0.909:0.948:0.948))
+        (IOPATH (posedge CLK) QN (0.585:0.589:0.589) (0.545:0.550:0.550))
+        (IOPATH (negedge RSTB) QN (1.546:1.593:1.593) ())
+        )
+      )
+      (TIMINGCHECK
+        (WIDTH (posedge CLK) (0.284:0.284:0.284))
+        (WIDTH (negedge CLK) (0.642:0.642:0.642))
+        (SETUP (posedge D) (posedge CLK) (0.544:0.553:0.553))
+        (SETUP (negedge D) (posedge CLK) (0.620:0.643:0.643))
+        (HOLD (posedge D) (posedge CLK) (-0.321:-0.331:-0.331))
+        (HOLD (negedge D) (posedge CLK) (-0.196:-0.219:-0.219))
+        (RECOVERY (posedge RSTB) (posedge CLK) (-1.390:-1.455:-1.455))
+        (HOLD (posedge RSTB) (posedge CLK) (1.448:1.509:1.509))
+        (SETUP (posedge SE) (posedge CLK) (0.662:0.670:0.670))
+        (SETUP (negedge SE) (posedge CLK) (0.698:0.702:0.702))
+        (HOLD (posedge SE) (posedge CLK) (-0.435:-0.444:-0.444))
+        (HOLD (negedge SE) (posedge CLK) (-0.291:-0.295:-0.295))
+        (SETUP (posedge SI) (posedge CLK) (0.544:0.544:0.544))
+        (SETUP (negedge SI) (posedge CLK) (0.634:0.688:0.688))
+        (HOLD (posedge SI) (posedge CLK) (-0.317:-0.318:-0.318))
+        (HOLD (negedge SI) (posedge CLK) (-0.198:-0.247:-0.247))
+        (WIDTH (negedge RSTB) (0.345:0.345:0.345))
+    )))
+    '''
+    delay_file = sdf.parse(sdf_source)
+    # only the design name is checked; getting here means parsing did not raise
+    assert 'test' == delay_file.name
+
+
+def test_b14(mydir):  # the .gz file is handed to the parser as a path
+    delay_file = sdf.parse(mydir / 'b14.sdf.gz')
+    assert 'b14' == delay_file.name
+
+
+def test_gates(mydir):
+    """Check the delays that gates.sdf annotates onto the lines of gates.v."""
+    circuit = verilog.parse(mydir / 'gates.v')
+    delay_file = sdf.parse(mydir / 'gates.sdf')
+    line_times = delay_file.annotation(circuit, pin_index, dataset=1)
+
+    # (cell, input pin, value at [line, 0, 0], value at [line, 0, 1]); the
+    # values are the second entries of the IOPATH triples in gates.sdf, and
+    # ins[0] / ins[1] correspond to the IN1 / IN2 paths listed there
+    expected = (
+        ('nandgate', 0, 0.103, 0.127),
+        ('nandgate', 1, 0.086, 0.104),
+        ('andgate', 0, 0.378, 0.377),
+        ('andgate', 1, 0.375, 0.370),
+    )
+
+    for cell, pin, first, second in expected:
+        line = circuit.cells[cell].ins[pin]
+        assert line_times[line.index, 0, 0] == first
+        assert line_times[line.index, 0, 1] == second
diff --git a/tests/test_stil.py b/tests/test_stil.py
new file mode 100644
index 0000000..08faef3
--- /dev/null
+++ b/tests/test_stil.py
@@ -0,0 +1,9 @@
+from kyupy import stil
+
+
+def test_b14(mydir):
+    # expected element counts of the bundled b14 STIL file
+    parsed = stil.parse(mydir / 'b14.stil.gz')
+    counts = (len(parsed.signal_groups), len(parsed.scan_chains), len(parsed.calls))
+    assert counts == (10, 1, 3259)
+
diff --git a/tests/test_verilog.py b/tests/test_verilog.py
new file mode 100644
index 0000000..1a4aef9
--- /dev/null
+++ b/tests/test_verilog.py
@@ -0,0 +1,9 @@
+from kyupy import verilog
+
+
+def test_b01(mydir):  # parsing must succeed for both source text and a path
+    netlist = mydir / 'b01.v'
+    from_text = verilog.parse(netlist.read_text())
+    assert from_text is not None
+    assert verilog.parse(netlist) is not None
+
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
new file mode 100644
index 0000000..38560da
--- /dev/null
+++ b/tests/test_wave_sim.py
@@ -0,0 +1,138 @@
+import numpy as np
+from kyupy.wave_sim import WaveSim, wave_eval, TMIN, TMAX
+from kyupy.logic_sim import LogicSim
+from kyupy import verilog
+from kyupy import sdf
+from kyupy.saed import pin_index
+from kyupy.packed_vectors import PackedVectors
+from kyupy.wave_sim_cuda import WaveSimCuda
+
+
+def test_wave_eval():  # exercises wave_eval on one gate with inputs A, B and output Z
+    # SDF specifies IOPATH delays with respect to output polarity
+    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
+    line_times = np.zeros((3, 2, 2))
+    line_times[0, 0, 0] = 0.1  # A -> Z rise delay
+    line_times[0, 0, 1] = 0.2  # A -> Z fall delay
+    line_times[0, 1, 0] = 0.1  # A -> Z negative pulse limit (terminate in rising Z)
+    line_times[0, 1, 1] = 0.2  # A -> Z positive pulse limit
+    line_times[1, 0, 0] = 0.3  # as above for B -> Z
+    line_times[1, 0, 1] = 0.4
+    line_times[1, 1, 0] = 0.3
+    line_times[1, 1, 1] = 0.4
+    # line_times[2] is left at zero
+    state = np.zeros((3*16, 1)) + TMAX  # 3 waveforms of capacity 16
+    state[::16, 0] = 16  # first entry is capacity
+    a = state[0:16, 0]  # a, b and z are views: writes go straight into state
+    b = state[16:32, 0]
+    z = state[32:, 0]
+    sat = np.zeros((3, 2), dtype='int')  # apparently one (offset, capacity) row per waveform
+    sat[0] = 0, 16
+    sat[1] = 16, 16
+    sat[2] = 32, 16
+    # op tuple: presumably (gate function, Z index, A index, B index) — TODO confirm
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMIN == z[0]
+
+    a[0] = TMIN
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMIN == z[0]
+    
+    b[0] = TMIN
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMAX == z[0]
+
+    a[0] = 1  # A _/^^^
+    b[0] = 2  # B __/^^
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMIN == z[0]  # ^^^\___ B -> Z fall delay
+    assert 2.4 == z[1]
+    assert TMAX == z[2]
+
+    a[0] = TMIN  # A ^^^^^^
+    b[0] = TMIN  # B ^^^\__
+    b[1] = 2
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert 2.3 == z[0]  # ___/^^^ B -> Z rise delay
+    assert TMAX == z[1]
+
+    # B is low for 0.35 -> positive pulse of 0.45 at Z after delays (2.3 to 2.75)
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = TMIN
+    b[1] = 2     # B ^^\__/^^
+    b[2] = 2.35
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert 2.3 == z[0]  # __/^^\__
+    assert 2.75 == z[1]
+    assert TMAX == z[2]
+
+    # B is high for 0.45 -> negative pulse of 0.35 at Z after delays (2.4 to 2.75)
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = 2  # B __/^^\__
+    b[1] = 2.45
+    b[2] = TMAX
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMIN == z[0]  # ^^\__/^^
+    assert 2.4 == z[1]
+    assert 2.75 == z[2]
+    assert TMAX == z[3]
+
+    # B is high for 0.35 -> negative pulse of 0.25 at Z, below the 0.3 limit (filtered)
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = 2  # B __/^^\__
+    b[1] = 2.35
+    b[2] = TMAX
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMIN == z[0]  # ^^^^^^
+    assert TMAX == z[1]
+
+    # B is low for 0.25 -> positive pulse of 0.35 at Z, below the 0.4 limit (filtered)
+    a[0] = TMIN  # A ^^^^^^^^
+    b[0] = TMIN
+    b[1] = 2  # B ^^\__/^^
+    b[2] = 2.25
+    wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times)
+    assert TMAX == z[0]  # ______
+
+
+def compare_to_logic_sim(wsim):
+    """Assert that the final values captured from *wsim* match a LogicSim run."""
+    tests = PackedVectors(wsim.sims, len(wsim.interface), 3)
+    tests.randomize()
+    wsim.assign(tests)
+    wsim.propagate(8)  # NOTE(review): 8 presumably equals wsim.sims of the callers — confirm
+    cap = np.zeros((len(wsim.interface), wsim.sims))
+    wsim.capture(probabilities=cap)
+    # overwrite positions that have inputs with the thresholded capture result
+    resp = tests.copy()
+    for iidx, inode in enumerate(wsim.interface):
+        if len(inode.ins) > 0:
+            for vidx in range(wsim.sims):
+                resp.set_value(vidx, iidx, 0 if cap[iidx, vidx] < 0.5 else 1)
+
+    lsim = LogicSim(wsim.circuit, len(tests), 3)
+    lsim.assign(tests)
+    lsim.propagate()
+    exp = tests.copy()
+    lsim.capture(exp)
+
+    # compare every vector (not just the first 8) after mapping R, N -> 1 and F, P -> 0
+    final_value = str.maketrans('RFPN', '1001')
+    for i in range(len(tests)):
+        assert resp[i].translate(final_value) == exp[i].translate(final_value)
+
+
+def test_b14(mydir):  # WaveSim results for b14 must agree with LogicSim
+    circuit = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
+    delay_file = sdf.parse(mydir / 'b14.sdf.gz')
+    line_times = delay_file.annotation(circuit, pin_index)
+    simulator = WaveSim(circuit, line_times, 8)
+    compare_to_logic_sim(simulator)
+
+
+def test_b14_cuda(mydir):  # WaveSimCuda results for b14 must agree with LogicSim
+    circuit = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
+    delay_file = sdf.parse(mydir / 'b14.sdf.gz')
+    line_times = delay_file.annotation(circuit, pin_index)
+    simulator = WaveSimCuda(circuit, line_times, 8)
+    compare_to_logic_sim(simulator)