updated b14 benchmark, update wavesim capture api, expand usage examples

5 years ago · e6ae009969
11 changed files with 624 additions and 91 deletions
--- a/UsageExamples.ipynb
+++ b/UsageExamples.ipynb
@ -18,15 +18,7 @@
				@@ -18,15 +18,7 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0000000.334 W Cuda unavailable. Falling back to pure python\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from kyupy import bench\n",
    "\n",
@ -370,7 +362,7 @@
				@@ -370,7 +362,7 @@
    {
     "data": {
      "text/plain": [
-       "<Circuit 'b14' with 15864 nodes, 23087 lines, 91 ports>"
+       "<Circuit 'b14' with 31715 nodes, 46891 lines, 91 ports>"
      ]
     },
     "execution_count": 13,
@ -388,20 +380,42 @@
				@@ -388,20 +380,42 @@
  {
   "cell_type": "code",
   "execution_count": 14,
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "chain length 229\n",
-      "['Scan_Out', 'u04_opt1329', 'u04_opt1328', 'wr_reg', 'u04_opt11', 'state_reg_0_0', 'reg3_reg_28_0', 'reg3_reg_27_0', 'reg3_reg_26_0', 'reg3_reg_25_0', 'reg3_reg_24_0', 'u04_opt1123', 'reg3_reg_23_0', 'reg3_reg_22_0', 'reg3_reg_21_0', 'u04_opt1118', 'reg3_reg_20_0', 'reg3_reg_19_0', 'reg3_reg_18_0', 'reg3_reg_17_0', 'reg3_reg_16_0', 'reg3_reg_15_0', 'reg3_reg_14_0', 'reg3_reg_13_0', 'reg3_reg_12_0', 'reg3_reg_11_0', 'reg3_reg_10_0', 'reg3_reg_9_0', 'reg3_reg_8_0', 'reg3_reg_7_0', 'reg3_reg_6_0', 'reg3_reg_5_0', 'reg3_reg_4_0', 'reg3_reg_3_0', 'reg3_reg_2_0', 'reg3_reg_1_0', 'reg3_reg_0_0', 'reg2_reg_31_0', 'reg2_reg_30_0', 'reg2_reg_29_0', 'reg2_reg_28_0', 'reg2_reg_27_0', 'reg2_reg_26_0', 'reg2_reg_25_0', 'reg2_reg_24_0', 'reg2_reg_23_0', 'reg2_reg_22_0', 'reg2_reg_21_0', 'reg2_reg_20_0', 'reg2_reg_19_0', 'reg2_reg_18_0', 'reg2_reg_17_0', 'reg2_reg_16_0', 'reg2_reg_15_0', 'reg2_reg_14_0', 'reg2_reg_13_0', 'reg2_reg_12_0', 'reg2_reg_11_0', 'reg2_reg_10_0', 'reg2_reg_9_0', 'reg2_reg_8_0', 'reg2_reg_7_0', 'reg2_reg_6_0', 'reg2_reg_5_0', 'reg2_reg_4_0', 'reg2_reg_3_0', 'reg2_reg_2_0', 'reg2_reg_1_0', 'reg2_reg_0_0', 'reg1_reg_31_0', 'reg1_reg_30_0', 'reg1_reg_29_0', 'reg1_reg_28_0', 'reg1_reg_27_0', 'reg1_reg_26_0', 'reg1_reg_25_0', 'reg1_reg_24_0', 'reg1_reg_23_0', 'reg1_reg_22_0', 'reg1_reg_21_0', 'reg1_reg_20_0', 'reg1_reg_19_0', 'reg1_reg_18_0', 'reg1_reg_17_0', 'reg1_reg_16_0', 'reg1_reg_15_0', 'reg1_reg_14_0', 'reg1_reg_13_0', 'reg1_reg_12_0', 'reg1_reg_11_0', 'reg1_reg_10_0', 'reg1_reg_9_0', 'reg1_reg_8_0', 'reg1_reg_7_0', 'reg1_reg_6_0', 'reg1_reg_5_0', 'reg1_reg_4_0', 'reg1_reg_3_0', 'reg1_reg_2_0', 'reg1_reg_1_0', 'reg1_reg_0_0', 'reg0_reg_31_0', 'reg0_reg_30_0', 'reg0_reg_29_0', 'reg0_reg_28_0', 'reg0_reg_27_0', 'reg0_reg_26_0', 'reg0_reg_25_0', 'reg0_reg_24_0', 'reg0_reg_23_0', 'reg0_reg_22_0', 'reg0_reg_21_0', 'reg0_reg_20_0', 'reg0_reg_19_0', 'reg0_reg_18_0', 'reg0_reg_17_0', 'reg0_reg_16_0', 'reg0_reg_15_0', 'reg0_reg_14_0', 'reg0_reg_13_0', 
'reg0_reg_12_0', 'reg0_reg_11_0', 'reg0_reg_10_0', 'reg0_reg_9_0', 'reg0_reg_8_0', 'reg0_reg_7_0', 'reg0_reg_6_0', 'reg0_reg_5_0', 'reg0_reg_4_0', 'reg0_reg_3_0', 'reg0_reg_2_0', 'reg0_reg_1_0', 'reg0_reg_0_0', 'rd_reg', 'datao_reg_31_0', 'datao_reg_30_0', 'datao_reg_29_0', 'datao_reg_28_0', 'datao_reg_27_0', 'datao_reg_26_0', 'datao_reg_25_0', 'datao_reg_24_0', 'datao_reg_23_0', 'datao_reg_22_0', 'datao_reg_21_0', 'datao_reg_20_0', 'datao_reg_19_0', 'datao_reg_18_0', 'datao_reg_17_0', 'datao_reg_16_0', 'datao_reg_15_0', 'datao_reg_14_0', 'datao_reg_13_0', 'datao_reg_12_0', 'datao_reg_11_0', 'datao_reg_10_0', 'datao_reg_9_0', 'datao_reg_8_0', 'datao_reg_7_0', 'datao_reg_6_0', 'datao_reg_5_0', 'datao_reg_4_0', 'datao_reg_3_0', 'datao_reg_2_0', 'datao_reg_1_0', 'datao_reg_0_0', 'd_reg_1_0', 'd_reg_0_0', 'addr_reg_19_0', 'addr_reg_18_0', 'addr_reg_17_0', 'addr_reg_16_0', 'addr_reg_15_0', 'addr_reg_14_0', 'addr_reg_13_0', 'addr_reg_12_0', 'addr_reg_11_0', 'addr_reg_10_0', 'addr_reg_9_0', 'addr_reg_8_0', 'addr_reg_7_0', 'addr_reg_6_0', 'addr_reg_5_0', 'addr_reg_4_0', 'addr_reg_3_0', 'addr_reg_2_0', 'addr_reg_1_0', 'addr_reg_0_0', 'u04_opt1491', 'u04_opt1492', 'u04_opt1364', 'u04_opt1411', 'IR_reg_31_0', 'IR_reg_30_0', 'IR_reg_29_0', 'u04_opt1427', 'IR_reg_28_0', 'IR_reg_27_0', 'IR_reg_26_0', 'IR_reg_25_0', 'IR_reg_24_0', 'IR_reg_23_0', 'IR_reg_22_0', 'IR_reg_21_0', 'IR_reg_20_0', 'IR_reg_19_0', 'IR_reg_18_0', 'IR_reg_17_0', 'IR_reg_16_0', 'IR_reg_15_0', 'IR_reg_14_0', 'IR_reg_13_0', 'IR_reg_12_0', 'IR_reg_11_0', 'IR_reg_10_0', 'IR_reg_9_0', 'IR_reg_8_0', 'IR_reg_7_0', 'IR_reg_6_0', 'IR_reg_5_0', 'IR_reg_4_0', 'IR_reg_3_0', 'IR_reg_2_0', 'u04_opt1347', 'IR_reg_1_0', 'U14573', 'IR_reg_0_0', 'B_reg', 'Scan_In']\n"
+      "chain length 287\n",
+      "output test_so000\n",
+      "NBUFFX8_RVT HFSBUF_36_76\n",
+      "SDFFARX1_RVT wr_reg\n",
+      "INVX4_RVT HFSINV_691_254\n",
+      "INVX0_RVT HFSINV_2682_255\n",
+      "SDFFARX1_RVT state_reg\n",
+      "NBUFFX2_RVT ZBUF_55_inst_860\n",
+      "SDFFARX1_RVT reg3_reg_28_\n",
+      "SDFFARX1_RVT reg3_reg_27_\n",
+      "SDFFARX1_RVT reg3_reg_26_\n",
+      "...\n",
+      "NBUFFX2_RVT ZBUF_1656_inst_2160\n",
+      "SDFFARX1_RVT IR_reg_3_\n",
+      "NBUFFX2_RVT ZBUF_85_inst_865\n",
+      "SDFFARX1_RVT IR_reg_2_\n",
+      "SDFFARX1_RVT IR_reg_1_\n",
+      "SDFFARX1_RVT IR_reg_0_\n",
+      "NBUFFX2_RVT ZBUF_17_inst_905\n",
+      "NBUFFX4_RVT ZBUF_275_inst_906\n",
+      "SDFFARX1_RVT B_reg\n",
+      "input test_si000\n"
     ]
    }
   ],
   "source": [
    "chain = []\n",
-    "cell = b14.cells['Scan_Out']\n",
+    "cell = b14.cells['test_so000']\n",
    "chain.append(cell)\n",
    "while len(cell.ins) > 0:\n",
    "    cell = cell.ins[2 if 'SDFF' in cell.kind else 0].driver\n",
@ -409,14 +423,25 @@
				@@ -409,14 +423,25 @@
    "        chain.append(cell)\n",
    "        \n",
    "print('chain length', len(chain))\n",
-    "print([c.name for c in chain])"
+    "for c in chain[:10]:\n",
+    "    print(c.kind, c.name)\n",
+    "print('...')\n",
+    "for c in chain[-10:]:\n",
+    "    print(c.kind, c.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Working With Test Data and Logic Simulation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Loading SDFs and STILs"
+    "Load a stuck-at fault test pattern set and expected fault-free responses from a STIL file."
   ]
  },
  {
@ -425,15 +450,21 @@
				@@ -425,15 +450,21 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from kyupy import verilog, sdf\n",
-    "from kyupy.saed import pin_index\n",
-    "from kyupy import stil\n",
+    "from kyupy import verilog, stil\n",
+    "from kyupy.logic_sim import LogicSim\n",
+    "from kyupy.packed_vectors import PackedVectors\n",
    "\n",
    "b14 = verilog.parse('tests/b14.v.gz')\n",
-    "df = sdf.parse('tests/b14.sdf.gz')\n",
-    "lt = df.annotation(b14, pin_index, interconnect=False)\n",
-    "s = stil.parse('tests/b14.stil.gz')\n",
-    "t = s.tests8v(b14)"
+    "s = stil.parse('tests/b14.stuck.stil.gz')\n",
+    "stuck_tests = s.tests(b14)\n",
+    "stuck_responses = s.responses(b14)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Tests and responses are instances of `PackedVectors`. The length of a `PackedVectors` instance is the number of test vectors stored (`nvectors`), its `width` is the number of values in a vector, and its `vdim` is the number of bits used for storing one value. By default, the stil parser returns 4-valued test vectors (`vdim=2`)."
   ]
  },
  {
@ -444,25 +475,7 @@
				@@ -444,25 +475,7 @@
    {
     "data": {
      "text/plain": [
-       "array([[[0., 0.],\n",
-       "        [0., 0.]],\n",
-       "\n",
-       "       [[0., 0.],\n",
-       "        [0., 0.]],\n",
-       "\n",
-       "       [[0., 0.],\n",
-       "        [0., 0.]],\n",
-       "\n",
-       "       ...,\n",
-       "\n",
-       "       [[0., 0.],\n",
-       "        [0., 0.]],\n",
-       "\n",
-       "       [[0., 0.],\n",
-       "        [0., 0.]],\n",
-       "\n",
-       "       [[0., 0.],\n",
-       "        [0., 0.]]])"
+       "<PackedVectors nvectors=1081, width=306, vdim=2>"
      ]
     },
     "execution_count": 16,
@ -471,7 +484,14 @@
				@@ -471,7 +484,14 @@
    }
   ],
   "source": [
-    "lt"
+    "stuck_tests"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The data is stored in a bit-parallel fashion. This internal storage (an `ndarray` of `uint8`) is accessible via `bits`. The first axis is the width, the second axis is `vdim`, the last axis goes along the test set. This last axis is about `nvectors / 8` in length. "
   ]
  },
  {
@ -482,7 +502,7 @@
				@@ -482,7 +502,7 @@
    {
     "data": {
      "text/plain": [
-       "'00-RFRF01F10FFRFF1FR1F1RR010F0F1RRR-------F------------------------------------------------11110110011100110111111110111000010000001111010111001111110110010101100100001000101001101010010011010000001111110111101110110001011010100011010001111010011101001000011111011101111101010111001100100011111100000101110'"
+       "(306, 2, 136)"
      ]
     },
     "execution_count": 17,
@ -491,23 +511,371 @@
				@@ -491,23 +511,371 @@
    }
   ],
   "source": [
-    "t[0]"
+    "stuck_tests.bits.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The subscript accessor returns a string representation of the given test vector number. Possible values are '0', '1', '-', and 'X'."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'-0--------------------11011111011001100111010101011101----------------------------------00-10111011010110011101110010111010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010100111010001010010000011100'"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stuck_tests[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stuck_responses[1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The order of values in the vectors corresponds to the circuit's interface followed by the scan flip-flops as they appear in `b14.cells`. The test data can be used directly in the simulators as they use the same ordering convention. The following code performs a 4-valued logic simulation and stores the results in a new instance of `PackedVectors`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "responses = PackedVectors(len(stuck_tests), stuck_tests.width, 2)\n",
+    "simulator = LogicSim(b14, len(responses), 2)\n",
+    "simulator.assign(stuck_tests)\n",
+    "simulator.propagate()\n",
+    "simulator.capture(responses)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'--10000010010100010111--------------------------------0101010010101010110101001001010100--011111110011011111000111010101010111011101100010000110101111111011010101001010101010101010101001010110101001010101010101010110100000111111111111111011010100100101010010010101101010101001010101000111111111111111011101'"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "responses[1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Compare simulation results to expected fault-free responses loaded from STIL. The first test fails, because it is a flush test while simulation implicitly assumes a standard test with a capture clock."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mismatch for test pattern 0\n",
+      "1080 of 1081 responses matched with simulator\n"
+     ]
+    }
+   ],
+   "source": [
+    "matches = 0\n",
+    "for i in range(len(responses)):\n",
+    "    if responses[i] == stuck_responses[i]:\n",
+    "        matches += 1\n",
+    "    else:\n",
+    "        print(f'mismatch for test pattern {i}')\n",
+    "print(f'{matches} of {len(responses)} responses matched with simulator')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Transition faults require test vector pairs for testing. These pairs are generated by `tests8v`, assuming a launch-on-capture scheme (two functional clock cycles after scan-in)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s = stil.parse('tests/b14.transition.stil.gz')\n",
+    "trans_tests = s.tests8v(b14)\n",
+    "trans_responses = s.responses(b14)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The returned test data is now 8-valued (`vdim=3`)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<PackedVectors nvectors=1392, width=306, vdim=3>"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trans_tests"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Possible values in the string representation are: '0', '1', '-', 'X', 'R' (rising transition), 'F' (falling transition), 'P' (positive pulse(s), 010), 'N' (negative pulse(s), 101)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'-0--------------------RRRRRRFRRRRRRRRRRRFFRFRRRRRRRRRR----------------------------------00-00000001110100011111011010000000000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000111R1111111111111111111111110001100100000110100000111010101110RFF00F000F0F00F00000FF01F'"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trans_tests[1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We validate these patterns with an 8-valued logic simulation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "responses = PackedVectors(len(trans_tests), trans_tests.width, 3)\n",
+    "simulator = LogicSim(b14, len(responses), 3)\n",
+    "simulator.assign(trans_tests)\n",
+    "simulator.propagate()\n",
+    "simulator.capture(responses)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'--F00000F00F0F000F00FF--------------------------------01110101011100000101100000100110R0--0RRRRRRRNNNRNRPRNNNNNRFFRFRRRRRRR000000000011001001100101111110101110110001000100010100110111111101101000000111110011100010111000NNNNNNNNNNNNNNNNNNNNNNNNNNNNP0011001000001101000001110101011101RRRRRRRRRRRRRRRRRRRRP01R'"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "responses[1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The responses loaded from STIL only contain the final logic values. Use simple character replacements before comparing these. The first test is again a flush test."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mismatch for test pattern 0\n",
+      "1391 of 1392 responses matched with simulator\n"
+     ]
+    }
+   ],
+   "source": [
+    "matches = 0\n",
+    "for i in range(len(responses)):\n",
+    "    if trans_responses[i] == responses[i].replace('P','0').replace('N','1').replace('R','1').replace('F','0'):\n",
+    "        matches += 1\n",
+    "    else:\n",
+    "        print(f'mismatch for test pattern {i}')\n",
+    "print(f'{matches} of {len(responses)} responses matched with simulator')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Working With Delay Information and Timing Simulation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## 32 Parallel Time Simulations with Waveform Capacity 16\n",
+    "Delay data for gates and interconnect can be loaded from SDF files. In kyupy's timing simulators, delays are associated with the lines between nodes, not with the nodes themselves. Each line in the circuit has a rising delay, a falling delay, a negative pulse threshold, and a positive pulse threshold. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kyupy import sdf\n",
+    "from kyupy.saed import pin_index\n",
    "\n",
-    "This code will fall back to pure python if no CUDA card is available. This will be quite slow.\n",
+    "df = sdf.parse('tests/b14.sdf.gz')\n",
+    "lt = df.annotation(b14, pin_index, dataset=0, interconnect=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The returned delay information is an `ndarray` with a set of delay values for each line in the circuit."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(46891, 2, 2)"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lt.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Number of non-0 values loaded:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "119676"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(lt != 0).sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The available timing simulators are `WaveSim` and `WaveSimCuda`.\n",
+    "They work similarly to `LogicSim` in that they evaluate all cells in topological order.\n",
+    "Instead of propagating a logic value, however, they propagate waveforms.\n",
+    "\n",
+    "`WaveSim` uses the numba just-in-time compiler for acceleration on CPU.\n",
+    "It falls back to pure python if numba is not available. `WaveSimCuda` uses numba for GPU acceleration.\n",
+    "If no CUDA card is available, it will fall back to pure python (not jit-compiled for CPU!).\n",
+    "Pure python is too slow for most purposes.\n",
    "\n",
-    "Instanciate simulator:"
+    "Both simulators operate data-parallel.\n",
+    "The following instantiates a new engine for 32 independent timing simulations, in which each signal line in the circuit can carry at most 16 transitions. All simulators share the same circuit and the same line delay specification."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
@ -521,44 +889,90 @@
				@@ -521,44 +889,90 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Main Simulation Loop"
+    "These are various memories allocated, with waveforms usually being the largest. "
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Waveforms              : 93908.5 kiB\n",
+      "State Allocation Table : 1113.4 kiB\n",
+      "Circuit Timing         : 1484.5 kiB\n",
+      "Circuit Netlist        : 732.7 kiB\n",
+      "Capture Data           : 267.8 kiB\n",
+      "Test Stimuli Data      : 3.6 kiB\n"
+     ]
+    }
+   ],
+   "source": [
+    "def print_mem(name, arr):\n",
+    "    print(f'{name}: {arr.size * arr.itemsize / 1024:.1f} kiB')\n",
+    "    \n",
+    "print_mem('Waveforms              ', wsim.state)\n",
+    "print_mem('State Allocation Table ', wsim.sat)\n",
+    "print_mem('Circuit Timing         ', wsim.timing)\n",
+    "print_mem('Circuit Netlist        ', wsim.ops)\n",
+    "print_mem('Capture Data           ', wsim.cdata)\n",
+    "print_mem('Test Stimuli Data      ', wsim.tdata)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is a typical simulation loop where the number of patterns is larger than the number of simulators available.\n",
+    "We simulate `trans_tests`.\n",
+    "The timing simulator accepts 4-valued and 8-valued `PackedVectors`, but it will return response (capture) data in a different format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
-    "nvectors = 32 #len(t)\n",
-    "r = np.zeros((len(wsim.interface), nvectors, 1))\n",
+    "nvectors = 128  # len(trans_tests)  # Feel free to simulate all tests if CUDA is set up correctly.\n",
+    "\n",
+    "cdata = np.zeros((len(wsim.interface), nvectors, 7))  # space to store all capture data\n",
    "\n",
    "for offset in range(0, nvectors, wsim.sims):\n",
-    "    wsim.assign(t, offset=offset)\n",
+    "    wsim.assign(trans_tests, offset=offset)\n",
    "    wsim.propagate(sims=nvectors-offset)\n",
-    "    cdata = wsim.capture(time=TMAX, offset=offset)\n",
-    "    r = cdata[...,0]"
+    "    wsim.capture(time=2.5, cdata=cdata, offset=offset)  # capture at time 2.5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Output some captures data"
+    "The capture data contains for each PI, PO, and scan flip-flop (axis 0), and each test (axis 1) seven values:\n",
+    "0. Probability of capturing a 1 at the given capture time (same as next value, if no standard deviation given).\n",
+    "1. A capture value decided by random sampling according to above probability.\n",
+    "2. The final value (assume a very late capture time).\n",
+    "3. True, if there was a premature capture (capture error), i.e. final value is different from captured value.\n",
+    "4. Earliest arrival time. The time at which the output transitioned from its initial value.\n",
+    "5. Latest stabilization time. The time at which the output transitioned to its final value.\n",
+    "6. Overflow indicator. If non-zero, some signals in the input cone of this output had more transitions than specified in `wavecaps`. Some transitions have been discarded, the final values in the waveforms are still valid."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "(306, 32, 6)"
+       "(306, 128, 7)"
      ]
     },
-     "execution_count": 20,
+     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -567,46 +981,166 @@
				@@ -567,46 +981,166 @@
    "cdata.shape"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For validating against known logic values, take `cdata[...,1]`."
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mismatch for test pattern 0\n",
+      "127 of 128 responses matched with simulator\n"
+     ]
+    }
+   ],
+   "source": [
+    "matches = 0\n",
+    "\n",
+    "for i in range(cdata.shape[1]):\n",
+    "    response = ''.join('1' if x > 0.5 else '0' for x in cdata[..., i, 1])\n",
+    "    if trans_responses[i].replace('-','0') == response:\n",
+    "        matches += 1\n",
+    "    else:\n",
+    "        print(f'mismatch for test pattern {i}')\n",
+    "print(f'{matches} of {cdata.shape[1]} responses matched with simulator')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The circuit delay is the maximum among all latest stabilization times:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "array([[0., 0., 0., ..., 0., 0., 0.],\n",
-       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "       ...,\n",
-       "       [1., 1., 1., ..., 1., 1., 1.],\n",
-       "       [0., 0., 0., ..., 0., 0., 0.],\n",
-       "       [0., 0., 0., ..., 1., 1., 1.]], dtype=float32)"
+       "2.0610005855560303"
      ]
     },
-     "execution_count": 21,
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cdata[...,5].max()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check for overflows. If too many of them occur, increase `wavecaps` during engine instantiation:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.0"
+      ]
+     },
+     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "r"
+    "cdata[...,6].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### Check for CUDA Support\n",
+    "Check for capture failures:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.0"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cdata[...,3].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# CUDA Support Notes\n",
    "\n",
-    "Try this code to check if CUDA is set up correctly."
+    "Try this code to check if CUDA is set up correctly.\n",
+    "\n",
+    "If there is an error related to `nvvm`, you probably need to set up some environment variables:\n",
+    "```\n",
+    "%env LD_LIBRARY_PATH=/usr/local/cuda/lib64\n",
+    "%env NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so\n",
+    "%env NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice\n",
+    "```\n",
+    "If problems persist, refer to the documentation for numba and CUDA. "
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 40,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 1 CUDA devices\n",
+      "id 0              b'TITAN V'                              [SUPPORTED]\n",
+      "                      compute capability: 7.0\n",
+      "                           pci device id: 0\n",
+      "                              pci bus id: 2\n",
+      "Summary:\n",
+      "\t1/1 devices are supported\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "from numba import cuda\n",
    "\n",
@ -637,7 +1171,7 @@
				@@ -637,7 +1171,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.6.8"
  }
 },
 "nbformat": 4,
--- a/kyupy/stil.py
+++ b/kyupy/stil.py
@ -109,7 +109,7 @@ class StilFile:
				@@ -109,7 +109,7 @@ class StilFile:
        interface, pi_map, po_map, scan_maps, scan_inversions = self._maps(c)
        resp = PackedVectors(len(self.patterns), len(interface), 2)
        for i, p in enumerate(self.patterns):
-            if (len(p.capture) > 0):
+            if len(p.capture) > 0:
                resp.set_values(i, p.capture['_po'], po_map)
            else:
                resp.set_values(i, p.launch['_po'], po_map)
--- a/kyupy/wave_sim.py
+++ b/kyupy/wave_sim.py
@ -284,15 +284,15 @@ class WaveSim:
				@@ -284,15 +284,15 @@ class WaveSim:
    def wave_ppo(self, o, vector):
        return self.wave(self.ppo_offset + o, vector)

-    def capture(self, time=TMAX, sd=0, seed=1, probabilities=None, offset=0):
+    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
        for i, node in enumerate(self.interface):
            if len(node.ins) == 0: continue
            for p in range(self.sims):
                self.cdata[i, p] = self.capture_wave(self.ppo_offset + i, p, time, sd, seed)
-        if probabilities is not None:
-            assert offset < probabilities.shape[1]
-            cap_dim = min(probabilities.shape[1] - offset, self.sims)
-            probabilities[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim, 0]
+        if cdata is not None:
+            assert offset < cdata.shape[1]
+            cap_dim = min(cdata.shape[1] - offset, self.sims)
+            cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
        self.lst_eat_valid = True
        return self.cdata

--- a/kyupy/wave_sim_cuda.py
+++ b/kyupy/wave_sim_cuda.py
@ -70,15 +70,15 @@ class WaveSimCuda(WaveSim):
				@@ -70,15 +70,15 @@ class WaveSimCuda(WaveSim):
            return None
        return self.d_state[mem:mem + wcap, vector]
    
-    def capture(self, time=TMAX, sd=0, seed=1, probabilities=None, offset=0):
+    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
        grid_dim = self._grid_dim(self.sims, len(self.interface))
        capture_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppo_offset,
                                                  self.d_cdata, time, sd * math.sqrt(2), seed)
        self.cdata[...] = self.d_cdata
-        if probabilities is not None:
-            assert offset < probabilities.shape[1]
-            cap_dim = min(probabilities.shape[1] - offset, self.sims)
-            probabilities[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim, 0]
+        if cdata is not None:
+            assert offset < cdata.shape[1]
+            cap_dim = min(cdata.shape[1] - offset, self.sims)
+            cdata[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim]
        self.lst_eat_valid = True
        return self.cdata

--- a/tests/b14.sdf.gz
+++ b/tests/b14.sdf.gz
--- a/tests/b14.stil.gz
+++ b/tests/b14.stil.gz
--- a/tests/b14.stuck.stil.gz
+++ b/tests/b14.stuck.stil.gz
--- a/tests/b14.transition.stil.gz
+++ b/tests/b14.transition.stil.gz
--- a/tests/b14.v.gz
+++ b/tests/b14.v.gz
--- a/tests/test_stil.py
+++ b/tests/test_stil.py
@ -2,8 +2,8 @@ from kyupy import stil
				@@ -2,8 +2,8 @@ from kyupy import stil


 def test_b14(mydir):
-    s = stil.parse(mydir / 'b14.stil.gz')
+    s = stil.parse(mydir / 'b14.stuck.stil.gz')
    assert 10 == len(s.signal_groups)
    assert 1 == len(s.scan_chains)
-    assert 3259 == len(s.calls)
+    assert 2163 == len(s.calls)

--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@ -100,15 +100,14 @@ def compare_to_logic_sim(wsim):
				@@ -100,15 +100,14 @@ def compare_to_logic_sim(wsim):
    tests.randomize()
    wsim.assign(tests)
    wsim.propagate(8)
-    cap = np.zeros((len(wsim.interface), wsim.sims))
-    wsim.capture(probabilities=cap)
+    cdata = wsim.capture()

    resp = tests.copy()

    for iidx, inode in enumerate(wsim.interface):
        if len(inode.ins) > 0:
            for vidx in range(wsim.sims):
-                resp.set_value(vidx, iidx, 0 if cap[iidx, vidx] < 0.5 else 1)
+                resp.set_value(vidx, iidx, 0 if cdata[iidx, vidx, 0] < 0.5 else 1)

    lsim = LogicSim(wsim.circuit, len(tests), 3)
    lsim.assign(tests)