diff --git a/src/kyupy/__init__.py b/src/kyupy/__init__.py
index 5b461bf..1dd07d5 100644
--- a/src/kyupy/__init__.py
+++ b/src/kyupy/__init__.py
@@ -10,6 +10,24 @@ import gzip
 import numpy as np
 
 
+_pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
+
+
+def popcount(a):
+    return np.sum(_pop_count_lut[a])
+
+
+def readtext(file):
+    if hasattr(file, 'read'):
+        return file.read()
+    if str(file).endswith('.gz'):
+        with gzip.open(file, 'rt') as f:
+            return f.read()
+    else:
+        with open(file, 'rt') as f:
+            return f.read()
+
+
 def hr_sci(value):
     multiplier = 0
     while abs(value) >= 1000:
@@ -18,7 +36,7 @@ def hr_sci(value):
     while abs(value) < 1:
         value *= 1000
         multiplier -= 1
-    return f'{value:.3f}{" kMGTafpnµm"[multiplier]}'
+    return f'{value:.3f}{" kMGTPEafpnµm"[multiplier]}'
 
 
 def hr_bytes(nbytes):
@@ -89,6 +107,10 @@ class Log:
 log = Log()
 
 
+#
+# Code below mocks basic numba and cuda functions for pure-python fallback.
+#
+
 class MockNumba:
     @staticmethod
     def njit(func):
@@ -113,12 +135,10 @@ class MockCuda:
                     self.func = funcc
 
                 def __call__(self, *args, **kwargs):
-                    # print(f'device func call {self.func.__name__}')
                     return self.func(*args, **kwargs)
 
                 def __getitem__(self, item):
                     grid_dim, block_dim = item
-                    # print(f'kernel call {self.func.__name__} grid_dim:{grid_dim} block_dim:{block_dim}')
 
                     def inner(*args, **kwargs):
                         for grid_x in range(grid_dim[0]):
@@ -162,21 +182,3 @@ else:
     numba = MockNumba()
     cuda = MockCuda()
     log.warn('Numba unavailable. Falling back to pure Python.')
-
-
-_pop_count_lut = np.asarray([bin(x).count('1') for x in range(256)])
-
-
-def popcount(a):
-    return np.sum(_pop_count_lut[a])
-
-
-def readtext(file):
-    if hasattr(file, 'read'):
-        return file.read()
-    if str(file).endswith('.gz'):
-        with gzip.open(file, 'rt') as f:
-            return f.read()
-    else:
-        with open(file, 'rt') as f:
-            return f.read()
diff --git a/src/kyupy/circuit.py b/src/kyupy/circuit.py
index 31828b1..63b132a 100644
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@@ -75,6 +75,9 @@ class Node:
         """A list of output connections (:class:`Line` objects).
         """
 
+    def __index__(self):
+        return self.index
+
     def __repr__(self):
         ins = ' '.join([f'<{line.index}' if line is not None else '<None' for line in self.ins])
         outs = ' '.join([f'>{line.index}' if line is not None else '>None' for line in self.outs])
@@ -160,6 +163,9 @@ class Line:
         self.reader = None
         self.circuit = None
 
+    def __index__(self):
+        return self.index
+
     def __repr__(self):
         return f'{self.index}'
 
diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py
index 484456c..1c27f42 100644
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -1,4 +1,4 @@
-"""A High-Throughput combinational logic simulator.
+"""A high-throughput combinational logic simulator.
 
 The class :py:class:`~kyupy.logic_sim.LogicSim` performs parallel simulations of the combinational part of a circuit.
 The logic operations are performed bit-parallel on packed numpy arrays.
@@ -31,6 +31,8 @@ class LogicSim:
         self.sims = sims
         nbytes = (sims - 1) // 8 + 1
         self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
+        self.width = len(self.interface)
+        """The number of bits in the circuit state (number of ports + number of state-elements)."""
         self.state = np.zeros((len(circuit.lines), mdim, nbytes), dtype='uint8')
         self.state_epoch = np.zeros(len(circuit.nodes), dtype='int8') - 1
         self.tmp = np.zeros((5, mdim, nbytes), dtype='uint8')
diff --git a/src/kyupy/sdf.py b/src/kyupy/sdf.py
index 9c4ecfd..d26c6ee 100644
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@@ -43,19 +43,29 @@ class DelayFile:
         Currently, only ABSOLUTE IOPATH and INTERCONNECT delays are supported.
         Pulse rejection limits are derived from absolute delays, explicit declarations (PATHPULSE etc.) are ignored.
 
-        :param circuit:
-        :param pin_index_f:
-        :param ffdelays:
-        :param interconnect:
-        :type dataset: int or tuple
+        :param circuit: The circuit to annotate. Names from the SDF file are matched to the node names.
+        :type circuit: :class:`~kyupy.circuit.Circuit`
+        :param pin_index_f: A function that returns a pin position by node type and pin name.
+        :param dataset: SDFs store multiple values for each delay (e.g. minimum, typical, maximum).
+            An integer selects the dataset to use (default is 1 for 'typical').
+            If a tuple is given, the annotator will calculate the average of multiple datasets.
+        :type dataset: ``int`` or ``tuple``
+        :param interconnect: Whether or not to include the delays of interconnects in the annotation.
+            To properly annotate interconnect delays, the circuit model has to include a '__fork__' node on
+            every signal and every fanout-branch. The Verilog parser aids in this by setting the parameter
+            `branchforks=True` in :py:func:`kyupy.verilog.parse`.
+        :type interconnect: ``bool``
+        :param ffdelays: Whether or not to include the delays of flip-flops in the annotation.
+        :type ffdelays: ``bool``
         :return: A 3-dimensional ndarray with timing data.
 
             * Axis 0: line index.
-            * Axis 1: type of timing data: 0=`delay`, 1=`pulse rejection limit`.
-            * Axis 2: The polarity of the output transition of the reading node: 0=`rising`, 1=`falling`.
+            * Axis 1: type of timing data: 0='delay', 1='pulse rejection limit'.
+            * Axis 2: The polarity of the output transition of the reading node: 0='rising', 1='falling'.
 
             The polarity for pulse rejection is determined by the latter transition of the pulse.
-            E.g., timing[42,1,0] is the rejection limit of a negative pulse at the output of the reader of line 42.
+            E.g., ``timing[42, 1, 0]`` is the rejection limit of a negative pulse at the output
+            of the reader of line 42.
         """
         def select_del(_delvals, idx):
             if isinstance(dataset, tuple):
diff --git a/src/kyupy/stil.py b/src/kyupy/stil.py
index 5faf56b..75bffc2 100644
--- a/src/kyupy/stil.py
+++ b/src/kyupy/stil.py
@@ -4,7 +4,7 @@ The main purpose of this parser is to load scan pattern sets from STIL files.
 It supports only a very limited subset of STIL.
 
 The functions :py:func:`load` and :py:func:`read` return an intermediate representation (:class:`StilFile` object).
-Call :py:func:`StilFile.tests4v`, :py:func:`StilFile.tests8v`, or :py:func:`StilFile.responses4v` to
+Call :py:func:`StilFile.tests`, :py:func:`StilFile.tests_loc`, or :py:func:`StilFile.responses` to
 obtain the appropriate vector sets.
 """
 
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index d9e95cf..4902f1a 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -1,10 +1,10 @@
-"""High-Throughput combinational logic timing simulators.
+"""High-throughput combinational logic timing simulators.
 
 These simulators work similarly to :py:class:`~kyupy.logic_sim.LogicSim`.
 They propagate values through the combinational circuit from (pseudo) primary inputs to (pseudo) primary outputs.
 Instead of propagating logic values, these simulators propagate signal histories (waveforms).
-They are designed to run many simulations in parallel and while their latencies are quite high, they achieve
-high throughput performance.
+They are designed to run many simulations in parallel and, while their latencies are quite high, they can achieve
+high throughput.
 
 The simulators are not event-based and are not capable of simulating sequential circuits directly.
 
@@ -20,9 +20,13 @@ import numpy as np
 from . import numba, cuda, hr_bytes
 
 
-TMAX = np.float32(2 ** 127)  # almost np.PINF for 32-bit floating point values
-TMAX_OVL = np.float32(1.1 * 2 ** 127)  # almost np.PINF with overflow mark
-TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values
+TMAX = np.float32(2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform."""
+TMAX_OVL = np.float32(1.1 * 2 ** 127)
+"""A large 32-bit floating point value used to mark the end of a waveform that
+may be incomplete due to an overflow."""
+TMIN = np.float32(-2 ** 127)
+"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
 
 
 class Heap:
@@ -92,7 +96,23 @@ class Heap:
 
 
 class WaveSim:
-    """A waveform-based combinational logic timing simulator."""
+    """A waveform-based combinational logic timing simulator running on CPU.
+
+    :param circuit: The circuit to simulate.
+    :param timing: The timing annotation of the circuit (see :py:func:`kyupy.sdf.DelayFile.annotation` for details).
+    :param sims: The number of parallel simulations.
+    :param wavecaps: The number of floats available in each waveform. Waveforms encode the signal switching
+        history by storing transition times. The waveform capacity roughly corresponds to the number of transitions
+        that can be stored. A capacity of ``n`` can store at least ``n-2`` transitions. If more transitions are
+        generated during simulation, the latest glitch is removed (freeing up two transition times) and an overflow
+        flag is set. If an integer is given, all waveforms are set to that same capacity. With an array of length
+        ``len(circuit.lines)`` the capacity can be controlled for each intermediate waveform individually.
+    :param strip_forks: If enabled, the simulator will not evaluate fork nodes explicitly. This saves simulation time
+        by reducing the number of nodes to simulate, but (interconnect) delay annotations of lines read by fork nodes
+        are ignored.
+    :param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
+        memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
+    """
     def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
         self.circuit = circuit
         self.sims = sims
@@ -258,12 +278,24 @@ class WaveSim:
                f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
 
     def get_line_delay(self, line, polarity):
+        """Returns the current delay of the given ``line`` and ``polarity`` in the simulation model."""
         return self.timing[line, 0, polarity]
 
     def set_line_delay(self, line, polarity, delay):
+        """Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model."""
         self.timing[line, 0, polarity] = delay
 
     def assign(self, vectors, time=0.0, offset=0):
+        """Assigns new values to the primary inputs and state-elements.
+
+        :param vectors: The values to assign, preferably in 8-valued logic. The values are converted to
+            appropriate waveforms with one transition (``RISE``, ``FALL``) or no transitions
+            (``ZERO``, ``ONE``, and others).
+        :type vectors: :py:class:`~kyupy.logic.BPArray`
+        :param time: The transition time of the generated waveforms.
+        :param offset: The offset into the vector set. The vector assigned to the first simulator is
+            ``vectors[offset]``.
+        """
         nvectors = min(len(vectors) - offset, self.sims)
         for i in range(len(self.interface)):
             ppi_loc = self.sat[self.ppi_offset + i, 0]
@@ -287,6 +319,12 @@ class WaveSim:
                 self.state[ppi_loc + toggle, p] = TMAX
 
     def propagate(self, sims=None, sd=0.0, seed=1):
+        """Propagates all waveforms from the (pseudo) primary inputs to the (pseudo) primary outputs.
+
+        :param sims: Number of parallel simulations to execute. If None, all available simulations are performed.
+        :param sd: Standard deviation for injection of random delay variation. Active if the value is positive.
+        :param seed: Random seed for delay variations.
+        """
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             self.overflows += level_eval(self.ops, op_start, op_stop, self.state, self.sat, 0, sims,
@@ -294,6 +332,8 @@ class WaveSim:
         self.lst_eat_valid = False
 
     def wave(self, line, vector):
+        """Returns the desired waveform from the simulation state. Only valid if the simulator was
+        instantiated with ``keep_waveforms=True``."""
         if line < 0:
             return [TMAX]
         mem, wcap, _ = self.sat[line]
@@ -307,7 +347,34 @@ class WaveSim:
     def wave_ppo(self, o, vector):
         return self.wave(self.ppo_offset + o, vector)
 
-    def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):
+    def capture(self, time=TMAX, sd=0.0, seed=1, cdata=None, offset=0):
+        """Simulates a capture operation at all state-elements and primary outputs.
+
+        The capture analyzes the propagated waveforms at and around the given capture time and returns
+        various results for each capture operation.
+
+        :param time: The desired capture time. By default, a capture of the settled value is performed.
+        :param sd: A standard deviation for uncertainty in the actual capture time.
+        :param seed: The random seed for a capture with uncertainty.
+        :param cdata: An array to copy capture data into (optional). See the return value for details.
+        :param offset: An offset into the supplied capture data array.
+        :return: The capture data as numpy array.
+
+            The 3-dimensional capture data array contains for each interface node (axis 0),
+            and each test (axis 1), seven values:
+
+            0. Probability of capturing a 1 at the given capture time (same as next value, if no
+               standard deviation given).
+            1. A capture value decided by random sampling according to above probability and given seed.
+            2. The final value (assuming a very late capture time).
+            3. True, if there was a premature capture (capture error), i.e. final value is different
+               from captured value.
+            4. Earliest arrival time. The time at which the output transitioned from its initial value.
+            5. Latest stabilization time. The time at which the output transitioned to its final value.
+            6. Overflow indicator. If non-zero, some signals in the input cone of this output had more
+               transitions than specified in ``wavecaps``. Some transitions have been discarded, the
+               final values in the waveforms are still valid.
+        """
         for i, node in enumerate(self.interface):
             if len(node.ins) == 0: continue
             for p in range(self.sims):
@@ -320,6 +387,14 @@ class WaveSim:
         return self.cdata
 
     def reassign(self, time=0.0):
+        """Re-assigns the last capture to the appropriate pseudo-primary inputs. Generates a new set of
+        waveforms at the PPIs that start with the previous final value of that PPI, and transition at the
+        given time to the value captured in a previous simulation. :py:func:`~WaveSim.capture` must be called
+        prior to this function. The final value of each PPI is taken from the randomly sampled concrete logic
+        values in the capture data.
+
+        :param time: The transition time at the inputs (usually 0.0).
+        """
         for i in range(len(self.interface)):
             ppi_loc = self.sat[self.ppi_offset + i, 0]
             ppo_loc = self.sat[self.ppo_offset + i, 0]
@@ -544,7 +619,12 @@ def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0):
 
 
 class WaveSimCuda(WaveSim):
-    """A GPU-accelerated waveform-based combinational logic timing simulator."""
+    """A GPU-accelerated waveform-based combinational logic timing simulator.
+
+    The API is the same as for :py:class:`WaveSim`.
+    All internal memories are mirrored into GPU memory upon construction.
+    Some operations like access to single waveforms can involve large communication overheads.
+    """
     def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
         super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
 
@@ -602,10 +682,10 @@ class WaveSimCuda(WaveSim):
 
     def wave(self, line, vector):
         if line < 0:
-            return None
+            return [TMAX]
         mem, wcap, _ = self.sat[line]
         if mem < 0:
-            return None
+            return [TMAX]
         return self.d_state[mem:mem + wcap, vector]
 
     def capture(self, time=TMAX, sd=0, seed=1, cdata=None, offset=0):